From 052e67dda833b9a0eee775e02a7587516647aad6 Mon Sep 17 00:00:00 2001 From: Maitreyee Sharma Date: Wed, 27 Mar 2024 20:26:22 -0400 Subject: [PATCH 1/4] Add(src files): Added all source files for single cluster run --- src/BO.ipynb | 296 +++++++++++++++++++++++++++++++ src/code_inputs.py | 66 +++++++ src/feature_selection_methods.py | 78 ++++++++ src/input_class.py | 75 ++++++++ src/surrogate_models.py | 95 ++++++++++ src/utils_dataset.py | 125 +++++++++++++ 6 files changed, 735 insertions(+) create mode 100644 src/BO.ipynb create mode 100644 src/code_inputs.py create mode 100644 src/feature_selection_methods.py create mode 100644 src/input_class.py create mode 100644 src/surrogate_models.py create mode 100644 src/utils_dataset.py diff --git a/src/BO.ipynb b/src/BO.ipynb new file mode 100644 index 0000000..1bf25ca --- /dev/null +++ b/src/BO.ipynb @@ -0,0 +1,296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "a45452e9-567c-4658-bfa1-f9a6f6b70bd1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "# This file costructs surrogate models for the input datasets\n", + "import numpy as np \n", + "import pandas as pd\n", + "import os\n", + "import shutil\n", + "import json\n", + "import math\n", + "import time\n", + "import warnings\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "# Torch specific module imports\n", + "import torch\n", + "import gpytorch \n", + "\n", + "# botorch specific modules\n", + "from botorch.fit import fit_gpytorch_model\n", + "from botorch.models.gpytorch import GPyTorchModel\n", + "from botorch.optim import optimize_acqf, optimize_acqf_discrete\n", + "from botorch import fit_gpytorch_mll\n", + "from botorch.acquisition.monte_carlo import (\n", + " qExpectedImprovement,\n", + " qNoisyExpectedImprovement,\n", + ")\n", + "from botorch.sampling.normal import SobolQMCNormalSampler\n", + "from botorch.exceptions import BadInitialCandidatesWarning\n", + "from botorch.acquisition import UpperConfidenceBound, ExpectedImprovement\n", + "\n", + "# Plotting libraries\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Tick parameters\n", + "plt.rcParams['xtick.labelsize'] = 15\n", + "plt.rcParams['ytick.labelsize'] = 15\n", + "plt.rcParams['xtick.major.size'] = 5\n", + "plt.rcParams['xtick.major.width'] = 1\n", + "plt.rcParams['xtick.minor.size'] = 5\n", + "plt.rcParams['xtick.minor.width'] = 1\n", + "plt.rcParams['ytick.major.size'] = 5\n", + "plt.rcParams['ytick.major.width'] = 1\n", + "plt.rcParams['ytick.minor.size'] = 5\n", + "plt.rcParams['ytick.minor.width'] = 1\n", + "\n", + "plt.rcParams['axes.labelsize'] = 15\n", + "plt.rcParams['axes.titlesize'] = 15\n", + "plt.rcParams['legend.fontsize'] = 15\n", + "\n", + "# User defined python classes and files\n", + "import input_class \n", + "import code_inputs as model_input\n", + "import utils_dataset as utilsd\n", + "import surrogate_models" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "955bc734-96c5-4d3a-9325-920c041e256b", + "metadata": {}, + "outputs": [], + "source": [ + "bounds = torch.tensor([[-10.0], [12.0]])\n", + "\n", + "batch_size = 1\n", + "num_restarts= 10 \n", + "raw_samples = 512\n", + "\n", + "def optimize_acqf_and_get_observation(acq_func, X_test, Y_test):\n", + " \"\"\"Optimizes the acquisition function, and returns a new candidate\"\"\"\n", + " # optimize\n", + " candidates, _ = optimize_acqf_discrete(\n", + " acq_function=acq_func,\n", + " choices=X_test,\n", + " q=batch_size,\n", + " max_batch_size=2048,\n", + " num_restarts=num_restarts,\n", + " raw_samples=raw_samples, # used for intialization heuristic\n", + " options={\"batch_limit\": 5, \"maxiter\": 200},\n", + " unique=True\n", + " )\n", + " \n", + " # observe new values\n", + " new_x = candidates.detach()\n", + " b = [1 if torch.all(X_test[i].eq(new_x)) else 0 for i in range(0,X_test.shape[0]) ]\n", + " b = torch.tensor(b).to(torch.int)\n", + " index = b.nonzero()[0][0]\n", + " new_y = torch.reshape(Y_test[0,index],(1,1))\n", + " \n", + " X_test_new = X_test[torch.arange(0, X_test.shape[0]) != index, ...]\n", + " Y_test_new = Y_test[..., torch.arange(0, Y_test.shape[1]) != index]\n", + " \n", + " return new_x, new_y, index, X_test_new, Y_test_new" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72bb9112-1749-44fd-bc9d-7c0edb2e59a6", + "metadata": {}, + "outputs": [], + "source": [ + "warnings.filterwarnings(\"ignore\", category=BadInitialCandidatesWarning)\n", + "warnings.filterwarnings(\"ignore\", category=RuntimeWarning)\n", + "\n", + "# Create a new directory if it does not exist\n", + "isExist = os.path.exists(model_input.output_folder)\n", + "if not isExist:\n", + " os.makedirs(model_input.output_folder)\n", + " print(\"The new directory is created!\", model_input.output_folder)\n", + " \n", + "# Copy input parameters file to output folder\n", + "shutil.copy2('surrogate_model_inputs.py',model_input.output_folder)\n", + "# Copy surrogate model file to output folder\n", + "shutil.copy2('surrogate_models.py',model_input.output_folder)\n", + "\n", + "# BO Trials\n", + "n_trials = model_input.n_trials\n", + "n_update = model_input.n_update\n", + "verbose = model_input.verbose\n", + "\n", + "test_size = model_input.test_size\n", + "train_GP = model_input.train_GP\n", + "\n", + "GP_0 = model_input.GP_0_BO\n", + "\n", + "\n", + "num_nodes = model_input.num_nodes\n", + "saveModel_filename = model_input.saveModel_filename\n", + " \n", + "best_observed_all_ei0 = []\n", + "\n", + "# Average over multiple trials\n", + "for trial in range(1, n_trials + 1):\n", + " t0 = time.monotonic()\n", + " if model_input.random_seed == 'time':\n", + " random_seed = int(t0)\n", + " elif model_input.random_seed == 'iteration':\n", + " random_seed = trial\n", + " \n", + " print(f\"\\n -------------------- Trial {trial:>2} of {n_trials} --------------------\\n\", end=\"\")\n", + " best_observed0 = []\n", + "\n", + " # Getting initial data and fitting models with initial data\n", + " if model_input.standardize_data:\n", + " X_train, X_test, Y_train, Y_test, Var_train, Var_test, scalerX_transform, scalerY_transform = utilsd.generate_training_data(random_seed,model_input.test_size)\n", + " else:\n", + " X_train, X_test, Y_train, Y_test, Var_train, Var_test = utilsd.generate_training_data(random_seed,model_input.test_size)\n", + " \n", + " # Finding best value in initial data\n", + " if model_input.maximization:\n", + " best_observed_value = Y_train.max()\n", + " optimal_solution = torch.cat([Y_train[0],Y_test[0]]).max()\n", + " else:\n", + " best_observed_value = Y_train.min()\n", + " optimal_solution = torch.cat([Y_train[0],Y_test[0]]).min()\n", + " \n", + " # If optimal value is present in the initial dataset sample remove it \n", + " if (best_observed_value.eq(optimal_solution)) and model_input.maximization:\n", + " print('Max in training set, removing it before training models.')\n", + " optimal_position = torch.argmax(Y_train)\n", + " \n", + " # Add max value to test/exploration set\n", + " X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n", + " X_test = torch.cat([X_test,X_add_toTest])\n", + " Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n", + " Y_test = torch.cat((Y_test,Y_add_toTest),1)\n", + " \n", + " # Remove max value from training set\n", + " X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n", + " Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n", + " \n", + " # Update best observed value\n", + " best_observed_value = Y_train.max()\n", + " \n", + " elif (best_observed_value.eq(optimal_solution)) and not model_input.maximization:\n", + " print('Min in training set, removing it before training models.')\n", + " optimal_position = torch.argmin(Y_train)\n", + " \n", + " # Add min value to test/exploration set\n", + " X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n", + " X_test = torch.cat([X_test,X_add_toTest])\n", + " Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n", + " Y_test = torch.cat((Y_test,Y_add_toTest),1)\n", + " \n", + " # Remove min value from training set\n", + " X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n", + " Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n", + " \n", + " # Update best observed value\n", + " best_observed_value = Y_train.min()\n", + " \n", + " # Initialize data for training gp-0 and gp-l models\n", + " X_train0, Y_train0, X_test0, Y_test0 = X_train, Y_train, X_test, Y_test\n", + " \n", + " n_batch = model_input.n_batch_perTrial\n", + " \n", + " # Initialize likelihood, GP model and acquisition function for the models\n", + " #--------------------------- GP-0 ---------------------------#\n", + " if GP_0:\n", + " likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()\n", + " model_gp0 = surrogate_models.ExactGPModel(X_train0, Y_train0, likelihood_gp0) \n", + " # AcqFunc_0 = UpperConfidenceBound(model_gp0, beta=0.1) \n", + " AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_observed_value, maximize=model_input.maximization)\n", + " best_observed0.append(best_observed_value) # Appending to best_observed list for the given trial\n", + " \n", + " # run N_BATCH rounds of BayesOpt after the initial random batch\n", + " for iteration in range(1, n_batch + 1):\n", + "\n", + " if GP_0:\n", + " if ((iteration-1)%n_update==0):\n", + " # fit the models every 10 iterations\n", + " model_gp0, likelihood_gp0 = surrogate_models.train_surrogate_gp0(saveModel_filename, test_size, num_nodes, X_train0, Y_train0)\n", + " \n", + " # optimize and get new observation using acquisition function\n", + " new_x0, new_y0, index, X_test_new0, Y_test_new0 = optimize_acqf_and_get_observation(AcqFunc_0, X_test0, Y_test0)\n", + " \n", + " # Update remaining choices tensor\n", + " X_test0 = X_test_new0\n", + " Y_test0 = Y_test_new0\n", + "\n", + " # Update training points\n", + " X_train0 = torch.cat([X_train0, new_x0])\n", + " Y_train0 = torch.cat([Y_train0[0], new_y0[0]])\n", + " Y_train0 = torch.reshape(Y_train0,(1,Y_train0.shape[0]))\n", + "\n", + " # update progress\n", + " if model_input.maximization:\n", + " best_value_ei0 = Y_train0.max()\n", + " elif not model_input.maximization:\n", + " best_value_ei0 = Y_train0.min()\n", + " best_observed0.append(best_value_ei0)\n", + "\n", + " # AcqFunc_0 = UpperConfidenceBound(model_gp0, beta=0.1) \n", + " AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_value_ei0, maximize=model_input.maximization)\n", + " \n", + " if verbose:\n", + " print(\n", + " f\"\\nBatch {iteration:>2}: best_value (GP-0, GP-Linear, GP-NN) = \",\n", + " f\"({best_value_ei0:>4.2f}, {best_value_eiL:>4.2f}, {best_value_eiNN:>4.2f})\",\n", + " end=\"\",)\n", + "\n", + " t1 = time.monotonic()\n", + " \n", + " print(f\"time = {t1-t0:>4.2f}.\")\n", + " # Appending to common list of best observed values, with number of rows equal to number of trials\n", + " if GP_0:\n", + " best_observed_all_ei0.append(best_observed0) \n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/code_inputs.py b/src/code_inputs.py new file mode 100644 index 0000000..108ae45 --- /dev/null +++ b/src/code_inputs.py @@ -0,0 +1,66 @@ +# This file contaings surrogate model inputs +import numpy as np +import os +import json +import math + +# Torch specific module imports +import torch +import gpytorch +from torch import nn +from torch.utils.data import DataLoader, Dataset +from torchvision import datasets, transforms +from torch.nn import functional as F + +# botorch specific modules +from botorch.fit import fit_gpytorch_model +from botorch.models.gpytorch import GPyTorchModel + +# Plotting libraries +import matplotlib as mpl +import matplotlib.pyplot as plt + +# User defined python classes and files +import sys + +import utils_dataset as utilsd +import input_class + +np.random.seed(0) +torch.manual_seed(0) + +# General inputs +run_folder = '/Users/maitreyeesharma/WORKSPACE/PostDoc/EngChem/MatDisc_ML/python_notebook_bo/' # Folder where code is run and input json exist +num_run = 3 +test_size = 0.01 +output_folder = run_folder+'../bo_output/' # Folder where all outputs are stored +output_folder = output_folder+'Space@Hopkins_recommendations/mpea_hv_forEddie_'+str(test_size)+'p_ThirdPass_Mar5_24/' +verbose = True +deep_verbose = False + +# Reading and data processing inputs +add_target_noise = False +standardize_data = True + +# Feature selection inputs +test_size_fs = 0.1 +select_features_otherModels = False + +# BO inputs +n_trials = 5 +n_update = 1000 +GP_0_BO = True +GP_L_BO = True +GP_NN_BO = False +random_seed = 'iteration' +maximization = True +new_values_predict_from_model = False +n_batch_perTrial = 100 + +# Surrogate training boolean inputs +train_GP = True + +# GP Model parameters +kernel = 'Matern' +learning_rate_gp0 = 0.01 +epochs_GP0 = 500 diff --git a/src/feature_selection_methods.py b/src/feature_selection_methods.py new file mode 100644 index 0000000..f0e4c37 --- /dev/null +++ b/src/feature_selection_methods.py @@ -0,0 +1,78 @@ +import numpy as np +import csv +import copy +import random +import pandas as pd +import scipy +import sklearn as sk + +import xgboost as xgb +from xgboost.sklearn import XGBRegressor +from sklearn.metrics import mean_squared_error + +# User defined files and classes +import utils_dataset as utilsd + +# sklearn functions +from sklearn.model_selection import cross_validate +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.model_selection import cross_val_score +from sklearn.model_selection import KFold +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline +from sklearn.linear_model import Lasso + + +class feature_selection_algorithms: + + def __init__(self,XX,YY,test_size=0.33,random_state=42): + + # Train Data + self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(XX, YY, test_size=test_size, random_state=random_state) + + # XGBoost + def xgboost(self, **kwargs): + + clf = XGBRegressor(n_estimators=100, learning_rate=0.025, max_depth=20, verbosity=0, booster='gbtree', + reg_alpha=np.exp(-6.788644799030888), reg_lambda=np.exp(-7.450413274554533), + gamma=np.exp(-5.374463422208394), subsample=0.5, objective= 'reg:squarederror', n_jobs=1) + + paras = clf.get_params() + + clf.fit(self.X_train, self.y_train) + return clf + + # Features selected by XGBoost + def selected_features_xgboost(self, descriptors, deep_verbose=False): + + clf = self.xgboost() + score = clf.score(self.X_train, self.y_train) + if deep_verbose: + print("XGBoost Training score: ", score) + + scores = cross_val_score(clf, self.X_train, self.y_train,cv=10) + if deep_verbose: + print("XGBoost Mean cross-validation score: %.2f" % scores.mean()) + + + ypred = clf.predict(self.X_test) + mse = mean_squared_error(self.y_test, ypred) + if deep_verbose: + print("XGBoost MSE: %.2f" % mse) + print("XGBoost RMSE: %.2f" % (mse**(1/2.0))) + + f_importance = clf.get_booster().get_score(importance_type='gain') + feature_importance_dict={} + + for f,value in f_importance.items(): + feature_index = int(f.split('f')[1]) + feature_importance_dict[descriptors[feature_index]] = value + if deep_verbose: + print(f"Column: {feature_index}, descriptor: {descriptors[feature_index]}") + + return feature_importance_dict.keys() + + +if __name__=="__main__": + + print('Feature selection methods are in this class') \ No newline at end of file diff --git a/src/input_class.py b/src/input_class.py new file mode 100644 index 0000000..c0dae6b --- /dev/null +++ b/src/input_class.py @@ -0,0 +1,75 @@ +import sklearn +import numpy as np +import csv +import copy +import random +import pandas as pd +import pickle +import json +import openpyxl +import itertools + +# User defined files and classes +import feature_selection_methods as feature_selection +import utils_dataset as utilsd + +# Plotting libraries +import matplotlib as mpl +import matplotlib.pyplot as plt + +# Tick parameters +plt.rcParams['xtick.labelsize'] = 15 +plt.rcParams['ytick.labelsize'] = 15 +plt.rcParams['xtick.major.size'] = 5 +plt.rcParams['xtick.major.width'] = 1 +plt.rcParams['xtick.minor.size'] = 5 +plt.rcParams['xtick.minor.width'] = 1 +plt.rcParams['ytick.major.size'] = 5 +plt.rcParams['ytick.major.width'] = 1 +plt.rcParams['ytick.minor.size'] = 5 +plt.rcParams['ytick.minor.width'] = 1 + +plt.rcParams['axes.labelsize'] = 20 +plt.rcParams['axes.titlesize'] = 20 +plt.rcParams['legend.fontsize'] = 15 + + +class inputs: + def __init__(self,input_type='COF',input_path='.',input_file='properties.csv'): + self.input_type = input_type + self.input_path = input_path + self.input_file = input_file + self.filename = self.input_path + self.input_file + + def read_inputs(self): + ''' + This function reads the dataset from the COF paper: https://pubs.acs.org/doi/10.1021/acs.chemmater.8b01425 + input_type='COF', + input_path='.', + input_file='properties.csv' + ''' + data = pd.read_csv(self.filename) + descriptors = ['dimensions', 'bond type', 'void fraction [widom]', 'supercell volume [A^3]', 'density [kg/m^3]', + 'heat desorption high P [kJ/mol]','absolute methane uptake high P [molec/unit cell]', + 'absolute methane uptake high P [mol/kg]', 'excess methane uptake high P [molec/unit cell]', + 'excess methane uptake high P [mol/kg]', 'heat desorption low P [kJ/mol]', + 'absolute methane uptake low P [molec/unit cell]', + 'absolute methane uptake low P [mol/kg]', + 'excess methane uptake low P [molec/unit cell]', + 'excess methane uptake low P [mol/kg]', 'surface area [m^2/g]', 'linkerA', 'linkerB', 'net', + 'cell_a [A]', 'cell_b [A]', 'cell_c [A]', 'alpha [deg]', 'beta [deg]', 'gamma [deg]', + 'num carbon', 'num fluorine', 'num hydrogen', 'num nitrogen', 'num oxygen', 'num sulfur', + 'num silicon', 'vertices', 'edges', 'genus', 'largest included sphere diameter [A]', + 'largest free sphere diameter [A]', 'largest included sphere along free sphere path diameter [A]', + 'absolute methane uptake high P [v STP/v]', 'absolute methane uptake low P [v STP/v]]'] + XX = pd.DataFrame(data, columns=descriptors) + target = copy.deepcopy(data['deliverable capacity [v STP/v]'].to_numpy()) + YY = target.reshape(-1,1) + + return XX, YY, descriptors + + +if __name__=="__main__": + + print('Reading inputs') + \ No newline at end of file diff --git a/src/surrogate_models.py b/src/surrogate_models.py new file mode 100644 index 0000000..4ef23e5 --- /dev/null +++ b/src/surrogate_models.py @@ -0,0 +1,95 @@ +# This file costructs surrogate models for the input datasets +import numpy as np +import os +import json +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error +import math + +# Torch specific module imports +import torch +import gpytorch + +# botorch specific modules +from botorch.fit import fit_gpytorch_model +from botorch.models.gpytorch import GPyTorchModel + +# Plotting libraries +import matplotlib as mpl +import matplotlib.pyplot as plt + +# User defined python classes and files +import utils_dataset as utilsd +import input_class +import code_inputs as model_input + +np.random.seed(0) +torch.manual_seed(0) + +device = "cuda" if torch.cuda.is_available() else "cpu" +print(f"Using {device} device") + +# We will use the simplest form of GP model, exact inference +class ExactGPModel(gpytorch.models.ExactGP,GPyTorchModel): + _num_outputs = 1 # to inform GPyTorchModel API + MIN_INFERRED_NOISE_LEVEL = 1e-5 + def __init__(self, train_x, train_y, likelihood): + super(ExactGPModel, self).__init__(train_x, train_y, likelihood) + self.mean_module = gpytorch.means.ConstantMean() + if model_input.kernel=='RBF': + self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()) + elif model_input.kernel=='Matern': + self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5)) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + +#--------------------------- GP-0 ---------------------------# +def train_surrogate_gp0(saveModel_filename,test_size,num_nodes,X_train,Y_train): + + mse_gp0 = 0.0 + training_iter = model_input.epochs_GP0 + + # initialize likelihood and model + likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood() + model_gp0 = ExactGPModel(X_train, Y_train, likelihood_gp0) + + # Find optimal model hyperparameters + model_gp0.train() + likelihood_gp0.train() + + # Use the adam optimizer + optimizer = torch.optim.Adam(model_gp0.parameters(), lr=model_input.learning_rate_gp0) # Includes GaussianLikelihood parameters + + # "Loss" for GPs - the marginal log likelihood + mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood_gp0, model_gp0) + + for i in range(training_iter): + optimizer.zero_grad() # Zero gradients from previous iteration + output = model_gp0(X_train) # Output from model + loss = -mll(output, Y_train) # Calc loss and backprop gradients + loss.backward() + optimizer.step() + + return model_gp0, likelihood_gp0 + +def predict_surrogates(model, likelihood, X): + + # Get into evaluation (predictive posterior) mode + model.eval() + likelihood.eval() + + # Make predictions by feeding model through likelihood + with torch.no_grad(), gpytorch.settings.fast_pred_var(): + prediction = model(X) + prediction = likelihood(model(X)) + + observed_mean = prediction.mean + observed_var = prediction.variance + observed_covar = prediction.covariance_matrix + + return observed_mean, observed_var + + \ No newline at end of file diff --git a/src/utils_dataset.py b/src/utils_dataset.py new file mode 100644 index 0000000..b5c5143 --- /dev/null +++ b/src/utils_dataset.py @@ -0,0 +1,125 @@ +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from torch.utils.data import Dataset, random_split, DataLoader + +import torch +import json +import pandas as pd +import numpy as np + +# User defined python classes and files +import utils_dataset as utilsd +import input_class +import code_inputs as model_input +import feature_selection_methods as feature_selection + +# Add slicing of the input XX tensor with additional input for the columns picked out by XGBoost or other feature selection methods +class InputDataset(Dataset): + """ Input dataset used for training """ + + def __init__(self, XX, YY, Var=None, transform=None): + """ + Args: + XX: NN Input features vector as a torch tensor + YY: NN Labels vector as a torch tensor + descriptors(list of strings): Names of the input features + transform (callable, optional): Optional transform to be applied + on a sample. + """ + self.XX = XX + self.YY = YY + self.var = Var + self.transform = transform + + def __len__(self): + return self.XX.shape[0] + + def __getitem__(self, idx): + if torch.is_tensor(idx): + idx = idx.tolist() + + x = self.XX[idx,:] + y = self.YY[:,idx] + if self.var != None: + var = self.var[idx] + item = {'in_features':x,'labels':y,'variance':var} + else: + item = {'in_features':x,'labels':y} + + return item + + +def standardize_data(x): + scalerX = StandardScaler().fit(x) + x_train = scalerX.transform(x) + return x_train, scalerX + +def standardize_test_data(x,scalerX): + x_test = scalerX.transform(x) + return x_test + +def generate_training_data(random_state,test_size): + + # Reading the input json file with dataset filename and path information + with open(model_input.run_folder+'inputs.json', "r") as f: + input_dict = json.load(f) + + input_type = input_dict['InputType'] + input_path = input_dict['InputPath'] + input_file = input_dict['InputFile'] + add_target_noise = input_dict['AddTargetNoise'] + + input = input_class.inputs(input_type=input_type, + input_path=input_path, + input_file=input_file, + add_target_noise=add_target_noise) + + XX, YY, descriptors = input.read_inputs(model_input.verbose) + + # Transforming datasets by standardization + if model_input.standardize_data: + X_stand, scalerX_transform = utilsd.standardize_data(XX) + Y_stand, scalerY_transform = utilsd.standardize_data(YY) + else: + X_stand=XX.to_numpy() + Y_stand = YY + + # Checking if we should use xgboost recommended descriptors or all descriptors + if model_input.select_features_otherModels: + fs = feature_selection.feature_selection_algorithms(X_stand,Y_stand, + test_size=model_input.test_size_fs, + random_state=random_state) + xg_boost_descriptors = fs.selected_features_xgboost(descriptors) + if model_input.verbose: + print('Selected Features, ', xg_boost_descriptors) + else: + xg_boost_descriptors = descriptors + + XX = pd.DataFrame(XX, columns=xg_boost_descriptors) + if model_input.standardize_data: + X_stand, scalerX_transform = utilsd.standardize_data(XX) + else: + X_stand=XX.to_numpy() + + # Creating train-test split in data + X_train, X_test, Y_train, Y_test = train_test_split(X_stand, Y_stand, + test_size=test_size, + random_state=random_state) #,stratify=Y_stand) + + Var_train = torch.ones(len(Y_train)) + Var_test = torch.ones(len(Y_test)) + + # Converting data arrays to torch tensors + X_train = torch.tensor(X_train).to(torch.float32) + Y_train = np.transpose(Y_train) # Ytrain has to have only one row for GP training + Y_train = torch.tensor(Y_train).to(torch.float32) + + X_test = torch.tensor(X_test).to(torch.float32) + Y_test = np.transpose(Y_test) # Ytrain has to have only one row for GP training + Y_test = torch.tensor(Y_test).to(torch.float32) + + if model_input.standardize_data: + return X_train, X_test, Y_train, Y_test, Var_train, Var_test, scalerX_transform, scalerY_transform + else: + return X_train, X_test, Y_train, Y_test, Var_train, Var_test + From 16f4788289b6e9e7dc21c62353c627cd434a1c52 Mon Sep 17 00:00:00 2001 From: Nikhil-Thota Date: Wed, 27 Mar 2024 22:35:21 -0400 Subject: [PATCH 2/4] mod(test.ipynb) : Added some code for parallel 'for' loops --- .DS_Store | Bin 0 -> 6148 bytes src/input_class.py | 2 +- src/surrogate_models.py | 2 +- src/test.ipynb | 225 ++++++++++++++++++++++++++++++++++++++++ test.ipynb | 33 ------ 5 files changed, 227 insertions(+), 35 deletions(-) create mode 100644 .DS_Store create mode 100644 src/test.ipynb delete mode 100644 test.ipynb diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..45a00115f3602e77b0301a82462ee5e03071687d GIT binary patch literal 6148 zcmeHK%}T>S5S}#wt0MHGc+An0SQO8(#3$$rXd7vTq$&MFddkhCPvuK^5=7902l3+3 zncYdcYubtz5t)JAFPWd+FW-`E764Rp5;p+q08qn5=x<`Tz$i|wVr%T#K~#E-4rI`W zF@YG zbbEb&clupL{KJYt&t`N;#C+u6#V^j-Vf!@MwPQB7{b{q{R@SE3oXMiNGw*Xw^R5^3 zLJua;c4i0fy_#WjIC(4e`7(~pWO;^Sn^ zm06y(fRBeCSVL1bAuQ`Q>GNrlqx=AC{vF9 zfb&Ae+@L83#fJ~YGb=u!C_OulALw*Yp+V_I0a3tJz%#dX+W*g%pa0z=xf2CMfq$id z@}pMN#7E-2weoPZ*9O?mv5`?;ZcvwC$J?>Kpsjchn>LIEbb%N$<_6J&CO-mN2I)kB HUsd1}opH7I literal 0 HcmV?d00001 diff --git a/src/input_class.py b/src/input_class.py index c0dae6b..2bf5d13 100644 --- a/src/input_class.py +++ b/src/input_class.py @@ -35,7 +35,7 @@ class inputs: - def __init__(self,input_type='COF',input_path='.',input_file='properties.csv'): + def __init__(self,input_type='COF',input_path='../',input_file='properties.csv'): self.input_type = input_type self.input_path = input_path self.input_file = input_file diff --git a/src/surrogate_models.py b/src/surrogate_models.py index 4ef23e5..6e90c68 100644 --- a/src/surrogate_models.py +++ b/src/surrogate_models.py @@ -47,7 +47,7 @@ def forward(self, x): return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) #--------------------------- GP-0 ---------------------------# -def train_surrogate_gp0(saveModel_filename,test_size,num_nodes,X_train,Y_train): +def train_surrogate_gp0(X_train,Y_train): mse_gp0 = 0.0 training_iter = model_input.epochs_GP0 diff --git a/src/test.ipynb b/src/test.ipynb new file mode 100644 index 0000000..ada635f --- /dev/null +++ b/src/test.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dde8a292", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2234ceda", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/botorch/models/gp_regression.py:161: UserWarning: The model inputs are of type torch.float32. It is strongly recommended to use double precision in BoTorch, as this improves both precision and stability and can help avoid numerical errors. See https://github.com/pytorch/botorch/discussions/1444\n", + " self._validate_tensor_args(X=transformed_X, Y=train_Y, Yvar=train_Yvar)\n" + ] + }, + { + "data": { + "text/plain": [ + "ExactMarginalLogLikelihood(\n", + " (likelihood): GaussianLikelihood(\n", + " (noise_covar): HomoskedasticNoise(\n", + " (noise_prior): GammaPrior()\n", + " (raw_noise_constraint): GreaterThan(1.000E-04)\n", + " )\n", + " )\n", + " (model): SingleTaskGP(\n", + " (likelihood): GaussianLikelihood(\n", + " (noise_covar): HomoskedasticNoise(\n", + " (noise_prior): GammaPrior()\n", + " (raw_noise_constraint): GreaterThan(1.000E-04)\n", + " )\n", + " )\n", + " (mean_module): ConstantMean()\n", + " (covar_module): ScaleKernel(\n", + " (base_kernel): MaternKernel(\n", + " (lengthscale_prior): GammaPrior()\n", + " (raw_lengthscale_constraint): Positive()\n", + " )\n", + " (outputscale_prior): GammaPrior()\n", + " (raw_outputscale_constraint): Positive()\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "from botorch.models import SingleTaskGP\n", + "from botorch.fit import fit_gpytorch_mll\n", + "from botorch.utils import standardize\n", + "from gpytorch.mlls import ExactMarginalLogLikelihood\n", + "\n", + "train_X = torch.rand(10, 2)\n", + "Y = 1 - torch.linalg.norm(train_X - 0.5, dim=-1, keepdim=True)\n", + "Y = Y + 0.1 * torch.randn_like(Y) # add some noise\n", + "train_Y = standardize(Y)\n", + "\n", + "gp = SingleTaskGP(train_X, train_Y)\n", + "mll = ExactMarginalLogLikelihood(gp.likelihood, gp)\n", + "fit_gpytorch_mll(mll)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d4524f72", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15.3 ms ± 6.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "786 ns ± 94.2 ns per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "from joblib import Parallel, delayed\n", + "\n", + "def compute_sq(x):\n", + " return x**2\n", + "\n", + "# Time the computation of the squares of the numbers 0 to 9\n", + "\n", + "%timeit -n 10 Parallel(n_jobs=-1, backend='loky', verbose=0)(delayed(compute_sq)(x) for x in range(10))\n", + "%timeit -n 10 [compute_sq(x) for x in range(10)]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "92f541d2", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'deliverable capacity [v STP/v]'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'deliverable capacity [v STP/v]'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 7\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01minput_class\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m inputs\n\u001b[1;32m 5\u001b[0m inputs_obj \u001b[38;5;241m=\u001b[39m inputs()\n\u001b[0;32m----> 7\u001b[0m XX, YY \u001b[38;5;241m=\u001b[39m \u001b[43minputs_obj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery/src/input_class.py:66\u001b[0m, in \u001b[0;36minputs.read_inputs\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 52\u001b[0m descriptors \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdimensions\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbond type\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mvoid fraction [widom]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msupercell volume [A^3]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdensity [kg/m^3]\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m 53\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mheat desorption high P [kJ/mol]\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute methane uptake high P [molec/unit cell]\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m 54\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute methane uptake high P [mol/kg]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mexcess methane uptake high P [molec/unit cell]\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlargest free sphere diameter [A]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlargest included sphere along free sphere path diameter [A]\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 64\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute methane uptake high P [v STP/v]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute methane uptake low P [v STP/v]]\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 65\u001b[0m XX \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(data, columns\u001b[38;5;241m=\u001b[39mdescriptors)\n\u001b[0;32m---> 66\u001b[0m target \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdeliverable capacity [v STP/v]\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mto_numpy())\n\u001b[1;32m 67\u001b[0m YY \u001b[38;5;241m=\u001b[39m target\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m XX, YY, descriptors\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'deliverable capacity [v STP/v]'" + ] + } + ], + "source": [ + "# Read the inputs\n", + "\n", + "from input_class import inputs\n", + "\n", + "inputs_obj = inputs()\n", + "\n", + "XX, YY = inputs_obj.read_inputs()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "97d372f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[0.9786],\n", + " [0.7992],\n", + " [0.4615],\n", + " [0.7805],\n", + " [0.1183],\n", + " [0.6399],\n", + " [0.1434],\n", + " [0.9447],\n", + " [0.5218],\n", + " [0.4147]], dtype=torch.float64)\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "grad can be implicitly created only for scalar outputs", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m train_Y \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mfrom_numpy(np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mrand(\u001b[38;5;241m10\u001b[39m,\u001b[38;5;241m1\u001b[39m))\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(train_Y)\n\u001b[0;32m----> 8\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimeit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m-n 10 train_surrogate_gp0(train_X, train_Y)\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/IPython/core/interactiveshell.py:2480\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2478\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2479\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2480\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2482\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2483\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2484\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2485\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/IPython/core/magics/execution.py:1189\u001b[0m, in \u001b[0;36mExecutionMagics.timeit\u001b[0;34m(self, line, cell, local_ns)\u001b[0m\n\u001b[1;32m 1186\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time_number \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.2\u001b[39m:\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[0;32m-> 1189\u001b[0m all_runs \u001b[38;5;241m=\u001b[39m \u001b[43mtimer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepeat\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepeat\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumber\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1190\u001b[0m best \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmin\u001b[39m(all_runs) \u001b[38;5;241m/\u001b[39m number\n\u001b[1;32m 1191\u001b[0m worst \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmax\u001b[39m(all_runs) \u001b[38;5;241m/\u001b[39m number\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/timeit.py:208\u001b[0m, in \u001b[0;36mTimer.repeat\u001b[0;34m(self, repeat, number)\u001b[0m\n\u001b[1;32m 206\u001b[0m r \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(repeat):\n\u001b[0;32m--> 208\u001b[0m t \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimeit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnumber\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 209\u001b[0m r\u001b[38;5;241m.\u001b[39mappend(t)\n\u001b[1;32m 210\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/IPython/core/magics/execution.py:173\u001b[0m, in \u001b[0;36mTimer.timeit\u001b[0;34m(self, number)\u001b[0m\n\u001b[1;32m 171\u001b[0m gc\u001b[38;5;241m.\u001b[39mdisable()\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 173\u001b[0m timing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 175\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gcold:\n", + "File \u001b[0;32m:1\u001b[0m, in \u001b[0;36minner\u001b[0;34m(_it, _timer)\u001b[0m\n", + "File \u001b[0;32m~/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery/src/surrogate_models.py:73\u001b[0m, in \u001b[0;36mtrain_surrogate_gp0\u001b[0;34m(X_train, Y_train)\u001b[0m\n\u001b[1;32m 71\u001b[0m output \u001b[38;5;241m=\u001b[39m model_gp0(X_train) \u001b[38;5;66;03m# Output from model\u001b[39;00m\n\u001b[1;32m 72\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39mmll(output, Y_train) \u001b[38;5;66;03m# Calc loss and backprop gradients \u001b[39;00m\n\u001b[0;32m---> 73\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 74\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model_gp0, likelihood_gp0\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/autograd/__init__.py:259\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 250\u001b[0m inputs \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 251\u001b[0m (inputs,)\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(inputs, (torch\u001b[38;5;241m.\u001b[39mTensor, graph\u001b[38;5;241m.\u001b[39mGradientEdge))\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m()\n\u001b[1;32m 256\u001b[0m )\n\u001b[1;32m 258\u001b[0m grad_tensors_ \u001b[38;5;241m=\u001b[39m _tensor_or_tensors_to_tuple(grad_tensors, \u001b[38;5;28mlen\u001b[39m(tensors))\n\u001b[0;32m--> 259\u001b[0m grad_tensors_ \u001b[38;5;241m=\u001b[39m \u001b[43m_make_grads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_grads_batched\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m retain_graph \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/autograd/__init__.py:132\u001b[0m, in \u001b[0;36m_make_grads\u001b[0;34m(outputs, grads, is_grads_batched)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out\u001b[38;5;241m.\u001b[39mrequires_grad:\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out\u001b[38;5;241m.\u001b[39mnumel() \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 133\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgrad can be implicitly created only for scalar outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 134\u001b[0m )\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m out\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mis_floating_point:\n\u001b[1;32m 136\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 137\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgrad can be implicitly created only for real scalar outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mout\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 139\u001b[0m )\n", + "\u001b[0;31mRuntimeError\u001b[0m: grad can be implicitly created only for scalar outputs" + ] + } + ], + "source": [ + "from surrogate_models import train_surrogate_gp0\n", + "import numpy as np\n", + "\n", + "train_X = torch.randn(10,2, requires_grad=True)\n", + "train_Y = torch.from_numpy(np.random.rand(10,1))\n", + "print(train_Y)\n", + "\n", + "%timeit -n 10 train_surrogate_gp0(train_X, train_Y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07d11c64", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 4356ad3..0000000 --- a/test.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "dde8a292", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From db2a2f71aba6a749799f97fafc330d435a77ea17 Mon Sep 17 00:00:00 2001 From: Nikhil-Thota Date: Thu, 28 Mar 2024 02:21:34 -0400 Subject: [PATCH 3/4] mod(a bunch of files) : Added gigi's code and the joblib code for running multiple GPs in parallel --- .DS_Store | Bin 6148 -> 6148 bytes bo_output/.DS_Store | Bin 0 -> 6148 bytes bo_output/test/scalerX_0.joblib | Bin 0 -> 773 bytes bo_output/test/scalerX_1.joblib | Bin 0 -> 773 bytes bo_output/test/scalerX_2.joblib | Bin 0 -> 773 bytes bo_output/test/scalerY_0.joblib | Bin 0 -> 629 bytes bo_output/test/scalerY_1.joblib | Bin 0 -> 629 bytes bo_output/test/scalerY_2.joblib | Bin 0 -> 629 bytes bo_output/test/surrogate_models.py | 95 ++++ src/.DS_Store | Bin 0 -> 6148 bytes src/BO.ipynb | 831 +++++++++++++++++++++++++---- src/code_inputs.py | 6 +- src/input_class.py | 43 +- src/kmeans.py | 30 ++ src/surrogate_models.py | 2 +- src/test.ipynb | 128 +---- src/utils_dataset.py | 30 +- 17 files changed, 911 insertions(+), 254 deletions(-) create mode 100644 bo_output/.DS_Store create mode 100644 bo_output/test/scalerX_0.joblib create mode 100644 bo_output/test/scalerX_1.joblib create mode 100644 bo_output/test/scalerX_2.joblib create mode 100644 bo_output/test/scalerY_0.joblib create mode 100644 bo_output/test/scalerY_1.joblib create mode 100644 bo_output/test/scalerY_2.joblib create mode 100644 bo_output/test/surrogate_models.py create mode 100644 src/.DS_Store create mode 100644 src/kmeans.py diff --git a/.DS_Store b/.DS_Store index 45a00115f3602e77b0301a82462ee5e03071687d..84ca3361e53be15ff025b5c65f29f1766380982c 100644 GIT binary patch delta 193 zcmZoMXfc@J&nUJrU^g?P*yMREa+9@K_QdXZz`(%3!l1{H&XCDalAG`1l9ZF51Qg>C zd)vNk;;o~OsPZXz%G1f&v9or^OUc)8{Wu0P^Vq#pdv5<*zGdss$egNzcI643T delta 186 zcmZoMXfc@J&nUVvU^g?P=;V1Ua+9@K_QaNoF)%Q&Fz7L)Gh{N9YPNJi1j@L*%1BI9Ouj=%f>TLd$1 diff --git a/bo_output/.DS_Store b/bo_output/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..1ce14b06bd8e8a2f8083a5dbb7792cc0c0f1681a GIT binary patch literal 6148 zcmeH~JqiLr422WjLa^D=avBfd4F=H@cmdJ16D-tzj_%73f~&QNyg>3zG82}4#m+`V zbbTLIBE5*r;6_}#|6BN{`G3^HlnPLRKT|-v z&2F>COXc1A_Ig&|XVumX4*GF~m!AM6b`>w+ZrCrj0Bf=ZQGxMCz-3^d0$)|&0hY}Y A!vFvP literal 0 HcmV?d00001 diff --git a/bo_output/test/scalerX_0.joblib b/bo_output/test/scalerX_0.joblib new file mode 100644 index 0000000000000000000000000000000000000000..86a99f4f2c23c9ede8c079f63c55b9a5a8721f0e GIT binary patch literal 773 zcma))O=uHA6vvaeHBqriDe56m{HPLZT?teW6tj|p(8XZwp(h-%xu`TA z#$!>iES+=^B%T@&6cFD;l-l073M?2x&bmiK=vk0hqb76^;Wzl+OZ+^~u?^F9h=Gfs z$LlN={m{@7bH((7su;UHP)0DU5JZ9P0j?)m+5;52fg>^$A(3NSyowuCRnsR(bt_J2 zLqcGcv-MQqpv1ARXo(oDuo@tlN!W+@wfYP{#dG>P7C*vErP6RyMD)>PAvcPt_bq!d zepRevT~esv^-lPPV;gGJ69l#?ymQu5k}8f#^caCbu-0daRt(dDlybb8CFM~_fl?R6 zQ0UQaFrKbDEcpAqS;*B|9HN5e4^d&XQ6ny_TpQ* ztR0^HKKp9ujMlSc?K!Y@tW#gCEuUTK+om=8S9uxGe%4?L5o-G$XPffskeXKmpN4r} GG1R{i*DF2% literal 0 HcmV?d00001 diff --git a/bo_output/test/scalerX_1.joblib b/bo_output/test/scalerX_1.joblib new file mode 100644 index 0000000000000000000000000000000000000000..9fa980c8aea151be77a94ef51515897d88069011 GIT binary patch literal 773 zcma))Pe@cj9LLvf&CSpZiw;&8S(wH%p+ZP|vv`QwSR@Z39;SKwM(5T0*UVexL4~5$ zngfN-(w{@rQRxsOMCipbLc$_C72Pw4QnxNe*u1BP(52sD<~KiP=JWl|Z+c_7M+K2E zcm=y?Vnl7lrP!rT8Z&0tT?*8ZhXgNe^^mP2s<)<*iK!SBhx)`7QO&pX7#@KZMz+ZE zc(`DmF7sI0aeXBrU!7zdk9Ka*Ny`6K$%~O`Iz7;`kSkxTVpUa0fXFnuYzx~S)O-&! zku1w-@xWvd$NJyTw-;Xq*@xE0@!wNw$8>G}Zk&R|;&j!inl_Ms;AQV6nnB@Aeq{I9 zg!*G(cJ9X288uh4+VlvIbt4J|saWkM*-I+PzQ79EPpZfPQcY^Iq&8s20^abqTjRg_ zKY;1nR9$p=2Y6OI|MXZ?@W%{OoJYVPnl zFKj6OFJg3M%tu!e7ark(^M365BDy9ZtMUP3nPiaJuM(ITiNNioCTAU$6uUp rT2jA%nHzondP&Vyt@Fy@+P8+?m@>n$1>ah)998NS*I{10$Y{zRkj+7j literal 0 HcmV?d00001 diff --git a/bo_output/test/scalerX_2.joblib b/bo_output/test/scalerX_2.joblib new file mode 100644 index 0000000000000000000000000000000000000000..a4e2b81fbb469323479883f8d6ec70fbe7fb5015 GIT binary patch literal 773 zcma))&1(}u7{-&Dw6VpsQmSC^yM?sXtpN{OirG*^Fpz^4ZK*Jw-JNvS?iVw=i8+XR zP%Dgh$V9YQH%%iQbiQv%r=#R7oWp?yv)q+d1mH%toAe- z3B^)$&O(qlibs$~TpdwrI?0tY3I#lt0{H~fl+WTnkK!SBr9=I3}izl}x5c%e`@+Y}0$e8lJ4c;e_rya>M~ z*0H!WDtNt@UClBzB|H;&rY^E`wzD8zp-oZNLRY-%<7sXKc z$#>ioW0x3W6N`@j^@+P#TXBm|-21Of)(I`kEd$$zp185phO3YOpk>ZFHgbHRReVHw zz9-NUGNe>d>cgVi${1->Pc^DbTwRV&FUNn;RvoCjt<_!D>i6uzL#al#Hq_d|VkJlb z$El6i#QX38d{9QghjBk1z-c@*kB22QCVBAP$o=1=$=oJy;{4vpdMTQFcK6Ka^X@fu zWu|LoX8MczV*ks}Ke89pW0&r|UQdsywIiFn@Ildk5!;&0sl_Mt(|fM9a(v6`gI6BE zc~n*(w7*)I`-;@a$4@_(I&JlReRy(Y?U4F*ATz#xvQMq`Z}NIT>sx~oBGhyp&bl(n OaV4X8F7-1!(3C$|>n=k8 literal 0 HcmV?d00001 diff --git a/bo_output/test/scalerY_0.joblib b/bo_output/test/scalerY_0.joblib new file mode 100644 index 0000000000000000000000000000000000000000..d20bc117053c2b3f73884860ed16f7ed14799bda GIT binary patch literal 629 zcma))&ubJh6vubgt*(mwLGiR8iWSyX0t*#HJgoHQz#i5K;g{y+aK$SuXwaXxK@OsvUulnowQ!AQG1C?I z@%`)_KEYvn&Lb9|GBi+!J}}_uSC$GxB0xvq{v9ylWLI%@8~zC zk>XL5eriDLvp)N_&T$HBIo;)LZz>F1@LVtNow#k_2%DOSu8TkW8<@zW9 literal 0 HcmV?d00001 diff --git a/bo_output/test/scalerY_1.joblib b/bo_output/test/scalerY_1.joblib new file mode 100644 index 0000000000000000000000000000000000000000..78f313176a965cd81fba55071b56608a3dc06ee1 GIT binary patch literal 629 zcma))L2DE-6vubAyLMGzg-%f|jw;ROvFK z#&CHUlN>CdTiUZAbI|!-27+ol!$RUUM)u_kJfURqgjBNe(F3 zN>f9)B$z~QSrP+<5+vFbKT~O-*a0l8X|4J zg&!xU_yPyXf{UKwcszbS^VTd$%VN6Q`*P7Ae(?8PH-LJ*)z@Vzcp5h~iQ<{>9o_A5 zpx6o0`x=zG+Go#HX(qrJWVhU67)1aKT!w+?Num?)mpb=JF53Oe$(CLF;0h;O{g=|+ z03pg_QWT*4#QJlpMuC9fhq9p3lC-weV6sxBz0hQ^o&BHX_lK`W(~bEfwp`~3G(py% zzYbetciFvw#_qFq_J9r8#)Lf%H98^x*XL*5^Ov7S)6L8LX5jR{Dzs;_AHTn@eLLG8 gP1i5;SIEUTB1cpkUP^Q~hw+nm7^~9QAy#Sp2jDXK-2eap literal 0 HcmV?d00001 diff --git a/bo_output/test/scalerY_2.joblib b/bo_output/test/scalerY_2.joblib new file mode 100644 index 0000000000000000000000000000000000000000..ee98cec268d64b440f9d6d0d578d311474df6fe6 GIT binary patch literal 629 zcma))zi$&U6vuNRg#aQgs+fZKA&Qi?U?~y<5)47zdQt{DFq(IFZf)@SEZf)W2Bb1T z1Mbv{rIQja&l>Q1x zSsl@`1RW+eTvD|P6r?y%B~y-Ojbj$mwL0rZlQBNeKK?lV`EoMfI(6xLhhQl3;qrF) zEq;gJjcEKH-{AN8h;PmKcI?rM_20g14t8&zOy&>H>f3>{|LV|PEPnG}x4(~{Oy(PB c^=tI>8PP*#tf&41poj5 literal 0 HcmV?d00001 diff --git a/bo_output/test/surrogate_models.py b/bo_output/test/surrogate_models.py new file mode 100644 index 0000000..a96e173 --- /dev/null +++ b/bo_output/test/surrogate_models.py @@ -0,0 +1,95 @@ +# This file costructs surrogate models for the input datasets +import numpy as np +import os +import json +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error +import math + +# Torch specific module imports +import torch +import gpytorch + +# botorch specific modules +from botorch.fit import fit_gpytorch_model +from botorch.models.gpytorch import GPyTorchModel + +# Plotting libraries +import matplotlib as mpl +import matplotlib.pyplot as plt + +# User defined python classes and files +import utils_dataset as utilsd +import input_class +import code_inputs as model_input + +np.random.seed(0) +torch.manual_seed(0) + +device = "cuda" if torch.cuda.is_available() else "cpu" +print(f"Using {device} device") + +# We will use the simplest form of GP model, exact inference +class ExactGPModel(gpytorch.models.ExactGP,GPyTorchModel): + _num_outputs = 1 # to inform GPyTorchModel API + MIN_INFERRED_NOISE_LEVEL = 1e-5 + def __init__(self, train_x, train_y, likelihood): + super(ExactGPModel, self).__init__(train_x, train_y, likelihood) + self.mean_module = gpytorch.means.ConstantMean() + if model_input.kernel=='RBF': + self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()) + elif model_input.kernel=='Matern': + self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5)) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + +#--------------------------- GP-0 ---------------------------# +def train_surrogate_gp0(X_train,Y_train): + + mse_gp0 = 0.0 + training_iter = model_input.epochs_GP0 + + # initialize likelihood and model + likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood() + model_gp0 = ExactGPModel(X_train, Y_train, likelihood_gp0) + + # Find optimal model hyperparameters + model_gp0.train() + likelihood_gp0.train() + + # Use the adam optimizer + optimizer = torch.optim.Adam(model_gp0.parameters(), lr=model_input.learning_rate_gp0) # Includes GaussianLikelihood parameters + + # "Loss" for GPs - the marginal log likelihood + mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood_gp0, model_gp0) + + for i in range(training_iter): + optimizer.zero_grad() # Zero gradients from previous iteration + output = model_gp0(X_train) # Output from model + loss = -mll(output, Y_train) # Calc loss and backprop gradients + loss.backward() + optimizer.step() + + return model_gp0, likelihood_gp0 + +def predict_surrogates(model, likelihood, X): + + # Get into evaluation (predictive posterior) mode + model.eval() + likelihood.eval() + + # Make predictions by feeding model through likelihood + with torch.no_grad(), gpytorch.settings.fast_pred_var(): + prediction = model(X) + prediction = likelihood(model(X)) + + observed_mean = prediction.mean + observed_var = prediction.variance + observed_covar = prediction.covariance_matrix + + return observed_mean, observed_var + + \ No newline at end of file diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e5b0917d1b0b633248c73b7743331f3f8af3d74c GIT binary patch literal 6148 zcmeHK!AiqG5S^_RD)rE#7h%DhxAY5QNROU+6{<~(1sjr!ptt;i_yzhi-t-6j8sF>; zO+wRx2qH2AyDyo2lbO7QY=($ja=(a(Mnp8gVC)@X>M&mCv1L8=90Of=Mn*T3(TbKd z+hUvJFDk%$H^gqusH8{iUA=#^^=&lGioBGvFVA)}^CC^>>tYVZ;Pv!n^8ERBv5&p@ z2D@A}s|dv$QceYGy`!jIBfTSQW3^e=$5rz5Aa%7EbuHbDc7BfpjSSjE?+VgPAE{3YY>vQ2=K)+h8bYvngN-m;xIG`1{~77-PjsFnl^Ng%$wVhdUUq zxtHJ=uNW&?n$2?a{1(-y-?IQ(9Dv0^1C;pDXWaGKd^hvGtZJl_X8xmeIBpyZirkklmRArog{a!1dE" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -27,6 +36,7 @@ "import warnings\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import mean_squared_error\n", + "from joblib import Parallel, delayed, dump\n", "\n", "# Torch specific module imports\n", "import torch\n", @@ -72,16 +82,499 @@ "import input_class \n", "import code_inputs as model_input\n", "import utils_dataset as utilsd\n", - "import surrogate_models" + "import surrogate_models\n", + "import kmeans as km\n", + "\n", + "# Set the random seeds\n", + "np.random.seed(0)\n", + "torch.manual_seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "8b29bdc5", + "metadata": {}, + "source": [ + "#### K means clustering" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a8de62ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['dimensions',\n", + " ' bond type',\n", + " ' void fraction [widom]',\n", + " ' supercell volume [A^3]',\n", + " ' density [kg/m^3]',\n", + " ' heat desorption high P [kJ/mol]',\n", + " ' absolute methane uptake high P [molec/unit cell]',\n", + " ' absolute methane uptake high P [mol/kg]',\n", + " ' excess methane uptake high P [molec/unit cell]',\n", + " ' excess methane uptake high P [mol/kg]',\n", + " ' heat desorption low P [kJ/mol]',\n", + " ' absolute methane uptake low P [molec/unit cell]',\n", + " ' absolute methane uptake low P [mol/kg]',\n", + " ' excess methane uptake low P [molec/unit cell]',\n", + " ' excess methane uptake low P [mol/kg]',\n", + " ' surface area [m^2/g]',\n", + " ' linkerA',\n", + " ' linkerB',\n", + " ' net',\n", + " ' cell_a [A]',\n", + " ' cell_b [A]',\n", + " ' cell_c [A]',\n", + " ' alpha [deg]',\n", + " ' beta [deg]',\n", + " ' gamma [deg]',\n", + " ' num carbon',\n", + " ' num fluorine',\n", + " ' num hydrogen',\n", + " ' num nitrogen',\n", + " ' num oxygen',\n", + " ' num sulfur',\n", + " ' num silicon',\n", + " ' vertices',\n", + " ' edges',\n", + " ' genus',\n", + " ' largest included sphere diameter [A]',\n", + " ' largest free sphere diameter [A]',\n", + " ' largest included sphere along free sphere path diameter [A]',\n", + " ' absolute methane uptake high P [v STP/v]',\n", + " ' absolute methane uptake low P [v STP/v]']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Input = input_class.inputs(input_path='../')\n", + "XX_prop, YY, descriptors = Input.read_inputs()\n", + "descriptors" ] }, { "cell_type": "code", "execution_count": 3, + "id": "147caa07", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num carbonnum fluorinenum hydrogennum nitrogennum oxygennum sulfurnum silicon
036002161447200
1360021614414400
243203601447200
3360014421621600
4360014421621600
........................
69835996057696000
698361020057648000
698371360076864000
6983818880115212812800
69839536028832000
\n", + "

69840 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", + "0 360 0 216 144 72 \n", + "1 360 0 216 144 144 \n", + "2 432 0 360 144 72 \n", + "3 360 0 144 216 216 \n", + "4 360 0 144 216 216 \n", + "... ... ... ... ... ... \n", + "69835 996 0 576 96 0 \n", + "69836 1020 0 576 48 0 \n", + "69837 1360 0 768 64 0 \n", + "69838 1888 0 1152 128 128 \n", + "69839 536 0 288 32 0 \n", + "\n", + " num sulfur num silicon \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "... ... ... \n", + "69835 0 0 \n", + "69836 0 0 \n", + "69837 0 0 \n", + "69838 0 0 \n", + "69839 0 0 \n", + "\n", + "[69840 rows x 7 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "XX_comp_df, YY_df = Input.get_comp()\n", + "XX_comp_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fe95ea1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num carbonnum fluorinenum hydrogennum nitrogennum oxygennum sulfurnum silicondeliverable capacity [v STP/v]
08320448384000165.565439
1115208321286400152.524690
2137608962566400115.996501
38640720192000143.024802
410880768128000153.528996
...........................
6921968012002884800116.161354
693204801024640000152.702060
69415360144038400099.338457
695144001104384000135.714021
696153601152768000133.680986
\n", + "

697 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", + "0 832 0 448 384 0 \n", + "1 1152 0 832 128 64 \n", + "2 1376 0 896 256 64 \n", + "3 864 0 720 192 0 \n", + "4 1088 0 768 128 0 \n", + ".. ... ... ... ... ... \n", + "692 1968 0 1200 288 48 \n", + "693 2048 0 1024 640 0 \n", + "694 1536 0 1440 384 0 \n", + "695 1440 0 1104 384 0 \n", + "696 1536 0 1152 768 0 \n", + "\n", + " num sulfur num silicon deliverable capacity [v STP/v] \n", + "0 0 0 165.565439 \n", + "1 0 0 152.524690 \n", + "2 0 0 115.996501 \n", + "3 0 0 143.024802 \n", + "4 0 0 153.528996 \n", + ".. ... ... ... \n", + "692 0 0 116.161354 \n", + "693 0 0 152.702060 \n", + "694 0 0 99.338457 \n", + "695 0 0 135.714021 \n", + "696 0 0 133.680986 \n", + "\n", + "[697 rows x 8 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_cluster = 3\n", + "clustered_dfs = km.k_means(XX_comp_df, YY_df, num_cluster)\n", + "sample_dfs = km.draw_samples(clustered_dfs, sample_fraction = 0.01)\n", + "samples = km.concat(sample_dfs)\n", + "samples" + ] + }, + { + "cell_type": "markdown", + "id": "35ed4601", + "metadata": {}, + "source": [ + "#### Acquisition function " + ] + }, + { + "cell_type": "code", + "execution_count": 9, "id": "955bc734-96c5-4d3a-9325-920c041e256b", "metadata": {}, "outputs": [], "source": [ + "## TODO: TO BE Check\n", "bounds = torch.tensor([[-10.0], [12.0]])\n", "\n", "batch_size = 1\n", @@ -102,6 +595,7 @@ " unique=True\n", " )\n", " \n", + " print(candidates)\n", " # observe new values\n", " new_x = candidates.detach()\n", " b = [1 if torch.all(X_test[i].eq(new_x)) else 0 for i in range(0,X_test.shape[0]) ]\n", @@ -111,129 +605,144 @@ " \n", " X_test_new = X_test[torch.arange(0, X_test.shape[0]) != index, ...]\n", " Y_test_new = Y_test[..., torch.arange(0, Y_test.shape[1]) != index]\n", + " print(X_test_new)\n", + " print(Y_test_new)\n", " \n", " return new_x, new_y, index, X_test_new, Y_test_new" ] }, + { + "cell_type": "markdown", + "id": "f93f668d", + "metadata": {}, + "source": [ + "#### GP Train Function" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "72bb9112-1749-44fd-bc9d-7c0edb2e59a6", "metadata": {}, "outputs": [], "source": [ - "warnings.filterwarnings(\"ignore\", category=BadInitialCandidatesWarning)\n", - "warnings.filterwarnings(\"ignore\", category=RuntimeWarning)\n", - "\n", - "# Create a new directory if it does not exist\n", - "isExist = os.path.exists(model_input.output_folder)\n", - "if not isExist:\n", - " os.makedirs(model_input.output_folder)\n", - " print(\"The new directory is created!\", model_input.output_folder)\n", + "def create_train_test_data(cluster_dataXX, cluster_dataYY, random_seed):\n", + " if model_input.standardize_data:\n", + " cluster_dataXX, scalerX_transform = utilsd.standardize_data(cluster_dataXX)\n", + " cluster_dataYY, scalerY_transform = utilsd.standardize_data(cluster_dataYY.reshape(-1,1))\n", + " else:\n", + " scalerX_transform = None\n", + " scalerY_transform = None\n", " \n", - "# Copy input parameters file to output folder\n", - "shutil.copy2('surrogate_model_inputs.py',model_input.output_folder)\n", - "# Copy surrogate model file to output folder\n", - "shutil.copy2('surrogate_models.py',model_input.output_folder)\n", + " ## TODO : Incase for feature selection\n", + " # ....\n", + " # ....\n", + " # ....\n", "\n", - "# BO Trials\n", - "n_trials = model_input.n_trials\n", - "n_update = model_input.n_update\n", - "verbose = model_input.verbose\n", + " # Create train and test sets\n", + " X_train, X_test, Y_train, Y_test = train_test_split(cluster_dataXX, cluster_dataYY, test_size=model_input.test_size, random_state=random_seed)\n", "\n", - "test_size = model_input.test_size\n", - "train_GP = model_input.train_GP\n", + " # Convert to tensors\n", + " X_train = torch.tensor(X_train, dtype=torch.float32)\n", + " X_test = torch.tensor(X_test, dtype=torch.float32)\n", + " Y_train = np.transpose(Y_train) # IMP : Has to have only one row for GP training\n", + " Y_train = torch.tensor(Y_train, dtype=torch.float32)\n", + " Y_test = np.transpose(Y_test)\n", + " Y_test = torch.tensor(Y_test, dtype=torch.float32)\n", "\n", - "GP_0 = model_input.GP_0_BO\n", + " return X_train, X_test, Y_train, Y_test, scalerX_transform, scalerY_transform\n", "\n", + "def train_gp(cluster_idx):\n", + " best_observed_all_ei0 = []\n", + " # Average over multiple trials\n", + " for trial in range(1, model_input.n_trials + 1):\n", + " t0 = time.monotonic()\n", + " if model_input.random_seed == 'time':\n", + " random_seed = int(t0)\n", + " elif model_input.random_seed == 'iteration':\n", + " random_seed = trial\n", + " \n", + " print(f\"\\n -------------------- Trial {trial:>2} of {model_input.n_trials} --------------------\\n\", end=\"\")\n", + " best_observed0 = []\n", "\n", - "num_nodes = model_input.num_nodes\n", - "saveModel_filename = model_input.saveModel_filename\n", - " \n", - "best_observed_all_ei0 = []\n", - "\n", - "# Average over multiple trials\n", - "for trial in range(1, n_trials + 1):\n", - " t0 = time.monotonic()\n", - " if model_input.random_seed == 'time':\n", - " random_seed = int(t0)\n", - " elif model_input.random_seed == 'iteration':\n", - " random_seed = trial\n", - " \n", - " print(f\"\\n -------------------- Trial {trial:>2} of {n_trials} --------------------\\n\", end=\"\")\n", - " best_observed0 = []\n", - "\n", - " # Getting initial data and fitting models with initial data\n", - " if model_input.standardize_data:\n", - " X_train, X_test, Y_train, Y_test, Var_train, Var_test, scalerX_transform, scalerY_transform = utilsd.generate_training_data(random_seed,model_input.test_size)\n", - " else:\n", - " X_train, X_test, Y_train, Y_test, Var_train, Var_test = utilsd.generate_training_data(random_seed,model_input.test_size)\n", - " \n", - " # Finding best value in initial data\n", - " if model_input.maximization:\n", - " best_observed_value = Y_train.max()\n", - " optimal_solution = torch.cat([Y_train[0],Y_test[0]]).max()\n", - " else:\n", - " best_observed_value = Y_train.min()\n", - " optimal_solution = torch.cat([Y_train[0],Y_test[0]]).min()\n", - " \n", - " # If optimal value is present in the initial dataset sample remove it \n", - " if (best_observed_value.eq(optimal_solution)) and model_input.maximization:\n", - " print('Max in training set, removing it before training models.')\n", - " optimal_position = torch.argmax(Y_train)\n", - " \n", - " # Add max value to test/exploration set\n", - " X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n", - " X_test = torch.cat([X_test,X_add_toTest])\n", - " Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n", - " Y_test = torch.cat((Y_test,Y_add_toTest),1)\n", - " \n", - " # Remove max value from training set\n", - " X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n", - " Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n", - " \n", - " # Update best observed value\n", - " best_observed_value = Y_train.max()\n", - " \n", - " elif (best_observed_value.eq(optimal_solution)) and not model_input.maximization:\n", - " print('Min in training set, removing it before training models.')\n", - " optimal_position = torch.argmin(Y_train)\n", + " XX_desc = list(sample_dfs[cluster_idx].columns[:-1])\n", + " YY_desc = sample_dfs[cluster_idx].columns[-1]\n", + " (\n", + " X_train,\n", + " X_test,\n", + " Y_train,\n", + " Y_test,\n", + " scalerX, \n", + " scalerY\n", + " ) = create_train_test_data(sample_dfs[cluster_idx][XX_desc].to_numpy(), sample_dfs[cluster_idx][YY_desc].to_numpy(), random_seed)\n", + " if trial == 1:\n", + " # Dump the scalers to model output folder\n", + " dump(scalerX, os.path.join(model_input.output_folder, f'scalerX_{cluster_idx}.joblib'))\n", + " dump(scalerY, os.path.join(model_input.output_folder, f'scalerY_{cluster_idx}.joblib'))\n", + " \n", + " # Finding best value in initial data\n", + " if model_input.maximization:\n", + " best_observed_value = Y_train.max()\n", + " optimal_solution = torch.cat([Y_train[0],Y_test[0]]).max()\n", + " else:\n", + " best_observed_value = Y_train.min()\n", + " optimal_solution = torch.cat([Y_train[0],Y_test[0]]).min()\n", " \n", - " # Add min value to test/exploration set\n", - " X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n", - " X_test = torch.cat([X_test,X_add_toTest])\n", - " Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n", - " Y_test = torch.cat((Y_test,Y_add_toTest),1)\n", + " # If optimal value is present in the initial dataset sample remove it \n", + " if (best_observed_value.eq(optimal_solution)) and model_input.maximization:\n", + " print('Max in training set, removing it before training models.')\n", + " optimal_position = torch.argmax(Y_train)\n", + " \n", + " # Add max value to test/exploration set\n", + " X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n", + " X_test = torch.cat([X_test,X_add_toTest])\n", + " Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n", + " Y_test = torch.cat((Y_test,Y_add_toTest),1)\n", + " \n", + " # Remove max value from training set\n", + " X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n", + " Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n", + " \n", + " # Update best observed value\n", + " best_observed_value = Y_train.max()\n", + " \n", + " elif (best_observed_value.eq(optimal_solution)) and not model_input.maximization:\n", + " print('Min in training set, removing it before training models.')\n", + " optimal_position = torch.argmin(Y_train)\n", + " \n", + " # Add min value to test/exploration set\n", + " X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n", + " X_test = torch.cat([X_test,X_add_toTest])\n", + " Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n", + " Y_test = torch.cat((Y_test,Y_add_toTest),1)\n", + " \n", + " # Remove min value from training set\n", + " X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n", + " Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n", + " \n", + " # Update best observed value\n", + " best_observed_value = Y_train.min()\n", " \n", - " # Remove min value from training set\n", - " X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n", - " Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n", + " # Initialize data for training gp-0 and gp-l models\n", + " X_train0, Y_train0, X_test0, Y_test0 = X_train, Y_train, X_test, Y_test\n", + " \n", + " n_batch = model_input.n_batch_perTrial\n", " \n", - " # Update best observed value\n", - " best_observed_value = Y_train.min()\n", - " \n", - " # Initialize data for training gp-0 and gp-l models\n", - " X_train0, Y_train0, X_test0, Y_test0 = X_train, Y_train, X_test, Y_test\n", - " \n", - " n_batch = model_input.n_batch_perTrial\n", - " \n", - " # Initialize likelihood, GP model and acquisition function for the models\n", - " #--------------------------- GP-0 ---------------------------#\n", - " if GP_0:\n", + " # Initialize likelihood, GP model and acquisition function for the models\n", + " #--------------------------- GP-0 ---------------------------#\n", " likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()\n", " model_gp0 = surrogate_models.ExactGPModel(X_train0, Y_train0, likelihood_gp0) \n", " # AcqFunc_0 = UpperConfidenceBound(model_gp0, beta=0.1) \n", " AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_observed_value, maximize=model_input.maximization)\n", " best_observed0.append(best_observed_value) # Appending to best_observed list for the given trial\n", - " \n", - " # run N_BATCH rounds of BayesOpt after the initial random batch\n", - " for iteration in range(1, n_batch + 1):\n", + " \n", + " # run N_BATCH rounds of BayesOpt after the initial random batch\n", + " for iteration in range(1, n_batch + 1):\n", "\n", - " if GP_0:\n", - " if ((iteration-1)%n_update==0):\n", + " if ((iteration-1)%model_input.n_update==0):\n", " # fit the models every 10 iterations\n", - " model_gp0, likelihood_gp0 = surrogate_models.train_surrogate_gp0(saveModel_filename, test_size, num_nodes, X_train0, Y_train0)\n", - " \n", + " model_gp0, likelihood_gp0 = surrogate_models.train_surrogate_gp0(X_train0, Y_train0)\n", + " \n", " # optimize and get new observation using acquisition function\n", " new_x0, new_y0, index, X_test_new0, Y_test_new0 = optimize_acqf_and_get_observation(AcqFunc_0, X_test0, Y_test0)\n", " \n", @@ -255,21 +764,115 @@ "\n", " # AcqFunc_0 = UpperConfidenceBound(model_gp0, beta=0.1) \n", " AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_value_ei0, maximize=model_input.maximization)\n", - " \n", - " if verbose:\n", - " print(\n", - " f\"\\nBatch {iteration:>2}: best_value (GP-0, GP-Linear, GP-NN) = \",\n", - " f\"({best_value_ei0:>4.2f}, {best_value_eiL:>4.2f}, {best_value_eiNN:>4.2f})\",\n", - " end=\"\",)\n", + " \n", + " if model_input.verbose:\n", + " print(\n", + " f\"\\nBatch {iteration:>2}: best_value (GP-0) = \",\n", + " f\"({best_value_ei0:>4.2f}\",\n", + " end=\"\",)\n", "\n", - " t1 = time.monotonic()\n", + " t1 = time.monotonic()\n", + " \n", + " print(f\"time = {t1-t0:>4.2f}.\")\n", + " # Appending to common list of best observed values, with number of rows equal to number of trials\n", + " best_observed_all_ei0.append(best_observed0) \n", + " return best_observed_all_ei0" + ] + }, + { + "cell_type": "markdown", + "id": "17fbf21d", + "metadata": {}, + "source": [ + "#### Main Function" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bfb03e8a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using cpu device\n", + "Using cpu device\n", + "Using cpu device\n", + "\n", + " -------------------- Trial 1 of 5 --------------------\n", + "\n", + " -------------------- Trial 1 of 5 --------------------\n", + "\n", + " -------------------- Trial 1 of 5 --------------------\n", + "Max in training set, removing it before training models.\n", + "Max in training set, removing it before training models.Max in training set, removing it before training models.\n", + "\n", + "tensor([[-0.8793, 0.0000, -0.7922, 1.8256, -0.5337, -0.1508, -0.1508]])\n", + "tensor([[-0.4935, 0.0000, -1.7229, -0.7278, 0.7059, -0.1508, -0.1508]])\n", + "tensor([[-1.0916]])\n", + "\n", + "Batch 1: best_value (GP-0) = (1.79tensor([[-0.4935, 0.0000, -1.7229, -0.7278, 0.7059, -0.1508, -0.1508]])\n", + "tensor([], size=(0, 7))\n", + "tensor([], size=(1, 0))\n", + "\n", + "Batch 2: best_value (GP-0) = (1.79" + ] + }, + { + "ename": "InputDataError", + "evalue": "`choices` must be non-emtpy.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31m_RemoteTraceback\u001b[0m Traceback (most recent call last)", + "\u001b[0;31m_RemoteTraceback\u001b[0m: \n\"\"\"\nTraceback (most recent call last):\n File \"/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py\", line 463, in _process_worker\n r = call_item()\n ^^^^^^^^^^^\n File \"/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py\", line 291, in __call__\n return self.fn(*self.args, **self.kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py\", line 589, in __call__\n return [func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/var/folders/k2/lw7ggcln7zx_cnkbk2zbgvm80000gn/T/ipykernel_16086/3447261974.py\", line 119, in train_gp\n File \"/var/folders/k2/lw7ggcln7zx_cnkbk2zbgvm80000gn/T/ipykernel_16086/1975869805.py\", line 11, in optimize_acqf_and_get_observation\n File \"/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/botorch/optim/optimize.py\", line 1053, in optimize_acqf_discrete\n raise InputDataError(\"`choices` must be non-emtpy.\")\nbotorch.exceptions.errors.InputDataError: `choices` must be non-emtpy.\n\"\"\"", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mInputDataError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 20\u001b[0m\n\u001b[1;32m 14\u001b[0m shutil\u001b[38;5;241m.\u001b[39mcopy2(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msurrogate_models.py\u001b[39m\u001b[38;5;124m'\u001b[39m,model_input\u001b[38;5;241m.\u001b[39moutput_folder)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Training a single GP for test\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# a = train_gp(0)\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Train the cluster of GP models in a parallel for loop\u001b[39;00m\n\u001b[0;32m---> 20\u001b[0m best_observed_all_ei0 \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_gp\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mrange\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mnum_cluster\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:1952\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 1946\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[1;32m 1947\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[1;32m 1948\u001b[0m \u001b[38;5;66;03m# reach the first `yield` statement. This starts the aynchronous\u001b[39;00m\n\u001b[1;32m 1949\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[1;32m 1950\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[0;32m-> 1952\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:1595\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[0;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[1;32m 1592\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[1;32m 1594\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[0;32m-> 1595\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[1;32m 1597\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[1;32m 1598\u001b[0m \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[1;32m 1599\u001b[0m \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[1;32m 1600\u001b[0m \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[1;32m 1601\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:1699\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1692\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_wait_retrieval():\n\u001b[1;32m 1693\u001b[0m \n\u001b[1;32m 1694\u001b[0m \u001b[38;5;66;03m# If the callback thread of a worker has signaled that its task\u001b[39;00m\n\u001b[1;32m 1695\u001b[0m \u001b[38;5;66;03m# triggered an exception, or if the retrieval loop has raised an\u001b[39;00m\n\u001b[1;32m 1696\u001b[0m \u001b[38;5;66;03m# exception (e.g. `GeneratorExit`), exit the loop and surface the\u001b[39;00m\n\u001b[1;32m 1697\u001b[0m \u001b[38;5;66;03m# worker traceback.\u001b[39;00m\n\u001b[1;32m 1698\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_aborting:\n\u001b[0;32m-> 1699\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_error_fast\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1700\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 1702\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[1;32m 1703\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:1734\u001b[0m, in \u001b[0;36mParallel._raise_error_fast\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1730\u001b[0m \u001b[38;5;66;03m# If this error job exists, immediatly raise the error by\u001b[39;00m\n\u001b[1;32m 1731\u001b[0m \u001b[38;5;66;03m# calling get_result. This job might not exists if abort has been\u001b[39;00m\n\u001b[1;32m 1732\u001b[0m \u001b[38;5;66;03m# called directly or if the generator is gc'ed.\u001b[39;00m\n\u001b[1;32m 1733\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_job \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1734\u001b[0m \u001b[43merror_job\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_result\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:736\u001b[0m, in \u001b[0;36mBatchCompletionCallBack.get_result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 730\u001b[0m backend \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparallel\u001b[38;5;241m.\u001b[39m_backend\n\u001b[1;32m 732\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m backend\u001b[38;5;241m.\u001b[39msupports_retrieve_callback:\n\u001b[1;32m 733\u001b[0m \u001b[38;5;66;03m# We assume that the result has already been retrieved by the\u001b[39;00m\n\u001b[1;32m 734\u001b[0m \u001b[38;5;66;03m# callback thread, and is stored internally. It's just waiting to\u001b[39;00m\n\u001b[1;32m 735\u001b[0m \u001b[38;5;66;03m# be returned.\u001b[39;00m\n\u001b[0;32m--> 736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_return_or_raise\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 738\u001b[0m \u001b[38;5;66;03m# For other backends, the main thread needs to run the retrieval step.\u001b[39;00m\n\u001b[1;32m 739\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:754\u001b[0m, in \u001b[0;36mBatchCompletionCallBack._return_or_raise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 752\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 753\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m==\u001b[39m TASK_ERROR:\n\u001b[0;32m--> 754\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_result\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_result\n\u001b[1;32m 756\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n", + "\u001b[0;31mInputDataError\u001b[0m: `choices` must be non-emtpy." + ] + } + ], + "source": [ + "warnings.filterwarnings(\"ignore\", category=BadInitialCandidatesWarning)\n", + "warnings.filterwarnings(\"ignore\", category=RuntimeWarning)\n", + "\n", + "# Create a new directory if it does not exist\n", + "isExist = os.path.exists(model_input.output_folder)\n", + "if not isExist:\n", + " os.makedirs(model_input.output_folder)\n", + " print(\"The new directory is created!\", model_input.output_folder)\n", " \n", - " print(f\"time = {t1-t0:>4.2f}.\")\n", - " # Appending to common list of best observed values, with number of rows equal to number of trials\n", - " if GP_0:\n", - " best_observed_all_ei0.append(best_observed0) \n", - "\n" + "# Commented out by NKT\n", + "# # Copy input parameters file to output folder\n", + "# shutil.copy2('surrogate_model_inputs.py',model_input.output_folder)\n", + "# Copy surrogate model file to output folder\n", + "shutil.copy2('surrogate_models.py',model_input.output_folder)\n", + "\n", + "# Training a single GP for test\n", + "# a = train_gp(0)\n", + "\n", + "# Train the cluster of GP models in a parallel for loop\n", + "best_observed_all_ei0 = Parallel(n_jobs=-1)(\n", + " delayed(train_gp)(i) for i in range(num_cluster)\n", + ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ddae264", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -288,7 +891,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.2" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/code_inputs.py b/src/code_inputs.py index 108ae45..e726263 100644 --- a/src/code_inputs.py +++ b/src/code_inputs.py @@ -30,11 +30,11 @@ torch.manual_seed(0) # General inputs -run_folder = '/Users/maitreyeesharma/WORKSPACE/PostDoc/EngChem/MatDisc_ML/python_notebook_bo/' # Folder where code is run and input json exist +run_folder = '/Users/nikhilthota/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery' # Folder where code is run and input json exist num_run = 3 test_size = 0.01 -output_folder = run_folder+'../bo_output/' # Folder where all outputs are stored -output_folder = output_folder+'Space@Hopkins_recommendations/mpea_hv_forEddie_'+str(test_size)+'p_ThirdPass_Mar5_24/' +output_folder = run_folder+'/bo_output/' # Folder where all outputs are stored +output_folder = output_folder+'test' verbose = True deep_verbose = False diff --git a/src/input_class.py b/src/input_class.py index 2bf5d13..51ee96d 100644 --- a/src/input_class.py +++ b/src/input_class.py @@ -40,6 +40,7 @@ def __init__(self,input_type='COF',input_path='../',input_file='properties.csv') self.input_path = input_path self.input_file = input_file self.filename = self.input_path + self.input_file + self.data = data = pd.read_csv(self.filename) def read_inputs(self): ''' @@ -48,25 +49,33 @@ def read_inputs(self): input_path='.', input_file='properties.csv' ''' - data = pd.read_csv(self.filename) - descriptors = ['dimensions', 'bond type', 'void fraction [widom]', 'supercell volume [A^3]', 'density [kg/m^3]', - 'heat desorption high P [kJ/mol]','absolute methane uptake high P [molec/unit cell]', - 'absolute methane uptake high P [mol/kg]', 'excess methane uptake high P [molec/unit cell]', - 'excess methane uptake high P [mol/kg]', 'heat desorption low P [kJ/mol]', - 'absolute methane uptake low P [molec/unit cell]', - 'absolute methane uptake low P [mol/kg]', - 'excess methane uptake low P [molec/unit cell]', - 'excess methane uptake low P [mol/kg]', 'surface area [m^2/g]', 'linkerA', 'linkerB', 'net', - 'cell_a [A]', 'cell_b [A]', 'cell_c [A]', 'alpha [deg]', 'beta [deg]', 'gamma [deg]', - 'num carbon', 'num fluorine', 'num hydrogen', 'num nitrogen', 'num oxygen', 'num sulfur', - 'num silicon', 'vertices', 'edges', 'genus', 'largest included sphere diameter [A]', - 'largest free sphere diameter [A]', 'largest included sphere along free sphere path diameter [A]', - 'absolute methane uptake high P [v STP/v]', 'absolute methane uptake low P [v STP/v]]'] - XX = pd.DataFrame(data, columns=descriptors) - target = copy.deepcopy(data['deliverable capacity [v STP/v]'].to_numpy()) +# XX_comp = data.iloc[:,38:45] + descriptors = ['dimensions', ' bond type', ' void fraction [widom]', ' supercell volume [A^3]', ' density [kg/m^3]', + ' heat desorption high P [kJ/mol]',' absolute methane uptake high P [molec/unit cell]', + ' absolute methane uptake high P [mol/kg]', ' excess methane uptake high P [molec/unit cell]', + ' excess methane uptake high P [mol/kg]', ' heat desorption low P [kJ/mol]', + ' absolute methane uptake low P [molec/unit cell]', + ' absolute methane uptake low P [mol/kg]', + ' excess methane uptake low P [molec/unit cell]', + ' excess methane uptake low P [mol/kg]', ' surface area [m^2/g]', ' linkerA', ' linkerB', ' net', + ' cell_a [A]', ' cell_b [A]', ' cell_c [A]', ' alpha [deg]', ' beta [deg]', ' gamma [deg]', + ' num carbon', ' num fluorine', ' num hydrogen', ' num nitrogen', ' num oxygen', ' num sulfur', + ' num silicon', ' vertices', ' edges', ' genus', ' largest included sphere diameter [A]', + ' largest free sphere diameter [A]', ' largest included sphere along free sphere path diameter [A]', + ' absolute methane uptake high P [v STP/v]', ' absolute methane uptake low P [v STP/v]'] + XX_prop = pd.DataFrame(self.data, columns=descriptors) + target = copy.deepcopy(self.data[' deliverable capacity [v STP/v]'].to_numpy()) YY = target.reshape(-1,1) - return XX, YY, descriptors + return XX_prop, YY, descriptors + + def get_comp(self): + XX_comp_df = self.data.iloc[:,38:45] + YY = self.data.iloc[:, 27] + # YY_df = pd.DataFrame(YY) + # YY = copy.deepcopy(self.data[' deliverable capacity [v STP/v]'].to_numpy()) + return XX_comp_df, YY + if __name__=="__main__": diff --git a/src/kmeans.py b/src/kmeans.py new file mode 100644 index 0000000..faf0992 --- /dev/null +++ b/src/kmeans.py @@ -0,0 +1,30 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans +from sklearn import metrics +from sklearn.metrics import silhouette_samples + +def k_means(X, y, k=3): + """ + Perform k-means clustering on the given input X and label y + """ + + # Perform k-means clustering + kmeans = KMeans(n_clusters=k, random_state=0).fit(X) + labels = kmeans.labels_ + X['cluster'] = labels + data = pd.concat([X, y], axis=1) + clustered_dfs = [data[data['cluster'] == i].drop('cluster', axis=1) for i in range(k)] + + return clustered_dfs + +def draw_samples(clustered_dfs, sample_fraction = 0.01, random_state = 42): + sample_dfs = [] + for df in clustered_dfs: + sample_size = int(len(df) * sample_fraction) + sample_dfs.append(df.sample(n=sample_size, random_state=random_state)) + return sample_dfs + +def concat(dfs_list): + return pd.concat(dfs_list, ignore_index=True) \ No newline at end of file diff --git a/src/surrogate_models.py b/src/surrogate_models.py index 6e90c68..a96e173 100644 --- a/src/surrogate_models.py +++ b/src/surrogate_models.py @@ -69,7 +69,7 @@ def train_surrogate_gp0(X_train,Y_train): for i in range(training_iter): optimizer.zero_grad() # Zero gradients from previous iteration output = model_gp0(X_train) # Output from model - loss = -mll(output, Y_train) # Calc loss and backprop gradients + loss = -mll(output, Y_train) # Calc loss and backprop gradients loss.backward() optimizer.step() diff --git a/src/test.ipynb b/src/test.ipynb index ada635f..8d0862b 100644 --- a/src/test.ipynb +++ b/src/test.ipynb @@ -8,53 +8,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "2234ceda", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/botorch/models/gp_regression.py:161: UserWarning: The model inputs are of type torch.float32. It is strongly recommended to use double precision in BoTorch, as this improves both precision and stability and can help avoid numerical errors. See https://github.com/pytorch/botorch/discussions/1444\n", - " self._validate_tensor_args(X=transformed_X, Y=train_Y, Yvar=train_Yvar)\n" - ] - }, - { - "data": { - "text/plain": [ - "ExactMarginalLogLikelihood(\n", - " (likelihood): GaussianLikelihood(\n", - " (noise_covar): HomoskedasticNoise(\n", - " (noise_prior): GammaPrior()\n", - " (raw_noise_constraint): GreaterThan(1.000E-04)\n", - " )\n", - " )\n", - " (model): SingleTaskGP(\n", - " (likelihood): GaussianLikelihood(\n", - " (noise_covar): HomoskedasticNoise(\n", - " (noise_prior): GammaPrior()\n", - " (raw_noise_constraint): GreaterThan(1.000E-04)\n", - " )\n", - " )\n", - " (mean_module): ConstantMean()\n", - " (covar_module): ScaleKernel(\n", - " (base_kernel): MaternKernel(\n", - " (lengthscale_prior): GammaPrior()\n", - " (raw_lengthscale_constraint): Positive()\n", - " )\n", - " (outputscale_prior): GammaPrior()\n", - " (raw_outputscale_constraint): Positive()\n", - " )\n", - " )\n", - ")" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import torch\n", "from botorch.models import SingleTaskGP\n", @@ -74,19 +31,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d4524f72", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15.3 ms ± 6.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", - "786 ns ± 94.2 ns per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" - ] - } - ], + "outputs": [], "source": [ "from joblib import Parallel, delayed\n", "\n", @@ -101,46 +49,28 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "92f541d2", "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'deliverable capacity [v STP/v]'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'deliverable capacity [v STP/v]'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 7\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01minput_class\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m inputs\n\u001b[1;32m 5\u001b[0m inputs_obj \u001b[38;5;241m=\u001b[39m inputs()\n\u001b[0;32m----> 7\u001b[0m XX, YY \u001b[38;5;241m=\u001b[39m \u001b[43minputs_obj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_inputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery/src/input_class.py:66\u001b[0m, in \u001b[0;36minputs.read_inputs\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 52\u001b[0m descriptors \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdimensions\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbond type\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mvoid fraction [widom]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msupercell volume [A^3]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdensity [kg/m^3]\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m 53\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mheat desorption high P [kJ/mol]\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute methane uptake high P [molec/unit cell]\u001b[39m\u001b[38;5;124m'\u001b[39m, \n\u001b[1;32m 54\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute methane uptake high P [mol/kg]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mexcess methane uptake high P [molec/unit cell]\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlargest free sphere diameter [A]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlargest included sphere along free sphere path diameter [A]\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 64\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute methane uptake high P [v STP/v]\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mabsolute methane uptake low P [v STP/v]]\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 65\u001b[0m XX \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(data, columns\u001b[38;5;241m=\u001b[39mdescriptors)\n\u001b[0;32m---> 66\u001b[0m target \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdeliverable capacity [v STP/v]\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mto_numpy())\n\u001b[1;32m 67\u001b[0m YY \u001b[38;5;241m=\u001b[39m target\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m XX, YY, descriptors\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", - "\u001b[0;31mKeyError\u001b[0m: 'deliverable capacity [v STP/v]'" - ] - } - ], + "outputs": [], "source": [ "# Read the inputs\n", "\n", + "from utils_dataset import generate_training_data\n", "from input_class import inputs\n", "\n", - "inputs_obj = inputs()\n", + "# X_train, X_test, Y_train, Y_test, Var_train, Var_test, scalerX_transform, scalerY_transform = generate_training_data(0, 0.1)\n", + "\n", + "input_obj = inputs()\n", + "XX, YY, descriptors = input_obj.read_inputs()\n", "\n", - "XX, YY = inputs_obj.read_inputs()" + "print(XX)\n", + "print(YY)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "id": "97d372f5", "metadata": {}, "outputs": [ @@ -148,16 +78,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "tensor([[0.9786],\n", - " [0.7992],\n", - " [0.4615],\n", - " [0.7805],\n", - " [0.1183],\n", - " [0.6399],\n", - " [0.1434],\n", - " [0.9447],\n", - " [0.5218],\n", - " [0.4147]], dtype=torch.float64)\n" + "Using cpu device\n", + "MultivariateNormal(loc: torch.Size([2]))\n", + "tensor([1.0886, 1.1504], dtype=torch.float64, grad_fn=)\n" ] }, { @@ -167,13 +90,8 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m train_Y \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mfrom_numpy(np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mrand(\u001b[38;5;241m10\u001b[39m,\u001b[38;5;241m1\u001b[39m))\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(train_Y)\n\u001b[0;32m----> 8\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimeit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m-n 10 train_surrogate_gp0(train_X, train_Y)\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/IPython/core/interactiveshell.py:2480\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2478\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2479\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2480\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2482\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2483\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2484\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2485\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/IPython/core/magics/execution.py:1189\u001b[0m, in \u001b[0;36mExecutionMagics.timeit\u001b[0;34m(self, line, cell, local_ns)\u001b[0m\n\u001b[1;32m 1186\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time_number \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.2\u001b[39m:\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[0;32m-> 1189\u001b[0m all_runs \u001b[38;5;241m=\u001b[39m \u001b[43mtimer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepeat\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepeat\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumber\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1190\u001b[0m best \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmin\u001b[39m(all_runs) \u001b[38;5;241m/\u001b[39m number\n\u001b[1;32m 1191\u001b[0m worst \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmax\u001b[39m(all_runs) \u001b[38;5;241m/\u001b[39m number\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/timeit.py:208\u001b[0m, in \u001b[0;36mTimer.repeat\u001b[0;34m(self, repeat, number)\u001b[0m\n\u001b[1;32m 206\u001b[0m r \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 207\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(repeat):\n\u001b[0;32m--> 208\u001b[0m t \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimeit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnumber\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 209\u001b[0m r\u001b[38;5;241m.\u001b[39mappend(t)\n\u001b[1;32m 210\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/IPython/core/magics/execution.py:173\u001b[0m, in \u001b[0;36mTimer.timeit\u001b[0;34m(self, number)\u001b[0m\n\u001b[1;32m 171\u001b[0m gc\u001b[38;5;241m.\u001b[39mdisable()\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 173\u001b[0m timing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 175\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gcold:\n", - "File \u001b[0;32m:1\u001b[0m, in \u001b[0;36minner\u001b[0;34m(_it, _timer)\u001b[0m\n", - "File \u001b[0;32m~/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery/src/surrogate_models.py:73\u001b[0m, in \u001b[0;36mtrain_surrogate_gp0\u001b[0;34m(X_train, Y_train)\u001b[0m\n\u001b[1;32m 71\u001b[0m output \u001b[38;5;241m=\u001b[39m model_gp0(X_train) \u001b[38;5;66;03m# Output from model\u001b[39;00m\n\u001b[1;32m 72\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39mmll(output, Y_train) \u001b[38;5;66;03m# Calc loss and backprop gradients \u001b[39;00m\n\u001b[0;32m---> 73\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 74\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model_gp0, likelihood_gp0\n", + "Cell \u001b[0;32mIn[1], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m XX \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mrand(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m2\u001b[39m))\n\u001b[1;32m 6\u001b[0m YY \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mrand(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m1\u001b[39m))\n\u001b[0;32m----> 8\u001b[0m \u001b[43mtrain_surrogate_gp0\u001b[49m\u001b[43m(\u001b[49m\u001b[43mXX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mYY\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery/src/surrogate_models.py:75\u001b[0m, in \u001b[0;36mtrain_surrogate_gp0\u001b[0;34m(X_train, Y_train)\u001b[0m\n\u001b[1;32m 73\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39mmll(output, Y_train) \u001b[38;5;66;03m# Calc loss and backprop gradients \u001b[39;00m\n\u001b[1;32m 74\u001b[0m \u001b[38;5;28mprint\u001b[39m(loss) \n\u001b[0;32m---> 75\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 76\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model_gp0, likelihood_gp0\n", "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/autograd/__init__.py:259\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 250\u001b[0m inputs \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 251\u001b[0m (inputs,)\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(inputs, (torch\u001b[38;5;241m.\u001b[39mTensor, graph\u001b[38;5;241m.\u001b[39mGradientEdge))\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m()\n\u001b[1;32m 256\u001b[0m )\n\u001b[1;32m 258\u001b[0m grad_tensors_ \u001b[38;5;241m=\u001b[39m _tensor_or_tensors_to_tuple(grad_tensors, \u001b[38;5;28mlen\u001b[39m(tensors))\n\u001b[0;32m--> 259\u001b[0m grad_tensors_ \u001b[38;5;241m=\u001b[39m \u001b[43m_make_grads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_grads_batched\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m retain_graph \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n", "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/autograd/__init__.py:132\u001b[0m, in \u001b[0;36m_make_grads\u001b[0;34m(outputs, grads, is_grads_batched)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out\u001b[38;5;241m.\u001b[39mrequires_grad:\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out\u001b[38;5;241m.\u001b[39mnumel() \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 133\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgrad can be implicitly created only for scalar outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 134\u001b[0m )\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m out\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mis_floating_point:\n\u001b[1;32m 136\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 137\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgrad can be implicitly created only for real scalar outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mout\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 139\u001b[0m )\n", @@ -184,12 +102,12 @@ "source": [ "from surrogate_models import train_surrogate_gp0\n", "import numpy as np\n", + "import torch\n", "\n", - "train_X = torch.randn(10,2, requires_grad=True)\n", - "train_Y = torch.from_numpy(np.random.rand(10,1))\n", - "print(train_Y)\n", + "XX = torch.tensor(np.random.rand(2, 2))\n", + "YY = torch.tensor(np.random.rand(2, 1))\n", "\n", - "%timeit -n 10 train_surrogate_gp0(train_X, train_Y)" + "train_surrogate_gp0(XX, YY)" ] }, { diff --git a/src/utils_dataset.py b/src/utils_dataset.py index b5c5143..5015a41 100644 --- a/src/utils_dataset.py +++ b/src/utils_dataset.py @@ -60,28 +60,30 @@ def standardize_test_data(x,scalerX): def generate_training_data(random_state,test_size): - # Reading the input json file with dataset filename and path information - with open(model_input.run_folder+'inputs.json', "r") as f: - input_dict = json.load(f) + # todo: Modify below to read from code_inputs.py + # # Reading the input json file with dataset filename and path information + # with open(model_input.run_folder+'inputs.json', "r") as f: + # input_dict = json.load(f) - input_type = input_dict['InputType'] - input_path = input_dict['InputPath'] - input_file = input_dict['InputFile'] - add_target_noise = input_dict['AddTargetNoise'] - - input = input_class.inputs(input_type=input_type, - input_path=input_path, - input_file=input_file, - add_target_noise=add_target_noise) + # input_type = input_dict['InputType'] + # input_path = input_dict['InputPath'] + # input_file = input_dict['InputFile'] + # add_target_noise = input_dict['AddTargetNoise'] + + # input = input_class.inputs(input_type=input_type, + # input_path=input_path, + # input_file=input_file, + # add_target_noise=add_target_noise) - XX, YY, descriptors = input.read_inputs(model_input.verbose) + input_obj = input_class.inputs() + XX, YY, descriptors = input_obj.read_inputs() # Transforming datasets by standardization if model_input.standardize_data: X_stand, scalerX_transform = utilsd.standardize_data(XX) Y_stand, scalerY_transform = utilsd.standardize_data(YY) else: - X_stand=XX.to_numpy() + X_stand = XX.to_numpy() Y_stand = YY # Checking if we should use xgboost recommended descriptors or all descriptors From 37ddb09241e1e56c20bca17db0f87bc11f19bdb9 Mon Sep 17 00:00:00 2001 From: Maitreyee Sharma Date: Thu, 28 Mar 2024 19:59:09 -0400 Subject: [PATCH 4/4] Mod(BO): added epsilon-greedy selection in the last cell --- src/BO.ipynb | 285 ++++++++++++++++++++++++++------------------------- 1 file changed, 147 insertions(+), 138 deletions(-) diff --git a/src/BO.ipynb b/src/BO.ipynb index 442ba6e..68e4324 100644 --- a/src/BO.ipynb +++ b/src/BO.ipynb @@ -6,6 +6,16 @@ "id": "a45452e9-567c-4658-bfa1-f9a6f6b70bd1", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'dlopen(/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowIxEET_S2_S2_b\n", + " Referenced from: /Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/image.so\n", + " Expected in: /Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torch/lib/libc10.dylib'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n", + " warn(\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -16,7 +26,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 1, @@ -313,31 +323,31 @@ "" ], "text/plain": [ - " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", - "0 360 0 216 144 72 \n", - "1 360 0 216 144 144 \n", - "2 432 0 360 144 72 \n", - "3 360 0 144 216 216 \n", - "4 360 0 144 216 216 \n", - "... ... ... ... ... ... \n", - "69835 996 0 576 96 0 \n", - "69836 1020 0 576 48 0 \n", - "69837 1360 0 768 64 0 \n", - "69838 1888 0 1152 128 128 \n", - "69839 536 0 288 32 0 \n", + " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", + "0 360 0 216 144 72 \n", + "1 360 0 216 144 144 \n", + "2 432 0 360 144 72 \n", + "3 360 0 144 216 216 \n", + "4 360 0 144 216 216 \n", + "... ... ... ... ... ... \n", + "69835 996 0 576 96 0 \n", + "69836 1020 0 576 48 0 \n", + "69837 1360 0 768 64 0 \n", + "69838 1888 0 1152 128 128 \n", + "69839 536 0 288 32 0 \n", "\n", - " num sulfur num silicon \n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "... ... ... \n", - "69835 0 0 \n", - "69836 0 0 \n", - "69837 0 0 \n", - "69838 0 0 \n", - "69839 0 0 \n", + " num sulfur num silicon \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "... ... ... \n", + "69835 0 0 \n", + "69836 0 0 \n", + "69837 0 0 \n", + "69838 0 0 \n", + "69839 0 0 \n", "\n", "[69840 rows x 7 columns]" ] @@ -358,6 +368,14 @@ "id": "fe95ea1f", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + " warnings.warn(\n" + ] + }, { "data": { "text/html": [ @@ -392,58 +410,58 @@ " \n", " \n", " 0\n", - " 832\n", - " 0\n", - " 448\n", - " 384\n", + " 1152\n", " 0\n", + " 1024\n", + " 256\n", " 0\n", " 0\n", - " 165.565439\n", + " 32\n", + " 145.407912\n", " \n", " \n", " 1\n", " 1152\n", " 0\n", - " 832\n", - " 128\n", - " 64\n", + " 1216\n", + " 256\n", " 0\n", " 0\n", - " 152.524690\n", + " 0\n", + " 95.429169\n", " \n", " \n", " 2\n", - " 1376\n", + " 768\n", " 0\n", - " 896\n", - " 256\n", - " 64\n", + " 512\n", " 0\n", " 0\n", - " 115.996501\n", + " 0\n", + " 32\n", + " 168.557294\n", " \n", " \n", " 3\n", - " 864\n", + " 1128\n", " 0\n", - " 720\n", + " 576\n", + " 192\n", " 192\n", " 0\n", " 0\n", - " 0\n", - " 143.024802\n", + " 115.141985\n", " \n", " \n", " 4\n", - " 1088\n", - " 0\n", - " 768\n", - " 128\n", + " 756\n", " 0\n", + " 486\n", + " 108\n", + " 18\n", " 0\n", " 0\n", - " 153.528996\n", + " 116.641903\n", " \n", " \n", " ...\n", @@ -458,58 +476,58 @@ " \n", " \n", " 692\n", - " 1968\n", + " 1792\n", + " 0\n", + " 768\n", " 0\n", - " 1200\n", - " 288\n", - " 48\n", " 0\n", " 0\n", - " 116.161354\n", + " 0\n", + " 193.030421\n", " \n", " \n", " 693\n", - " 2048\n", - " 0\n", - " 1024\n", - " 640\n", + " 2304\n", " 0\n", + " 1392\n", + " 288\n", + " 96\n", " 0\n", " 0\n", - " 152.702060\n", + " 107.276586\n", " \n", " \n", " 694\n", - " 1536\n", + " 1620\n", " 0\n", - " 1440\n", - " 384\n", + " 1080\n", + " 360\n", " 0\n", " 0\n", " 0\n", - " 99.338457\n", + " 176.029207\n", " \n", " \n", " 695\n", - " 1440\n", - " 0\n", - " 1104\n", - " 384\n", + " 1664\n", " 0\n", + " 1024\n", + " 640\n", + " 512\n", " 0\n", " 0\n", - " 135.714021\n", + " 164.708736\n", " \n", " \n", " 696\n", - " 1536\n", + " 2304\n", " 0\n", + " 576\n", " 1152\n", - " 768\n", " 0\n", " 0\n", " 0\n", - " 133.680986\n", + " 113.888695\n", " \n", " \n", "\n", @@ -517,31 +535,31 @@ "" ], "text/plain": [ - " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", - "0 832 0 448 384 0 \n", - "1 1152 0 832 128 64 \n", - "2 1376 0 896 256 64 \n", - "3 864 0 720 192 0 \n", - "4 1088 0 768 128 0 \n", - ".. ... ... ... ... ... \n", - "692 1968 0 1200 288 48 \n", - "693 2048 0 1024 640 0 \n", - "694 1536 0 1440 384 0 \n", - "695 1440 0 1104 384 0 \n", - "696 1536 0 1152 768 0 \n", + " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", + "0 1152 0 1024 256 0 \n", + "1 1152 0 1216 256 0 \n", + "2 768 0 512 0 0 \n", + "3 1128 0 576 192 192 \n", + "4 756 0 486 108 18 \n", + ".. ... ... ... ... ... \n", + "692 1792 0 768 0 0 \n", + "693 2304 0 1392 288 96 \n", + "694 1620 0 1080 360 0 \n", + "695 1664 0 1024 640 512 \n", + "696 2304 0 576 1152 0 \n", "\n", - " num sulfur num silicon deliverable capacity [v STP/v] \n", - "0 0 0 165.565439 \n", - "1 0 0 152.524690 \n", - "2 0 0 115.996501 \n", - "3 0 0 143.024802 \n", - "4 0 0 153.528996 \n", - ".. ... ... ... \n", - "692 0 0 116.161354 \n", - "693 0 0 152.702060 \n", - "694 0 0 99.338457 \n", - "695 0 0 135.714021 \n", - "696 0 0 133.680986 \n", + " num sulfur num silicon deliverable capacity [v STP/v] \n", + "0 0 32 145.407912 \n", + "1 0 0 95.429169 \n", + "2 0 32 168.557294 \n", + "3 0 0 115.141985 \n", + "4 0 0 116.641903 \n", + ".. ... ... ... \n", + "692 0 0 193.030421 \n", + "693 0 0 107.276586 \n", + "694 0 0 176.029207 \n", + "695 0 0 164.708736 \n", + "696 0 0 113.888695 \n", "\n", "[697 rows x 8 columns]" ] @@ -569,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "955bc734-96c5-4d3a-9325-920c041e256b", "metadata": {}, "outputs": [], @@ -621,7 +639,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "72bb9112-1749-44fd-bc9d-7c0edb2e59a6", "metadata": {}, "outputs": [], @@ -732,7 +750,6 @@ " #--------------------------- GP-0 ---------------------------#\n", " likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()\n", " model_gp0 = surrogate_models.ExactGPModel(X_train0, Y_train0, likelihood_gp0) \n", - " # AcqFunc_0 = UpperConfidenceBound(model_gp0, beta=0.1) \n", " AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_observed_value, maximize=model_input.maximization)\n", " best_observed0.append(best_observed_value) # Appending to best_observed list for the given trial\n", " \n", @@ -789,55 +806,35 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "id": "bfb03e8a", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Using cpu device\n", - "Using cpu device\n", - "Using cpu device\n", - "\n", - " -------------------- Trial 1 of 5 --------------------\n", + "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py:702: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n", + " warnings.warn(\n", "\n", - " -------------------- Trial 1 of 5 --------------------\n", - "\n", - " -------------------- Trial 1 of 5 --------------------\n", - "Max in training set, removing it before training models.\n", - "Max in training set, removing it before training models.Max in training set, removing it before training models.\n", - "\n", - "tensor([[-0.8793, 0.0000, -0.7922, 1.8256, -0.5337, -0.1508, -0.1508]])\n", - "tensor([[-0.4935, 0.0000, -1.7229, -0.7278, 0.7059, -0.1508, -0.1508]])\n", - "tensor([[-1.0916]])\n", - "\n", - "Batch 1: best_value (GP-0) = (1.79tensor([[-0.4935, 0.0000, -1.7229, -0.7278, 0.7059, -0.1508, -0.1508]])\n", - "tensor([], size=(0, 7))\n", - "tensor([], size=(1, 0))\n", - "\n", - "Batch 2: best_value (GP-0) = (1.79" + "KeyboardInterrupt\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error in callback > (for post_execute):\n" ] }, { - "ename": "InputDataError", - "evalue": "`choices` must be non-emtpy.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31m_RemoteTraceback\u001b[0m Traceback (most recent call last)", - "\u001b[0;31m_RemoteTraceback\u001b[0m: \n\"\"\"\nTraceback (most recent call last):\n File \"/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py\", line 463, in _process_worker\n r = call_item()\n ^^^^^^^^^^^\n File \"/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py\", line 291, in __call__\n return self.fn(*self.args, **self.kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py\", line 589, in __call__\n return [func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/var/folders/k2/lw7ggcln7zx_cnkbk2zbgvm80000gn/T/ipykernel_16086/3447261974.py\", line 119, in train_gp\n File \"/var/folders/k2/lw7ggcln7zx_cnkbk2zbgvm80000gn/T/ipykernel_16086/1975869805.py\", line 11, in optimize_acqf_and_get_observation\n File \"/Users/nikhilthota/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/botorch/optim/optimize.py\", line 1053, in optimize_acqf_discrete\n raise InputDataError(\"`choices` must be non-emtpy.\")\nbotorch.exceptions.errors.InputDataError: `choices` must be non-emtpy.\n\"\"\"", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mInputDataError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 20\u001b[0m\n\u001b[1;32m 14\u001b[0m shutil\u001b[38;5;241m.\u001b[39mcopy2(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msurrogate_models.py\u001b[39m\u001b[38;5;124m'\u001b[39m,model_input\u001b[38;5;241m.\u001b[39moutput_folder)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Training a single GP for test\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# a = train_gp(0)\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Train the cluster of GP models in a parallel for loop\u001b[39;00m\n\u001b[0;32m---> 20\u001b[0m best_observed_all_ei0 \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_gp\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mrange\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mnum_cluster\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:1952\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 1946\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[1;32m 1947\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[1;32m 1948\u001b[0m \u001b[38;5;66;03m# reach the first `yield` statement. This starts the aynchronous\u001b[39;00m\n\u001b[1;32m 1949\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[1;32m 1950\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[0;32m-> 1952\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:1595\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[0;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[1;32m 1592\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[1;32m 1594\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[0;32m-> 1595\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[1;32m 1597\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[1;32m 1598\u001b[0m \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[1;32m 1599\u001b[0m \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[1;32m 1600\u001b[0m \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[1;32m 1601\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:1699\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1692\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_wait_retrieval():\n\u001b[1;32m 1693\u001b[0m \n\u001b[1;32m 1694\u001b[0m \u001b[38;5;66;03m# If the callback thread of a worker has signaled that its task\u001b[39;00m\n\u001b[1;32m 1695\u001b[0m \u001b[38;5;66;03m# triggered an exception, or if the retrieval loop has raised an\u001b[39;00m\n\u001b[1;32m 1696\u001b[0m \u001b[38;5;66;03m# exception (e.g. `GeneratorExit`), exit the loop and surface the\u001b[39;00m\n\u001b[1;32m 1697\u001b[0m \u001b[38;5;66;03m# worker traceback.\u001b[39;00m\n\u001b[1;32m 1698\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_aborting:\n\u001b[0;32m-> 1699\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_error_fast\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1700\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 1702\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[1;32m 1703\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:1734\u001b[0m, in \u001b[0;36mParallel._raise_error_fast\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1730\u001b[0m \u001b[38;5;66;03m# If this error job exists, immediatly raise the error by\u001b[39;00m\n\u001b[1;32m 1731\u001b[0m \u001b[38;5;66;03m# calling get_result. This job might not exists if abort has been\u001b[39;00m\n\u001b[1;32m 1732\u001b[0m \u001b[38;5;66;03m# called directly or if the generator is gc'ed.\u001b[39;00m\n\u001b[1;32m 1733\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_job \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1734\u001b[0m \u001b[43merror_job\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_result\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:736\u001b[0m, in \u001b[0;36mBatchCompletionCallBack.get_result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 730\u001b[0m backend \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparallel\u001b[38;5;241m.\u001b[39m_backend\n\u001b[1;32m 732\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m backend\u001b[38;5;241m.\u001b[39msupports_retrieve_callback:\n\u001b[1;32m 733\u001b[0m \u001b[38;5;66;03m# We assume that the result has already been retrieved by the\u001b[39;00m\n\u001b[1;32m 734\u001b[0m \u001b[38;5;66;03m# callback thread, and is stored internally. It's just waiting to\u001b[39;00m\n\u001b[1;32m 735\u001b[0m \u001b[38;5;66;03m# be returned.\u001b[39;00m\n\u001b[0;32m--> 736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_return_or_raise\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 738\u001b[0m \u001b[38;5;66;03m# For other backends, the main thread needs to run the retrieval step.\u001b[39;00m\n\u001b[1;32m 739\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/joblib/parallel.py:754\u001b[0m, in \u001b[0;36mBatchCompletionCallBack._return_or_raise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 752\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 753\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m==\u001b[39m TASK_ERROR:\n\u001b[0;32m--> 754\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_result\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_result\n\u001b[1;32m 756\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n", - "\u001b[0;31mInputDataError\u001b[0m: `choices` must be non-emtpy." + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "KeyboardInterrupt\n", + "\n" ] } ], @@ -872,7 +869,19 @@ "id": "5ddae264", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Epsilon greedy method:\n", + "\n", + "random_number = np.random.rand()\n", + "# Explore using the Epsilon Greedy Exploration Strategy\n", + "if random_number <= epsilon:\n", + " # Explore\n", + " best_observed = ## replace here with the randomly picking out one cluster's BO recommendation\n", + "else:\n", + " # Exploit best known action\n", + " best_observed = ## replace here with the best output from the BO recommedations of the local GPs\n", + " " + ] } ], "metadata": { @@ -891,7 +900,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.11.2" } }, "nbformat": 4,