diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..84ca336
Binary files /dev/null and b/.DS_Store differ
diff --git a/bo_output/.DS_Store b/bo_output/.DS_Store
new file mode 100644
index 0000000..1ce14b0
Binary files /dev/null and b/bo_output/.DS_Store differ
diff --git a/bo_output/test/scalerX_0.joblib b/bo_output/test/scalerX_0.joblib
new file mode 100644
index 0000000..86a99f4
Binary files /dev/null and b/bo_output/test/scalerX_0.joblib differ
diff --git a/bo_output/test/scalerX_1.joblib b/bo_output/test/scalerX_1.joblib
new file mode 100644
index 0000000..9fa980c
Binary files /dev/null and b/bo_output/test/scalerX_1.joblib differ
diff --git a/bo_output/test/scalerX_2.joblib b/bo_output/test/scalerX_2.joblib
new file mode 100644
index 0000000..a4e2b81
Binary files /dev/null and b/bo_output/test/scalerX_2.joblib differ
diff --git a/bo_output/test/scalerY_0.joblib b/bo_output/test/scalerY_0.joblib
new file mode 100644
index 0000000..d20bc11
Binary files /dev/null and b/bo_output/test/scalerY_0.joblib differ
diff --git a/bo_output/test/scalerY_1.joblib b/bo_output/test/scalerY_1.joblib
new file mode 100644
index 0000000..78f3131
Binary files /dev/null and b/bo_output/test/scalerY_1.joblib differ
diff --git a/bo_output/test/scalerY_2.joblib b/bo_output/test/scalerY_2.joblib
new file mode 100644
index 0000000..ee98cec
Binary files /dev/null and b/bo_output/test/scalerY_2.joblib differ
diff --git a/bo_output/test/surrogate_models.py b/bo_output/test/surrogate_models.py
new file mode 100644
index 0000000..a96e173
--- /dev/null
+++ b/bo_output/test/surrogate_models.py
@@ -0,0 +1,95 @@
+# This file constructs surrogate models for the input datasets
+import numpy as np
+import os
+import json
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+import math
+
+# Torch specific module imports
+import torch
+import gpytorch
+
+# botorch specific modules
+from botorch.fit import fit_gpytorch_model
+from botorch.models.gpytorch import GPyTorchModel
+
+# Plotting libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+# User defined python classes and files
+import utils_dataset as utilsd
+import input_class
+import code_inputs as model_input
+
+np.random.seed(0)
+torch.manual_seed(0)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using {device} device")
+
+# We will use the simplest form of GP model, exact inference
+class ExactGPModel(gpytorch.models.ExactGP, GPyTorchModel):
+    _num_outputs = 1  # to inform GPyTorchModel API
+    MIN_INFERRED_NOISE_LEVEL = 1e-5
+
+    def __init__(self, train_x, train_y, likelihood):
+        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
+        self.mean_module = gpytorch.means.ConstantMean()
+        if model_input.kernel == 'RBF':
+            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
+        elif model_input.kernel == 'Matern':
+            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5))
+
+    def forward(self, x):
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+
+#--------------------------- GP-0 ---------------------------#
+def train_surrogate_gp0(X_train, Y_train):
+
+    mse_gp0 = 0.0
+    training_iter = model_input.epochs_GP0
+
+    # initialize likelihood and model
+    likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()
+    model_gp0 = ExactGPModel(X_train, Y_train, likelihood_gp0)
+
+    # Find optimal model hyperparameters
+    model_gp0.train()
+    likelihood_gp0.train()
+
+    # Use the adam optimizer
+    optimizer = torch.optim.Adam(model_gp0.parameters(), lr=model_input.learning_rate_gp0)  # Includes GaussianLikelihood parameters
+
+    # "Loss" for GPs - the marginal log likelihood
+    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood_gp0, model_gp0)
+
+    for i in range(training_iter):
+        optimizer.zero_grad()          # Zero gradients from previous iteration
+        output = model_gp0(X_train)    # Output from model
+        loss = -mll(output, Y_train)   # Calc loss and backprop gradients
+        loss.backward()
+        optimizer.step()
+
+    return model_gp0, likelihood_gp0
+
+def predict_surrogates(model, likelihood, X):
+
+    # Get into evaluation (predictive posterior) mode
+    model.eval()
+    likelihood.eval()
+
+    # Make predictions by passing the model posterior through the likelihood
+    # (the likelihood adds the inferred observation noise to the variance)
+    with torch.no_grad(), gpytorch.settings.fast_pred_var():
+        prediction = likelihood(model(X))
+
+    observed_mean = prediction.mean
+    observed_var = prediction.variance
+
+    return observed_mean, observed_var
\ No newline at end of file
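A minimal smoke test for the module above (a sketch, not part of the diff: the toy tensors are invented for illustration, and the module's `code_inputs` settings are assumed to be importable). Note the single-row (1 x N) label convention used throughout this PR:

```python
# Hedged usage sketch for surrogate_models.py (illustrative toy data only)
import torch
import surrogate_models

X_train = torch.rand(20, 7)   # 20 candidates, 7 composition features
Y_train = torch.rand(1, 20)   # labels as a single row (1 x N), per this PR's convention

model, likelihood = surrogate_models.train_surrogate_gp0(X_train, Y_train)
mean, var = surrogate_models.predict_surrogates(model, likelihood, torch.rand(5, 7))
print(mean.shape, var.shape)
```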
diff --git a/src/.DS_Store b/src/.DS_Store
new file mode 100644
index 0000000..e5b0917
Binary files /dev/null and b/src/.DS_Store differ
diff --git a/src/BO.ipynb b/src/BO.ipynb
new file mode 100644
index 0000000..68e4324
--- /dev/null
+++ b/src/BO.ipynb
@@ -0,0 +1,908 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a45452e9-567c-4658-bfa1-f9a6f6b70bd1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'dlopen(/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowIxEET_S2_S2_b\n",
+      "  Referenced from: /Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/image.so\n",
+      "  Expected in: /Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torch/lib/libc10.dylib'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n",
+      "  warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using cpu device\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# This file constructs surrogate models for the input datasets\n",
+    "import numpy as np \n",
+    "import pandas as pd\n",
+    "import os\n",
+    "import shutil\n",
+    "import json\n",
+    "import math\n",
+    "import time\n",
+    "import warnings\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from joblib import Parallel, delayed, dump\n",
+    "\n",
+    "# Torch specific module imports\n",
+    "import torch\n",
+    "import gpytorch \n",
+    "\n",
+    "# botorch specific modules\n",
+    "from botorch.fit import fit_gpytorch_model\n",
+    "from botorch.models.gpytorch import GPyTorchModel\n",
+    "from botorch.optim import optimize_acqf, optimize_acqf_discrete\n",
+    "from botorch import fit_gpytorch_mll\n",
+    "from botorch.acquisition.monte_carlo import (\n",
+    "    qExpectedImprovement,\n",
+    "    qNoisyExpectedImprovement,\n",
+    ")\n",
+    "from botorch.sampling.normal import SobolQMCNormalSampler\n",
+    "from botorch.exceptions import BadInitialCandidatesWarning\n",
+    "from botorch.acquisition import UpperConfidenceBound, ExpectedImprovement\n",
+    "\n",
+    "# Plotting libraries\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "# Tick parameters\n",
+    "plt.rcParams['xtick.labelsize'] = 15\n",
+    "plt.rcParams['ytick.labelsize'] = 15\n",
+    "plt.rcParams['xtick.major.size'] = 5\n",
+    "plt.rcParams['xtick.major.width'] = 1\n",
+    "plt.rcParams['xtick.minor.size'] = 5\n",
+    "plt.rcParams['xtick.minor.width'] = 1\n",
+    "plt.rcParams['ytick.major.size'] = 5\n",
+    "plt.rcParams['ytick.major.width'] = 1\n",
+    "plt.rcParams['ytick.minor.size'] = 5\n",
+    "plt.rcParams['ytick.minor.width'] = 1\n",
+    "\n",
+    "plt.rcParams['axes.labelsize'] = 15\n",
+    "plt.rcParams['axes.titlesize'] = 15\n",
+    "plt.rcParams['legend.fontsize'] = 15\n",
+    "\n",
+    "# User defined python classes and files\n",
+    "import input_class \n",
+    "import code_inputs as model_input\n",
+    "import utils_dataset as utilsd\n",
+    "import surrogate_models\n",
+    "import kmeans as km\n",
+    "\n",
+    "# Set the random seeds\n",
+    "np.random.seed(0)\n",
+    "torch.manual_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8b29bdc5",
+   "metadata": {},
+   "source": [
+    "#### K means clustering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a8de62ac",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['dimensions',\n",
+       " ' bond type',\n",
+       " ' void fraction [widom]',\n",
+       " ' supercell volume [A^3]',\n",
+       " ' density [kg/m^3]',\n",
+       " ' heat desorption high P [kJ/mol]',\n",
+       " ' absolute methane uptake high P [molec/unit cell]',\n",
+       " ' absolute methane uptake high P [mol/kg]',\n",
+       " ' excess methane uptake high P [molec/unit cell]',\n",
+       " ' excess methane uptake high P [mol/kg]',\n",
+       " ' heat desorption low P [kJ/mol]',\n",
+       " ' absolute methane uptake low P [molec/unit cell]',\n",
+       " ' absolute methane uptake low P [mol/kg]',\n",
+       " ' excess methane uptake low P [molec/unit cell]',\n",
+       " ' excess methane uptake low P [mol/kg]',\n",
+       " ' surface area [m^2/g]',\n",
+       " ' linkerA',\n",
+       " ' linkerB',\n",
+       " ' net',\n",
+       " ' cell_a [A]',\n",
+       " ' cell_b [A]',\n",
+       " ' cell_c [A]',\n",
+       " ' alpha [deg]',\n",
+       " ' beta [deg]',\n",
+       " ' gamma [deg]',\n",
+       " ' num carbon',\n",
+       " ' num fluorine',\n",
+       " ' num hydrogen',\n",
+       " ' num nitrogen',\n",
+       " ' num oxygen',\n",
+       " ' num sulfur',\n",
+       " ' num silicon',\n",
+       " ' vertices',\n",
+       " ' edges',\n",
+       " ' genus',\n",
+       " ' largest included sphere diameter [A]',\n",
+       " ' largest free sphere diameter [A]',\n",
+       " ' largest included sphere along free sphere path diameter [A]',\n",
+       " ' absolute methane uptake high P [v STP/v]',\n",
+       " ' absolute methane uptake low P [v STP/v]']"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Input = input_class.inputs(input_path='../')\n",
+    "XX_prop, YY, descriptors = Input.read_inputs()\n",
+    "descriptors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "147caa07",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num carbonnum fluorinenum hydrogennum nitrogennum oxygennum sulfurnum silicon
036002161447200
1360021614414400
243203601447200
3360014421621600
4360014421621600
........................
69835996057696000
698361020057648000
698371360076864000
6983818880115212812800
69839536028832000
\n", + "

69840 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", + "0 360 0 216 144 72 \n", + "1 360 0 216 144 144 \n", + "2 432 0 360 144 72 \n", + "3 360 0 144 216 216 \n", + "4 360 0 144 216 216 \n", + "... ... ... ... ... ... \n", + "69835 996 0 576 96 0 \n", + "69836 1020 0 576 48 0 \n", + "69837 1360 0 768 64 0 \n", + "69838 1888 0 1152 128 128 \n", + "69839 536 0 288 32 0 \n", + "\n", + " num sulfur num silicon \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "... ... ... \n", + "69835 0 0 \n", + "69836 0 0 \n", + "69837 0 0 \n", + "69838 0 0 \n", + "69839 0 0 \n", + "\n", + "[69840 rows x 7 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "XX_comp_df, YY_df = Input.get_comp()\n", + "XX_comp_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fe95ea1f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num carbonnum fluorinenum hydrogennum nitrogennum oxygennum sulfurnum silicondeliverable capacity [v STP/v]
01152010242560032145.407912
111520121625600095.429169
2768051200032168.557294
31128057619219200115.141985
475604861081800116.641903
...........................
692179207680000193.030421
6932304013922889600107.276586
694162001080360000176.029207
69516640102464051200164.708736
696230405761152000113.888695
\n", + "

697 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", + "0 1152 0 1024 256 0 \n", + "1 1152 0 1216 256 0 \n", + "2 768 0 512 0 0 \n", + "3 1128 0 576 192 192 \n", + "4 756 0 486 108 18 \n", + ".. ... ... ... ... ... \n", + "692 1792 0 768 0 0 \n", + "693 2304 0 1392 288 96 \n", + "694 1620 0 1080 360 0 \n", + "695 1664 0 1024 640 512 \n", + "696 2304 0 576 1152 0 \n", + "\n", + " num sulfur num silicon deliverable capacity [v STP/v] \n", + "0 0 32 145.407912 \n", + "1 0 0 95.429169 \n", + "2 0 32 168.557294 \n", + "3 0 0 115.141985 \n", + "4 0 0 116.641903 \n", + ".. ... ... ... \n", + "692 0 0 193.030421 \n", + "693 0 0 107.276586 \n", + "694 0 0 176.029207 \n", + "695 0 0 164.708736 \n", + "696 0 0 113.888695 \n", + "\n", + "[697 rows x 8 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_cluster = 3\n", + "clustered_dfs = km.k_means(XX_comp_df, YY_df, num_cluster)\n", + "sample_dfs = km.draw_samples(clustered_dfs, sample_fraction = 0.01)\n", + "samples = km.concat(sample_dfs)\n", + "samples" + ] + }, + { + "cell_type": "markdown", + "id": "35ed4601", + "metadata": {}, + "source": [ + "#### Acquisition function " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "955bc734-96c5-4d3a-9325-920c041e256b", + "metadata": {}, + "outputs": [], + "source": [ + "## TODO: TO BE Check\n", + "bounds = torch.tensor([[-10.0], [12.0]])\n", + "\n", + "batch_size = 1\n", + "num_restarts= 10 \n", + "raw_samples = 512\n", + "\n", + "def optimize_acqf_and_get_observation(acq_func, X_test, Y_test):\n", + " \"\"\"Optimizes the acquisition function, and returns a new candidate\"\"\"\n", + " # optimize\n", + " candidates, _ = optimize_acqf_discrete(\n", + " acq_function=acq_func,\n", + " choices=X_test,\n", + " q=batch_size,\n", + " max_batch_size=2048,\n", + " num_restarts=num_restarts,\n", + " raw_samples=raw_samples, # used for intialization heuristic\n", + " options={\"batch_limit\": 5, \"maxiter\": 200},\n", + " unique=True\n", + " )\n", + " \n", + " print(candidates)\n", + " # observe new values\n", + " new_x = candidates.detach()\n", + " b = [1 if torch.all(X_test[i].eq(new_x)) else 0 for i in range(0,X_test.shape[0]) ]\n", + " b = torch.tensor(b).to(torch.int)\n", + " index = b.nonzero()[0][0]\n", + " new_y = torch.reshape(Y_test[0,index],(1,1))\n", + " \n", + " X_test_new = X_test[torch.arange(0, X_test.shape[0]) != index, ...]\n", + " Y_test_new = Y_test[..., torch.arange(0, Y_test.shape[1]) != index]\n", + " print(X_test_new)\n", + " print(Y_test_new)\n", + " \n", + " return new_x, new_y, index, X_test_new, Y_test_new" + ] + }, + { + "cell_type": "markdown", + "id": "f93f668d", + "metadata": {}, + "source": [ + "#### GP Train Function" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "72bb9112-1749-44fd-bc9d-7c0edb2e59a6", + "metadata": {}, + "outputs": [], + "source": [ + "def create_train_test_data(cluster_dataXX, cluster_dataYY, random_seed):\n", + " if model_input.standardize_data:\n", + " cluster_dataXX, scalerX_transform = utilsd.standardize_data(cluster_dataXX)\n", + " cluster_dataYY, scalerY_transform = utilsd.standardize_data(cluster_dataYY.reshape(-1,1))\n", + " else:\n", + " scalerX_transform = None\n", + " scalerY_transform = None\n", + " \n", + " ## TODO : Incase for feature selection\n", + " # ....\n", + " # ....\n", + " # ....\n", + "\n", + " # Create train and test sets\n", + " X_train, X_test, Y_train, Y_test = 
+  {
+   "cell_type": "markdown",
+   "id": "f93f668d",
+   "metadata": {},
+   "source": [
+    "#### GP Train Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "72bb9112-1749-44fd-bc9d-7c0edb2e59a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_train_test_data(cluster_dataXX, cluster_dataYY, random_seed):\n",
+    "    if model_input.standardize_data:\n",
+    "        cluster_dataXX, scalerX_transform = utilsd.standardize_data(cluster_dataXX)\n",
+    "        cluster_dataYY, scalerY_transform = utilsd.standardize_data(cluster_dataYY.reshape(-1,1))\n",
+    "    else:\n",
+    "        scalerX_transform = None\n",
+    "        scalerY_transform = None\n",
+    "    \n",
+    "    ## TODO: hook for feature selection, in case it is needed here\n",
+    "    # ....\n",
+    "    # ....\n",
+    "    # ....\n",
+    "\n",
+    "    # Create train and test sets\n",
+    "    X_train, X_test, Y_train, Y_test = train_test_split(cluster_dataXX, cluster_dataYY, test_size=model_input.test_size, random_state=random_seed)\n",
+    "\n",
+    "    # Convert to tensors\n",
+    "    X_train = torch.tensor(X_train, dtype=torch.float32)\n",
+    "    X_test = torch.tensor(X_test, dtype=torch.float32)\n",
+    "    Y_train = np.transpose(Y_train)  # Important: Y must be a single row (1 x N) for GP training\n",
+    "    Y_train = torch.tensor(Y_train, dtype=torch.float32)\n",
+    "    Y_test = np.transpose(Y_test)\n",
+    "    Y_test = torch.tensor(Y_test, dtype=torch.float32)\n",
+    "\n",
+    "    return X_train, X_test, Y_train, Y_test, scalerX_transform, scalerY_transform\n",
+    "\n",
+    "def train_gp(cluster_idx):\n",
+    "    best_observed_all_ei0 = []\n",
+    "    # Average over multiple trials\n",
+    "    for trial in range(1, model_input.n_trials + 1):\n",
+    "        t0 = time.monotonic()\n",
+    "        if model_input.random_seed == 'time':\n",
+    "            random_seed = int(t0)\n",
+    "        elif model_input.random_seed == 'iteration':\n",
+    "            random_seed = trial\n",
+    "        \n",
+    "        print(f\"\\n -------------------- Trial {trial:>2} of {model_input.n_trials} --------------------\\n\", end=\"\")\n",
+    "        best_observed0 = []\n",
+    "\n",
+    "        XX_desc = list(sample_dfs[cluster_idx].columns[:-1])\n",
+    "        YY_desc = sample_dfs[cluster_idx].columns[-1]\n",
+    "        (\n",
+    "            X_train,\n",
+    "            X_test,\n",
+    "            Y_train,\n",
+    "            Y_test,\n",
+    "            scalerX, \n",
+    "            scalerY\n",
+    "        ) = create_train_test_data(sample_dfs[cluster_idx][XX_desc].to_numpy(), sample_dfs[cluster_idx][YY_desc].to_numpy(), random_seed)\n",
+    "        if trial == 1:\n",
+    "            # Dump the scalers to model output folder\n",
+    "            dump(scalerX, os.path.join(model_input.output_folder, f'scalerX_{cluster_idx}.joblib'))\n",
+    "            dump(scalerY, os.path.join(model_input.output_folder, f'scalerY_{cluster_idx}.joblib'))\n",
+    "        \n",
+    "        # Finding best value in initial data\n",
+    "        if model_input.maximization:\n",
+    "            best_observed_value = Y_train.max()\n",
+    "            optimal_solution = torch.cat([Y_train[0],Y_test[0]]).max()\n",
+    "        else:\n",
+    "            best_observed_value = Y_train.min()\n",
+    "            optimal_solution = torch.cat([Y_train[0],Y_test[0]]).min()\n",
+    "        \n",
+    "        # If the optimal value is present in the initial dataset sample, remove it \n",
+    "        if (best_observed_value.eq(optimal_solution)) and model_input.maximization:\n",
+    "            print('Max in training set, removing it before training models.')\n",
+    "            optimal_position = torch.argmax(Y_train)\n",
+    "            \n",
+    "            # Add max value to test/exploration set\n",
+    "            X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n",
+    "            X_test = torch.cat([X_test,X_add_toTest])\n",
+    "            Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n",
+    "            Y_test = torch.cat((Y_test,Y_add_toTest),1)\n",
+    "            \n",
+    "            # Remove max value from training set\n",
+    "            X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n",
+    "            Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n",
+    "            \n",
+    "            # Update best observed value\n",
+    "            best_observed_value = Y_train.max()\n",
+    "        \n",
+    "        elif (best_observed_value.eq(optimal_solution)) and not model_input.maximization:\n",
+    "            print('Min in training set, removing it before training models.')\n",
+    "            optimal_position = torch.argmin(Y_train)\n",
+    "            \n",
+    "            # Add min value to test/exploration set\n",
+    "            X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n",
+    "            X_test = torch.cat([X_test,X_add_toTest])\n",
+    "            Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n",
+    "            Y_test = torch.cat((Y_test,Y_add_toTest),1)\n",
+    "            \n",
+    "            # Remove min value from training set\n",
+    "            X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n",
+    "            Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n",
+    "            \n",
+    "            # Update best observed value\n",
+    "            best_observed_value = Y_train.min()\n",
+    "        \n",
+    "        # Initialize data for training gp-0 and gp-l models\n",
+    "        X_train0, Y_train0, X_test0, Y_test0 = X_train, Y_train, X_test, Y_test\n",
+    "        \n",
+    "        n_batch = model_input.n_batch_perTrial\n",
+    "        \n",
+    "        # Initialize likelihood, GP model and acquisition function for the models\n",
+    "        #--------------------------- GP-0 ---------------------------#\n",
+    "        likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()\n",
+    "        model_gp0 = surrogate_models.ExactGPModel(X_train0, Y_train0, likelihood_gp0) \n",
+    "        AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_observed_value, maximize=model_input.maximization)\n",
+    "        best_observed0.append(best_observed_value)  # Appending to best_observed list for the given trial\n",
+    "        \n",
+    "        # run N_BATCH rounds of BayesOpt after the initial random batch\n",
+    "        for iteration in range(1, n_batch + 1):\n",
+    "\n",
+    "            if ((iteration-1)%model_input.n_update==0):\n",
+    "                # re-fit the GP hyperparameters every n_update iterations\n",
+    "                model_gp0, likelihood_gp0 = surrogate_models.train_surrogate_gp0(X_train0, Y_train0)\n",
+    "            \n",
+    "            # optimize and get new observation using acquisition function\n",
+    "            new_x0, new_y0, index, X_test_new0, Y_test_new0 = optimize_acqf_and_get_observation(AcqFunc_0, X_test0, Y_test0)\n",
+    "            \n",
+    "            # Update remaining choices tensor\n",
+    "            X_test0 = X_test_new0\n",
+    "            Y_test0 = Y_test_new0\n",
+    "\n",
+    "            # Update training points\n",
+    "            X_train0 = torch.cat([X_train0, new_x0])\n",
+    "            Y_train0 = torch.cat([Y_train0[0], new_y0[0]])\n",
+    "            Y_train0 = torch.reshape(Y_train0,(1,Y_train0.shape[0]))\n",
+    "\n",
+    "            # update progress\n",
+    "            if model_input.maximization:\n",
+    "                best_value_ei0 = Y_train0.max()\n",
+    "            elif not model_input.maximization:\n",
+    "                best_value_ei0 = Y_train0.min()\n",
+    "            best_observed0.append(best_value_ei0)\n",
+    "\n",
+    "            # AcqFunc_0 = UpperConfidenceBound(model_gp0, beta=0.1) \n",
+    "            AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_value_ei0, maximize=model_input.maximization)\n",
+    "            \n",
+    "            if model_input.verbose:\n",
+    "                print(\n",
+    "                    f\"\\nBatch {iteration:>2}: best_value (GP-0) = \",\n",
+    "                    f\"{best_value_ei0:>4.2f}\",\n",
+    "                    end=\"\",)\n",
+    "\n",
+    "        t1 = time.monotonic()\n",
+    "        \n",
+    "        print(f\"time = {t1-t0:>4.2f}.\")\n",
+    "        # Appending to common list of best observed values, with number of rows equal to number of trials\n",
+    "        best_observed_all_ei0.append(best_observed0) \n",
+    "    return best_observed_all_ei0"
+   ]
+  },
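+  {
+   "cell_type": "markdown",
+   "id": "note-convergence",
+   "metadata": {},
+   "source": [
+    "*Added note (sketch):* `train_gp` returns one best-observed trace per trial, each of length `n_batch + 1`, so the convergence of a single cluster's GP can be inspected with a plot like the following (names as defined above; the call to `train_gp(0)` is illustrative):\n",
+    "\n",
+    "```python\n",
+    "traces = train_gp(0)  # trials x iterations list of best-observed values\n",
+    "for t, trace in enumerate(traces):\n",
+    "    plt.plot([float(v) for v in trace], label=f'trial {t+1}')\n",
+    "plt.xlabel('BO iteration'); plt.ylabel('best observed'); plt.legend()\n",
+    "```"
+   ]
+  },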
+  {
+   "cell_type": "markdown",
+   "id": "17fbf21d",
+   "metadata": {},
+   "source": [
+    "#### Main Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "bfb03e8a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py:702: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n",
+      "  warnings.warn(\n",
+      "\n",
+      "KeyboardInterrupt\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Error in callback > (for post_execute):\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "KeyboardInterrupt\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "warnings.filterwarnings(\"ignore\", category=BadInitialCandidatesWarning)\n",
+    "warnings.filterwarnings(\"ignore\", category=RuntimeWarning)\n",
+    "\n",
+    "# Create a new directory if it does not exist\n",
+    "isExist = os.path.exists(model_input.output_folder)\n",
+    "if not isExist:\n",
+    "    os.makedirs(model_input.output_folder)\n",
+    "    print(\"The new directory is created!\", model_input.output_folder)\n",
+    "    \n",
+    "# Commented out by NKT\n",
+    "# # Copy input parameters file to output folder\n",
+    "# shutil.copy2('surrogate_model_inputs.py',model_input.output_folder)\n",
+    "# Copy surrogate model file to output folder\n",
+    "shutil.copy2('surrogate_models.py',model_input.output_folder)\n",
+    "\n",
+    "# Training a single GP for test\n",
+    "# a = train_gp(0)\n",
+    "\n",
+    "# Train the cluster of GP models in a parallel for loop\n",
+    "best_observed_all_ei0 = Parallel(n_jobs=-1)(\n",
+    "    delayed(train_gp)(i) for i in range(num_cluster)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ddae264",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Epsilon-greedy method (sketch): choose among the local GPs' BO recommendations.\n",
+    "# Assumes `best_observed_all_ei0` from the parallel loop above; `epsilon` is a\n",
+    "# placeholder exploration rate.\n",
+    "epsilon = 0.1\n",
+    "\n",
+    "# Best value found by each cluster's GP (last entry of each trial's trace)\n",
+    "cluster_best = [max(float(trace[-1]) for trace in cluster) for cluster in best_observed_all_ei0]\n",
+    "\n",
+    "random_number = np.random.rand()\n",
+    "# Explore using the epsilon-greedy exploration strategy\n",
+    "if random_number <= epsilon:\n",
+    "    # Explore: randomly pick one cluster's BO recommendation\n",
+    "    best_observed = cluster_best[np.random.randint(num_cluster)]\n",
+    "else:\n",
+    "    # Exploit best known action: the best output from the BO recommendations of the local GPs\n",
+    "    best_observed = max(cluster_best)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
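A follow-on sketch (not part of the diff): the per-cluster scalers dumped by `train_gp` (the `scalerY_{i}.joblib` binaries added above) can be loaded back to return standardized GP predictions to physical units. The fitted `model`, `likelihood`, and query tensor are assumed to come from the notebook's earlier cells.

```python
# Hedged sketch: undo the per-cluster label standardization on GP predictions
import os
from joblib import load
import code_inputs as model_input

cluster_idx = 0
scalerY = load(os.path.join(model_input.output_folder, f'scalerY_{cluster_idx}.joblib'))
# mean, var = surrogate_models.predict_surrogates(model, likelihood, X_query)
# y_physical = scalerY.inverse_transform(mean.reshape(-1, 1).numpy())
```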
diff --git a/src/code_inputs.py b/src/code_inputs.py
new file mode 100644
index 0000000..e726263
--- /dev/null
+++ b/src/code_inputs.py
@@ -0,0 +1,66 @@
+# This file contains surrogate model inputs
+import numpy as np
+import os
+import json
+import math
+
+# Torch specific module imports
+import torch
+import gpytorch
+from torch import nn
+from torch.utils.data import DataLoader, Dataset
+from torchvision import datasets, transforms
+from torch.nn import functional as F
+
+# botorch specific modules
+from botorch.fit import fit_gpytorch_model
+from botorch.models.gpytorch import GPyTorchModel
+
+# Plotting libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+# User defined python classes and files
+import sys
+
+import utils_dataset as utilsd
+import input_class
+
+np.random.seed(0)
+torch.manual_seed(0)
+
+# General inputs
+run_folder = '/Users/nikhilthota/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery'  # Folder where code is run and input json exist
+num_run = 3
+test_size = 0.01
+output_folder = run_folder+'/bo_output/'  # Folder where all outputs are stored
+output_folder = output_folder+'test'
+verbose = True
+deep_verbose = False
+
+# Reading and data processing inputs
+add_target_noise = False
+standardize_data = True
+
+# Feature selection inputs
+test_size_fs = 0.1
+select_features_otherModels = False
+
+# BO inputs
+n_trials = 5
+n_update = 1000
+GP_0_BO = True
+GP_L_BO = True
+GP_NN_BO = False
+random_seed = 'iteration'
+maximization = True
+new_values_predict_from_model = False
+n_batch_perTrial = 100
+
+# Surrogate training boolean inputs
+train_GP = True
+
+# GP Model parameters
+kernel = 'Matern'
+learning_rate_gp0 = 0.01
+epochs_GP0 = 500
diff --git a/src/feature_selection_methods.py b/src/feature_selection_methods.py
new file mode 100644
index 0000000..f0e4c37
--- /dev/null
+++ b/src/feature_selection_methods.py
@@ -0,0 +1,78 @@
+import numpy as np
+import csv
+import copy
+import random
+import pandas as pd
+import scipy
+import sklearn as sk
+
+import xgboost as xgb
+from xgboost.sklearn import XGBRegressor
+from sklearn.metrics import mean_squared_error
+
+# User defined files and classes
+import utils_dataset as utilsd
+
+# sklearn functions
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import Lasso
+
+
+class feature_selection_algorithms:
+
+    def __init__(self,XX,YY,test_size=0.33,random_state=42):
+
+        # Train Data
+        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(XX, YY, test_size=test_size, random_state=random_state)
+
+    # XGBoost
+    def xgboost(self, **kwargs):
+
+        clf = XGBRegressor(n_estimators=100, learning_rate=0.025, max_depth=20, verbosity=0, booster='gbtree',
+                           reg_alpha=np.exp(-6.788644799030888), reg_lambda=np.exp(-7.450413274554533),
+                           gamma=np.exp(-5.374463422208394), subsample=0.5, objective= 'reg:squarederror', n_jobs=1)
+
+        paras = clf.get_params()
+
+        clf.fit(self.X_train, self.y_train)
+        return clf
+
+    # Features selected by XGBoost
+    def selected_features_xgboost(self, descriptors, deep_verbose=False):
+
+        clf = self.xgboost()
+        score = clf.score(self.X_train, self.y_train)
+        if deep_verbose:
+            print("XGBoost Training score: ", score)
+
+        scores = cross_val_score(clf, self.X_train, self.y_train, cv=10)
+        if deep_verbose:
+            print("XGBoost Mean cross-validation score: %.2f" % scores.mean())
+
+        ypred = clf.predict(self.X_test)
+        mse = mean_squared_error(self.y_test, ypred)
+        if deep_verbose:
+            print("XGBoost MSE: %.2f" % mse)
+            print("XGBoost RMSE: %.2f" % (mse**(1/2.0)))
+
+        f_importance = clf.get_booster().get_score(importance_type='gain')
+        feature_importance_dict = {}
+
+        for f,value in f_importance.items():
+            feature_index = int(f.split('f')[1])
+            feature_importance_dict[descriptors[feature_index]] = value
+            if deep_verbose:
+                print(f"Column: {feature_index}, descriptor: {descriptors[feature_index]}")
+
+        return feature_importance_dict.keys()
+
+
+if __name__=="__main__":
+
+    print('Feature selection methods are in this class')
\ No newline at end of file
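A hedged follow-on sketch: `selected_features_xgboost` returns the gain-importance dict's keys in insertion order; if a ranked subset is wanted instead, the gains can be sorted explicitly. Here `XX`, `YY`, and `descriptors` are assumed to come from `input_class.inputs.read_inputs`, and the top-10 cut is an arbitrary illustration.

```python
# Sketch: rank descriptors by XGBoost gain importance (illustrative only)
import feature_selection_methods as feature_selection

fs = feature_selection.feature_selection_algorithms(XX, YY)
gain = fs.xgboost().get_booster().get_score(importance_type='gain')
ranked = sorted(gain.items(), key=lambda kv: kv[1], reverse=True)
top_descriptors = [descriptors[int(f[1:])] for f, _ in ranked[:10]]  # 'f12' -> column 12
```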
diff --git a/src/input_class.py b/src/input_class.py
new file mode 100644
index 0000000..51ee96d
--- /dev/null
+++ b/src/input_class.py
@@ -0,0 +1,84 @@
+import sklearn
+import numpy as np
+import csv
+import copy
+import random
+import pandas as pd
+import pickle
+import json
+import openpyxl
+import itertools
+
+# User defined files and classes
+import feature_selection_methods as feature_selection
+import utils_dataset as utilsd
+
+# Plotting libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+# Tick parameters
+plt.rcParams['xtick.labelsize'] = 15
+plt.rcParams['ytick.labelsize'] = 15
+plt.rcParams['xtick.major.size'] = 5
+plt.rcParams['xtick.major.width'] = 1
+plt.rcParams['xtick.minor.size'] = 5
+plt.rcParams['xtick.minor.width'] = 1
+plt.rcParams['ytick.major.size'] = 5
+plt.rcParams['ytick.major.width'] = 1
+plt.rcParams['ytick.minor.size'] = 5
+plt.rcParams['ytick.minor.width'] = 1
+
+plt.rcParams['axes.labelsize'] = 20
+plt.rcParams['axes.titlesize'] = 20
+plt.rcParams['legend.fontsize'] = 15
+
+
+class inputs:
+    def __init__(self,input_type='COF',input_path='../',input_file='properties.csv'):
+        self.input_type = input_type
+        self.input_path = input_path
+        self.input_file = input_file
+        self.filename = self.input_path + self.input_file
+        self.data = pd.read_csv(self.filename)
+
+    def read_inputs(self):
+        '''
+        This function reads the dataset from the COF paper: https://pubs.acs.org/doi/10.1021/acs.chemmater.8b01425
+        input_type='COF',
+        input_path='.',
+        input_file='properties.csv'
+        '''
+#        XX_comp = data.iloc[:,38:45]
+        descriptors = ['dimensions', ' bond type', ' void fraction [widom]', ' supercell volume [A^3]', ' density [kg/m^3]',
+                       ' heat desorption high P [kJ/mol]',' absolute methane uptake high P [molec/unit cell]',
+                       ' absolute methane uptake high P [mol/kg]', ' excess methane uptake high P [molec/unit cell]',
+                       ' excess methane uptake high P [mol/kg]', ' heat desorption low P [kJ/mol]',
+                       ' absolute methane uptake low P [molec/unit cell]',
+                       ' absolute methane uptake low P [mol/kg]',
+                       ' excess methane uptake low P [molec/unit cell]',
+                       ' excess methane uptake low P [mol/kg]', ' surface area [m^2/g]', ' linkerA', ' linkerB', ' net',
+                       ' cell_a [A]', ' cell_b [A]', ' cell_c [A]', ' alpha [deg]', ' beta [deg]', ' gamma [deg]',
+                       ' num carbon', ' num fluorine', ' num hydrogen', ' num nitrogen', ' num oxygen', ' num sulfur',
+                       ' num silicon', ' vertices', ' edges', ' genus', ' largest included sphere diameter [A]',
+                       ' largest free sphere diameter [A]', ' largest included sphere along free sphere path diameter [A]',
+                       ' absolute methane uptake high P [v STP/v]', ' absolute methane uptake low P [v STP/v]']
+        XX_prop = pd.DataFrame(self.data, columns=descriptors)
+        target = copy.deepcopy(self.data[' deliverable capacity [v STP/v]'].to_numpy())
+        YY = target.reshape(-1,1)
+
+        return XX_prop, YY, descriptors
+
+    def get_comp(self):
+        XX_comp_df = self.data.iloc[:,38:45]
+        YY = self.data.iloc[:, 27]
+        # YY_df = pd.DataFrame(YY)
+        # YY = copy.deepcopy(self.data[' deliverable capacity [v STP/v]'].to_numpy())
+        return XX_comp_df, YY
+
+
+
+if __name__=="__main__":
+
+    print('Reading inputs')
\ No newline at end of file
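A hedged sketch related to the clustering module that follows: `kmeans.py` imports `silhouette_samples` but fixes `k=3`, so the number of clusters could instead be picked from the silhouette score. `XX_comp_df` is assumed to be the composition DataFrame from `inputs.get_comp()`, and the `range(2, 8)` scan is an arbitrary illustration.

```python
# Sketch: choose k for k_means via silhouette score rather than hardcoding k=3
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

scores = {}
for k in range(2, 8):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(XX_comp_df)
    scores[k] = silhouette_score(XX_comp_df, labels)
best_k = max(scores, key=scores.get)  # k with the highest mean silhouette
```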
diff --git a/src/kmeans.py b/src/kmeans.py
new file mode 100644
index 0000000..faf0992
--- /dev/null
+++ b/src/kmeans.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn import metrics
+from sklearn.metrics import silhouette_samples
+
+def k_means(X, y, k=3):
+    """
+    Perform k-means clustering on the given input X and label y
+    """
+
+    # Perform k-means clustering
+    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
+    labels = kmeans.labels_
+    X['cluster'] = labels  # note: adds the label column to the caller's DataFrame in place
+    data = pd.concat([X, y], axis=1)
+    clustered_dfs = [data[data['cluster'] == i].drop('cluster', axis=1) for i in range(k)]
+
+    return clustered_dfs
+
+def draw_samples(clustered_dfs, sample_fraction = 0.01, random_state = 42):
+    sample_dfs = []
+    for df in clustered_dfs:
+        sample_size = int(len(df) * sample_fraction)
+        sample_dfs.append(df.sample(n=sample_size, random_state=random_state))
+    return sample_dfs
+
+def concat(dfs_list):
+    return pd.concat(dfs_list, ignore_index=True)
\ No newline at end of file
diff --git a/src/surrogate_models.py b/src/surrogate_models.py
new file mode 100644
index 0000000..a96e173
--- /dev/null
+++ b/src/surrogate_models.py
@@ -0,0 +1,95 @@
+# This file constructs surrogate models for the input datasets
+import numpy as np
+import os
+import json
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+import math
+
+# Torch specific module imports
+import torch
+import gpytorch
+
+# botorch specific modules
+from botorch.fit import fit_gpytorch_model
+from botorch.models.gpytorch import GPyTorchModel
+
+# Plotting libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+# User defined python classes and files
+import utils_dataset as utilsd
+import input_class
+import code_inputs as model_input
+
+np.random.seed(0)
+torch.manual_seed(0)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using {device} device")
+
+# We will use the simplest form of GP model, exact inference
+class ExactGPModel(gpytorch.models.ExactGP, GPyTorchModel):
+    _num_outputs = 1  # to inform GPyTorchModel API
+    MIN_INFERRED_NOISE_LEVEL = 1e-5
+
+    def __init__(self, train_x, train_y, likelihood):
+        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
+        self.mean_module = gpytorch.means.ConstantMean()
+        if model_input.kernel == 'RBF':
+            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
+        elif model_input.kernel == 'Matern':
+            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5))
+
+    def forward(self, x):
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+
+#--------------------------- GP-0 ---------------------------#
+def train_surrogate_gp0(X_train, Y_train):
+
+    mse_gp0 = 0.0
+    training_iter = model_input.epochs_GP0
+
+    # initialize likelihood and model
+    likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()
+    model_gp0 = ExactGPModel(X_train, Y_train, likelihood_gp0)
+
+    # Find optimal model hyperparameters
+    model_gp0.train()
+    likelihood_gp0.train()
+
+    # Use the adam optimizer
+    optimizer = torch.optim.Adam(model_gp0.parameters(), lr=model_input.learning_rate_gp0)  # Includes GaussianLikelihood parameters
+
+    # "Loss" for GPs - the marginal log likelihood
+    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood_gp0, model_gp0)
+
+    for i in range(training_iter):
+        optimizer.zero_grad()          # Zero gradients from previous iteration
+        output = model_gp0(X_train)    # Output from model
+        loss = -mll(output, Y_train)   # Calc loss and backprop gradients
+        loss.backward()
+        optimizer.step()
+
+    return model_gp0, likelihood_gp0
+
+def predict_surrogates(model, likelihood, X):
+
+    # Get into evaluation (predictive posterior) mode
+    model.eval()
+    likelihood.eval()
+
+    # Make predictions by passing the model posterior through the likelihood
+    # (the likelihood adds the inferred observation noise to the variance)
+    with torch.no_grad(), gpytorch.settings.fast_pred_var():
+        prediction = likelihood(model(X))
+
+    observed_mean = prediction.mean
+    observed_var = prediction.variance
+
+    return observed_mean, 
observed_var + + \ No newline at end of file diff --git a/src/test.ipynb b/src/test.ipynb new file mode 100644 index 0000000..8d0862b --- /dev/null +++ b/src/test.ipynb @@ -0,0 +1,143 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dde8a292", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2234ceda", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from botorch.models import SingleTaskGP\n", + "from botorch.fit import fit_gpytorch_mll\n", + "from botorch.utils import standardize\n", + "from gpytorch.mlls import ExactMarginalLogLikelihood\n", + "\n", + "train_X = torch.rand(10, 2)\n", + "Y = 1 - torch.linalg.norm(train_X - 0.5, dim=-1, keepdim=True)\n", + "Y = Y + 0.1 * torch.randn_like(Y) # add some noise\n", + "train_Y = standardize(Y)\n", + "\n", + "gp = SingleTaskGP(train_X, train_Y)\n", + "mll = ExactMarginalLogLikelihood(gp.likelihood, gp)\n", + "fit_gpytorch_mll(mll)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4524f72", + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import Parallel, delayed\n", + "\n", + "def compute_sq(x):\n", + " return x**2\n", + "\n", + "# Time the computation of the squares of the numbers 0 to 9\n", + "\n", + "%timeit -n 10 Parallel(n_jobs=-1, backend='loky', verbose=0)(delayed(compute_sq)(x) for x in range(10))\n", + "%timeit -n 10 [compute_sq(x) for x in range(10)]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92f541d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the inputs\n", + "\n", + "from utils_dataset import generate_training_data\n", + "from input_class import inputs\n", + "\n", + "# X_train, X_test, Y_train, Y_test, Var_train, Var_test, scalerX_transform, scalerY_transform = generate_training_data(0, 0.1)\n", + "\n", + "input_obj = inputs()\n", + "XX, YY, descriptors = input_obj.read_inputs()\n", + "\n", + "print(XX)\n", + "print(YY)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "97d372f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using cpu device\n", + "MultivariateNormal(loc: torch.Size([2]))\n", + "tensor([1.0886, 1.1504], dtype=torch.float64, grad_fn=)\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "grad can be implicitly created only for scalar outputs", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m XX \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mrand(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m2\u001b[39m))\n\u001b[1;32m 6\u001b[0m YY \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mrand(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m1\u001b[39m))\n\u001b[0;32m----> 8\u001b[0m \u001b[43mtrain_surrogate_gp0\u001b[49m\u001b[43m(\u001b[49m\u001b[43mXX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mYY\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery/src/surrogate_models.py:75\u001b[0m, in \u001b[0;36mtrain_surrogate_gp0\u001b[0;34m(X_train, Y_train)\u001b[0m\n\u001b[1;32m 
73\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39mmll(output, Y_train) \u001b[38;5;66;03m# Calc loss and backprop gradients \u001b[39;00m\n\u001b[1;32m 74\u001b[0m \u001b[38;5;28mprint\u001b[39m(loss) \n\u001b[0;32m---> 75\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 76\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model_gp0, likelihood_gp0\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/autograd/__init__.py:259\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 250\u001b[0m inputs \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 251\u001b[0m (inputs,)\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(inputs, (torch\u001b[38;5;241m.\u001b[39mTensor, graph\u001b[38;5;241m.\u001b[39mGradientEdge))\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m()\n\u001b[1;32m 256\u001b[0m )\n\u001b[1;32m 258\u001b[0m grad_tensors_ \u001b[38;5;241m=\u001b[39m _tensor_or_tensors_to_tuple(grad_tensors, \u001b[38;5;28mlen\u001b[39m(tensors))\n\u001b[0;32m--> 259\u001b[0m grad_tensors_ \u001b[38;5;241m=\u001b[39m \u001b[43m_make_grads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_grads_batched\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m retain_graph \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/autograd/__init__.py:132\u001b[0m, in 
\u001b[0;36m_make_grads\u001b[0;34m(outputs, grads, is_grads_batched)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out\u001b[38;5;241m.\u001b[39mrequires_grad:\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out\u001b[38;5;241m.\u001b[39mnumel() \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 133\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgrad can be implicitly created only for scalar outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 134\u001b[0m )\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m out\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mis_floating_point:\n\u001b[1;32m 136\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 137\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgrad can be implicitly created only for real scalar outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mout\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 139\u001b[0m )\n", + "\u001b[0;31mRuntimeError\u001b[0m: grad can be implicitly created only for scalar outputs" + ] + } + ], + "source": [ + "from surrogate_models import train_surrogate_gp0\n", + "import numpy as np\n", + "import torch\n", + "\n", + "XX = torch.tensor(np.random.rand(2, 2))\n", + "YY = torch.tensor(np.random.rand(2, 1))\n", + "\n", + "train_surrogate_gp0(XX, YY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07d11c64", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/utils_dataset.py b/src/utils_dataset.py new file mode 100644 index 0000000..5015a41 --- /dev/null +++ b/src/utils_dataset.py @@ -0,0 +1,127 @@ +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from torch.utils.data import Dataset, random_split, DataLoader + +import torch +import json +import pandas as pd +import numpy as np + +# User defined python classes and files +import utils_dataset as utilsd +import input_class +import code_inputs as model_input +import feature_selection_methods as feature_selection + +# Add slicing of the input XX tensor with additional input for the columns picked out by XGBoost or other feature selection methods +class InputDataset(Dataset): + """ Input dataset used for training """ + + def __init__(self, XX, YY, Var=None, transform=None): + """ + Args: + XX: NN Input features vector as a torch tensor + YY: NN Labels vector as a torch tensor + descriptors(list of strings): Names of the input features + transform (callable, optional): Optional transform to be applied + on a sample. 
+ """ + self.XX = XX + self.YY = YY + self.var = Var + self.transform = transform + + def __len__(self): + return self.XX.shape[0] + + def __getitem__(self, idx): + if torch.is_tensor(idx): + idx = idx.tolist() + + x = self.XX[idx,:] + y = self.YY[:,idx] + if self.var != None: + var = self.var[idx] + item = {'in_features':x,'labels':y,'variance':var} + else: + item = {'in_features':x,'labels':y} + + return item + + +def standardize_data(x): + scalerX = StandardScaler().fit(x) + x_train = scalerX.transform(x) + return x_train, scalerX + +def standardize_test_data(x,scalerX): + x_test = scalerX.transform(x) + return x_test + +def generate_training_data(random_state,test_size): + + # todo: Modify below to read from code_inputs.py + # # Reading the input json file with dataset filename and path information + # with open(model_input.run_folder+'inputs.json', "r") as f: + # input_dict = json.load(f) + + # input_type = input_dict['InputType'] + # input_path = input_dict['InputPath'] + # input_file = input_dict['InputFile'] + # add_target_noise = input_dict['AddTargetNoise'] + + # input = input_class.inputs(input_type=input_type, + # input_path=input_path, + # input_file=input_file, + # add_target_noise=add_target_noise) + + input_obj = input_class.inputs() + XX, YY, descriptors = input_obj.read_inputs() + + # Transforming datasets by standardization + if model_input.standardize_data: + X_stand, scalerX_transform = utilsd.standardize_data(XX) + Y_stand, scalerY_transform = utilsd.standardize_data(YY) + else: + X_stand = XX.to_numpy() + Y_stand = YY + + # Checking if we should use xgboost recommended descriptors or all descriptors + if model_input.select_features_otherModels: + fs = feature_selection.feature_selection_algorithms(X_stand,Y_stand, + test_size=model_input.test_size_fs, + random_state=random_state) + xg_boost_descriptors = fs.selected_features_xgboost(descriptors) + if model_input.verbose: + print('Selected Features, ', xg_boost_descriptors) + else: + xg_boost_descriptors = descriptors + + XX = pd.DataFrame(XX, columns=xg_boost_descriptors) + if model_input.standardize_data: + X_stand, scalerX_transform = utilsd.standardize_data(XX) + else: + X_stand=XX.to_numpy() + + # Creating train-test split in data + X_train, X_test, Y_train, Y_test = train_test_split(X_stand, Y_stand, + test_size=test_size, + random_state=random_state) #,stratify=Y_stand) + + Var_train = torch.ones(len(Y_train)) + Var_test = torch.ones(len(Y_test)) + + # Converting data arrays to torch tensors + X_train = torch.tensor(X_train).to(torch.float32) + Y_train = np.transpose(Y_train) # Ytrain has to have only one row for GP training + Y_train = torch.tensor(Y_train).to(torch.float32) + + X_test = torch.tensor(X_test).to(torch.float32) + Y_test = np.transpose(Y_test) # Ytrain has to have only one row for GP training + Y_test = torch.tensor(Y_test).to(torch.float32) + + if model_input.standardize_data: + return X_train, X_test, Y_train, Y_test, Var_train, Var_test, scalerX_transform, scalerY_transform + else: + return X_train, X_test, Y_train, Y_test, Var_train, Var_test + diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 4356ad3..0000000 --- a/test.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "dde8a292", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - 
"name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}