diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..84ca336
Binary files /dev/null and b/.DS_Store differ
diff --git a/bo_output/.DS_Store b/bo_output/.DS_Store
new file mode 100644
index 0000000..1ce14b0
Binary files /dev/null and b/bo_output/.DS_Store differ
diff --git a/bo_output/test/scalerX_0.joblib b/bo_output/test/scalerX_0.joblib
new file mode 100644
index 0000000..86a99f4
Binary files /dev/null and b/bo_output/test/scalerX_0.joblib differ
diff --git a/bo_output/test/scalerX_1.joblib b/bo_output/test/scalerX_1.joblib
new file mode 100644
index 0000000..9fa980c
Binary files /dev/null and b/bo_output/test/scalerX_1.joblib differ
diff --git a/bo_output/test/scalerX_2.joblib b/bo_output/test/scalerX_2.joblib
new file mode 100644
index 0000000..a4e2b81
Binary files /dev/null and b/bo_output/test/scalerX_2.joblib differ
diff --git a/bo_output/test/scalerY_0.joblib b/bo_output/test/scalerY_0.joblib
new file mode 100644
index 0000000..d20bc11
Binary files /dev/null and b/bo_output/test/scalerY_0.joblib differ
diff --git a/bo_output/test/scalerY_1.joblib b/bo_output/test/scalerY_1.joblib
new file mode 100644
index 0000000..78f3131
Binary files /dev/null and b/bo_output/test/scalerY_1.joblib differ
diff --git a/bo_output/test/scalerY_2.joblib b/bo_output/test/scalerY_2.joblib
new file mode 100644
index 0000000..ee98cec
Binary files /dev/null and b/bo_output/test/scalerY_2.joblib differ
diff --git a/bo_output/test/surrogate_models.py b/bo_output/test/surrogate_models.py
new file mode 100644
index 0000000..a96e173
--- /dev/null
+++ b/bo_output/test/surrogate_models.py
@@ -0,0 +1,95 @@
+# This file constructs surrogate models for the input datasets
+import numpy as np
+import os
+import json
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+import math
+
+# Torch specific module imports
+import torch
+import gpytorch
+
+# botorch specific modules
+from botorch.fit import fit_gpytorch_model
+from botorch.models.gpytorch import GPyTorchModel
+
+# Plotting libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+# User defined python classes and files
+import utils_dataset as utilsd
+import input_class
+import code_inputs as model_input
+
+np.random.seed(0)
+torch.manual_seed(0)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using {device} device")
+
+# We will use the simplest form of GP model, exact inference
+class ExactGPModel(gpytorch.models.ExactGP, GPyTorchModel):
+    _num_outputs = 1  # to inform GPyTorchModel API
+    MIN_INFERRED_NOISE_LEVEL = 1e-5
+
+    def __init__(self, train_x, train_y, likelihood):
+        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
+        self.mean_module = gpytorch.means.ConstantMean()
+        if model_input.kernel == 'RBF':
+            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
+        elif model_input.kernel == 'Matern':
+            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5))
+
+    def forward(self, x):
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+
+#--------------------------- GP-0 ---------------------------#
+def train_surrogate_gp0(X_train, Y_train):
+
+    mse_gp0 = 0.0
+    training_iter = model_input.epochs_GP0
+
+    # initialize likelihood and model
+    likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()
+    model_gp0 = ExactGPModel(X_train, Y_train, likelihood_gp0)
+
+    # Find optimal model hyperparameters
+    model_gp0.train()
+    likelihood_gp0.train()
+
+    # Use the adam optimizer
+    optimizer = torch.optim.Adam(model_gp0.parameters(), lr=model_input.learning_rate_gp0)  # Includes GaussianLikelihood parameters
+
+    # "Loss" for GPs - the marginal log likelihood
+    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood_gp0, model_gp0)
+
+    for i in range(training_iter):
+        optimizer.zero_grad()          # Zero gradients from previous iteration
+        output = model_gp0(X_train)    # Output from model
+        loss = -mll(output, Y_train)   # Calc loss and backprop gradients
+        loss.backward()
+        optimizer.step()
+
+    return model_gp0, likelihood_gp0
+
+def predict_surrogates(model, likelihood, X):
+
+    # Get into evaluation (predictive posterior) mode
+    model.eval()
+    likelihood.eval()
+
+    # Make predictions by passing the model posterior through the likelihood
+    # (the likelihood adds the inferred observation noise to the variance)
+    with torch.no_grad(), gpytorch.settings.fast_pred_var():
+        prediction = likelihood(model(X))
+
+    observed_mean = prediction.mean
+    observed_var = prediction.variance
+
+    return observed_mean, observed_var
\ No newline at end of file
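A minimal smoke test for the module above (a sketch, not part of the diff: the toy tensors are invented for illustration, and the module's `code_inputs` settings are assumed to be importable). Note the single-row (1 x N) label convention used throughout this PR:

```python
# Hedged usage sketch for surrogate_models.py (illustrative toy data only)
import torch
import surrogate_models

X_train = torch.rand(20, 7)   # 20 candidates, 7 composition features
Y_train = torch.rand(1, 20)   # labels as a single row (1 x N), per this PR's convention

model, likelihood = surrogate_models.train_surrogate_gp0(X_train, Y_train)
mean, var = surrogate_models.predict_surrogates(model, likelihood, torch.rand(5, 7))
print(mean.shape, var.shape)
```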
diff --git a/src/.DS_Store b/src/.DS_Store
new file mode 100644
index 0000000..e5b0917
Binary files /dev/null and b/src/.DS_Store differ
diff --git a/src/BO.ipynb b/src/BO.ipynb
new file mode 100644
index 0000000..68e4324
--- /dev/null
+++ b/src/BO.ipynb
@@ -0,0 +1,908 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a45452e9-567c-4658-bfa1-f9a6f6b70bd1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'dlopen(/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowIxEET_S2_S2_b\n",
+      "  Referenced from: /Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torchvision/image.so\n",
+      "  Expected in: /Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/torch/lib/libc10.dylib'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n",
+      "  warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using cpu device\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# This file constructs surrogate models for the input datasets\n",
+    "import numpy as np \n",
+    "import pandas as pd\n",
+    "import os\n",
+    "import shutil\n",
+    "import json\n",
+    "import math\n",
+    "import time\n",
+    "import warnings\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from joblib import Parallel, delayed, dump\n",
+    "\n",
+    "# Torch specific module imports\n",
+    "import torch\n",
+    "import gpytorch \n",
+    "\n",
+    "# botorch specific modules\n",
+    "from botorch.fit import fit_gpytorch_model\n",
+    "from botorch.models.gpytorch import GPyTorchModel\n",
+    "from botorch.optim import optimize_acqf, optimize_acqf_discrete\n",
+    "from botorch import fit_gpytorch_mll\n",
+    "from botorch.acquisition.monte_carlo import (\n",
+    "    qExpectedImprovement,\n",
+    "    qNoisyExpectedImprovement,\n",
+    ")\n",
+    "from botorch.sampling.normal import SobolQMCNormalSampler\n",
+    "from botorch.exceptions import BadInitialCandidatesWarning\n",
+    "from botorch.acquisition import UpperConfidenceBound, ExpectedImprovement\n",
+    "\n",
+    "# Plotting libraries\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "# Tick parameters\n",
+    "plt.rcParams['xtick.labelsize'] = 15\n",
+    "plt.rcParams['ytick.labelsize'] = 15\n",
+    "plt.rcParams['xtick.major.size'] = 5\n",
+    "plt.rcParams['xtick.major.width'] = 1\n",
+    "plt.rcParams['xtick.minor.size'] = 5\n",
+    "plt.rcParams['xtick.minor.width'] = 1\n",
+    "plt.rcParams['ytick.major.size'] = 5\n",
+    "plt.rcParams['ytick.major.width'] = 1\n",
+    "plt.rcParams['ytick.minor.size'] = 5\n",
+    "plt.rcParams['ytick.minor.width'] = 1\n",
+    "\n",
+    "plt.rcParams['axes.labelsize'] = 15\n",
+    "plt.rcParams['axes.titlesize'] = 15\n",
+    "plt.rcParams['legend.fontsize'] = 15\n",
+    "\n",
+    "# User defined python classes and files\n",
+    "import input_class \n",
+    "import code_inputs as model_input\n",
+    "import utils_dataset as utilsd\n",
+    "import surrogate_models\n",
+    "import kmeans as km\n",
+    "\n",
+    "# Set the random seeds\n",
+    "np.random.seed(0)\n",
+    "torch.manual_seed(0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8b29bdc5",
+   "metadata": {},
+   "source": [
+    "#### K means clustering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a8de62ac",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['dimensions',\n",
+       " ' bond type',\n",
+       " ' void fraction [widom]',\n",
+       " ' supercell volume [A^3]',\n",
+       " ' density [kg/m^3]',\n",
+       " ' heat desorption high P [kJ/mol]',\n",
+       " ' absolute methane uptake high P [molec/unit cell]',\n",
+       " ' absolute methane uptake high P [mol/kg]',\n",
+       " ' excess methane uptake high P [molec/unit cell]',\n",
+       " ' excess methane uptake high P [mol/kg]',\n",
+       " ' heat desorption low P [kJ/mol]',\n",
+       " ' absolute methane uptake low P [molec/unit cell]',\n",
+       " ' absolute methane uptake low P [mol/kg]',\n",
+       " ' excess methane uptake low P [molec/unit cell]',\n",
+       " ' excess methane uptake low P [mol/kg]',\n",
+       " ' surface area [m^2/g]',\n",
+       " ' linkerA',\n",
+       " ' linkerB',\n",
+       " ' net',\n",
+       " ' cell_a [A]',\n",
+       " ' cell_b [A]',\n",
+       " ' cell_c [A]',\n",
+       " ' alpha [deg]',\n",
+       " ' beta [deg]',\n",
+       " ' gamma [deg]',\n",
+       " ' num carbon',\n",
+       " ' num fluorine',\n",
+       " ' num hydrogen',\n",
+       " ' num nitrogen',\n",
+       " ' num oxygen',\n",
+       " ' num sulfur',\n",
+       " ' num silicon',\n",
+       " ' vertices',\n",
+       " ' edges',\n",
+       " ' genus',\n",
+       " ' largest included sphere diameter [A]',\n",
+       " ' largest free sphere diameter [A]',\n",
+       " ' largest included sphere along free sphere path diameter [A]',\n",
+       " ' absolute methane uptake high P [v STP/v]',\n",
+       " ' absolute methane uptake low P [v STP/v]']"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "Input = input_class.inputs(input_path='../')\n",
+    "XX_prop, YY, descriptors = Input.read_inputs()\n",
+    "descriptors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "147caa07",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num carbonnum fluorinenum hydrogennum nitrogennum oxygennum sulfurnum silicon
036002161447200
1360021614414400
243203601447200
3360014421621600
4360014421621600
........................
69835996057696000
698361020057648000
698371360076864000
6983818880115212812800
69839536028832000
\n", + "

69840 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", + "0 360 0 216 144 72 \n", + "1 360 0 216 144 144 \n", + "2 432 0 360 144 72 \n", + "3 360 0 144 216 216 \n", + "4 360 0 144 216 216 \n", + "... ... ... ... ... ... \n", + "69835 996 0 576 96 0 \n", + "69836 1020 0 576 48 0 \n", + "69837 1360 0 768 64 0 \n", + "69838 1888 0 1152 128 128 \n", + "69839 536 0 288 32 0 \n", + "\n", + " num sulfur num silicon \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "... ... ... \n", + "69835 0 0 \n", + "69836 0 0 \n", + "69837 0 0 \n", + "69838 0 0 \n", + "69839 0 0 \n", + "\n", + "[69840 rows x 7 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "XX_comp_df, YY_df = Input.get_comp()\n", + "XX_comp_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fe95ea1f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num carbonnum fluorinenum hydrogennum nitrogennum oxygennum sulfurnum silicondeliverable capacity [v STP/v]
01152010242560032145.407912
111520121625600095.429169
2768051200032168.557294
31128057619219200115.141985
475604861081800116.641903
...........................
692179207680000193.030421
6932304013922889600107.276586
694162001080360000176.029207
69516640102464051200164.708736
696230405761152000113.888695
\n", + "

697 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " num carbon num fluorine num hydrogen num nitrogen num oxygen \\\n", + "0 1152 0 1024 256 0 \n", + "1 1152 0 1216 256 0 \n", + "2 768 0 512 0 0 \n", + "3 1128 0 576 192 192 \n", + "4 756 0 486 108 18 \n", + ".. ... ... ... ... ... \n", + "692 1792 0 768 0 0 \n", + "693 2304 0 1392 288 96 \n", + "694 1620 0 1080 360 0 \n", + "695 1664 0 1024 640 512 \n", + "696 2304 0 576 1152 0 \n", + "\n", + " num sulfur num silicon deliverable capacity [v STP/v] \n", + "0 0 32 145.407912 \n", + "1 0 0 95.429169 \n", + "2 0 32 168.557294 \n", + "3 0 0 115.141985 \n", + "4 0 0 116.641903 \n", + ".. ... ... ... \n", + "692 0 0 193.030421 \n", + "693 0 0 107.276586 \n", + "694 0 0 176.029207 \n", + "695 0 0 164.708736 \n", + "696 0 0 113.888695 \n", + "\n", + "[697 rows x 8 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_cluster = 3\n", + "clustered_dfs = km.k_means(XX_comp_df, YY_df, num_cluster)\n", + "sample_dfs = km.draw_samples(clustered_dfs, sample_fraction = 0.01)\n", + "samples = km.concat(sample_dfs)\n", + "samples" + ] + }, + { + "cell_type": "markdown", + "id": "35ed4601", + "metadata": {}, + "source": [ + "#### Acquisition function " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "955bc734-96c5-4d3a-9325-920c041e256b", + "metadata": {}, + "outputs": [], + "source": [ + "## TODO: TO BE Check\n", + "bounds = torch.tensor([[-10.0], [12.0]])\n", + "\n", + "batch_size = 1\n", + "num_restarts= 10 \n", + "raw_samples = 512\n", + "\n", + "def optimize_acqf_and_get_observation(acq_func, X_test, Y_test):\n", + " \"\"\"Optimizes the acquisition function, and returns a new candidate\"\"\"\n", + " # optimize\n", + " candidates, _ = optimize_acqf_discrete(\n", + " acq_function=acq_func,\n", + " choices=X_test,\n", + " q=batch_size,\n", + " max_batch_size=2048,\n", + " num_restarts=num_restarts,\n", + " raw_samples=raw_samples, # used for intialization heuristic\n", + " options={\"batch_limit\": 5, \"maxiter\": 200},\n", + " unique=True\n", + " )\n", + " \n", + " print(candidates)\n", + " # observe new values\n", + " new_x = candidates.detach()\n", + " b = [1 if torch.all(X_test[i].eq(new_x)) else 0 for i in range(0,X_test.shape[0]) ]\n", + " b = torch.tensor(b).to(torch.int)\n", + " index = b.nonzero()[0][0]\n", + " new_y = torch.reshape(Y_test[0,index],(1,1))\n", + " \n", + " X_test_new = X_test[torch.arange(0, X_test.shape[0]) != index, ...]\n", + " Y_test_new = Y_test[..., torch.arange(0, Y_test.shape[1]) != index]\n", + " print(X_test_new)\n", + " print(Y_test_new)\n", + " \n", + " return new_x, new_y, index, X_test_new, Y_test_new" + ] + }, + { + "cell_type": "markdown", + "id": "f93f668d", + "metadata": {}, + "source": [ + "#### GP Train Function" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "72bb9112-1749-44fd-bc9d-7c0edb2e59a6", + "metadata": {}, + "outputs": [], + "source": [ + "def create_train_test_data(cluster_dataXX, cluster_dataYY, random_seed):\n", + " if model_input.standardize_data:\n", + " cluster_dataXX, scalerX_transform = utilsd.standardize_data(cluster_dataXX)\n", + " cluster_dataYY, scalerY_transform = utilsd.standardize_data(cluster_dataYY.reshape(-1,1))\n", + " else:\n", + " scalerX_transform = None\n", + " scalerY_transform = None\n", + " \n", + " ## TODO : Incase for feature selection\n", + " # ....\n", + " # ....\n", + " # ....\n", + "\n", + " # Create train and test sets\n", + " X_train, X_test, Y_train, Y_test = 
+  {
+   "cell_type": "markdown",
+   "id": "f93f668d",
+   "metadata": {},
+   "source": [
+    "#### GP Train Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "72bb9112-1749-44fd-bc9d-7c0edb2e59a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_train_test_data(cluster_dataXX, cluster_dataYY, random_seed):\n",
+    "    if model_input.standardize_data:\n",
+    "        cluster_dataXX, scalerX_transform = utilsd.standardize_data(cluster_dataXX)\n",
+    "        cluster_dataYY, scalerY_transform = utilsd.standardize_data(cluster_dataYY.reshape(-1,1))\n",
+    "    else:\n",
+    "        scalerX_transform = None\n",
+    "        scalerY_transform = None\n",
+    "    \n",
+    "    ## TODO: hook for feature selection, in case it is needed here\n",
+    "    # ....\n",
+    "    # ....\n",
+    "    # ....\n",
+    "\n",
+    "    # Create train and test sets\n",
+    "    X_train, X_test, Y_train, Y_test = train_test_split(cluster_dataXX, cluster_dataYY, test_size=model_input.test_size, random_state=random_seed)\n",
+    "\n",
+    "    # Convert to tensors\n",
+    "    X_train = torch.tensor(X_train, dtype=torch.float32)\n",
+    "    X_test = torch.tensor(X_test, dtype=torch.float32)\n",
+    "    Y_train = np.transpose(Y_train)  # Important: Y must be a single row (1 x N) for GP training\n",
+    "    Y_train = torch.tensor(Y_train, dtype=torch.float32)\n",
+    "    Y_test = np.transpose(Y_test)\n",
+    "    Y_test = torch.tensor(Y_test, dtype=torch.float32)\n",
+    "\n",
+    "    return X_train, X_test, Y_train, Y_test, scalerX_transform, scalerY_transform\n",
+    "\n",
+    "def train_gp(cluster_idx):\n",
+    "    best_observed_all_ei0 = []\n",
+    "    # Average over multiple trials\n",
+    "    for trial in range(1, model_input.n_trials + 1):\n",
+    "        t0 = time.monotonic()\n",
+    "        if model_input.random_seed == 'time':\n",
+    "            random_seed = int(t0)\n",
+    "        elif model_input.random_seed == 'iteration':\n",
+    "            random_seed = trial\n",
+    "        \n",
+    "        print(f\"\\n -------------------- Trial {trial:>2} of {model_input.n_trials} --------------------\\n\", end=\"\")\n",
+    "        best_observed0 = []\n",
+    "\n",
+    "        XX_desc = list(sample_dfs[cluster_idx].columns[:-1])\n",
+    "        YY_desc = sample_dfs[cluster_idx].columns[-1]\n",
+    "        (\n",
+    "            X_train,\n",
+    "            X_test,\n",
+    "            Y_train,\n",
+    "            Y_test,\n",
+    "            scalerX, \n",
+    "            scalerY\n",
+    "        ) = create_train_test_data(sample_dfs[cluster_idx][XX_desc].to_numpy(), sample_dfs[cluster_idx][YY_desc].to_numpy(), random_seed)\n",
+    "        if trial == 1:\n",
+    "            # Dump the scalers to model output folder\n",
+    "            dump(scalerX, os.path.join(model_input.output_folder, f'scalerX_{cluster_idx}.joblib'))\n",
+    "            dump(scalerY, os.path.join(model_input.output_folder, f'scalerY_{cluster_idx}.joblib'))\n",
+    "        \n",
+    "        # Finding best value in initial data\n",
+    "        if model_input.maximization:\n",
+    "            best_observed_value = Y_train.max()\n",
+    "            optimal_solution = torch.cat([Y_train[0],Y_test[0]]).max()\n",
+    "        else:\n",
+    "            best_observed_value = Y_train.min()\n",
+    "            optimal_solution = torch.cat([Y_train[0],Y_test[0]]).min()\n",
+    "        \n",
+    "        # If the optimal value is present in the initial dataset sample, remove it \n",
+    "        if (best_observed_value.eq(optimal_solution)) and model_input.maximization:\n",
+    "            print('Max in training set, removing it before training models.')\n",
+    "            optimal_position = torch.argmax(Y_train)\n",
+    "            \n",
+    "            # Add max value to test/exploration set\n",
+    "            X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n",
+    "            X_test = torch.cat([X_test,X_add_toTest])\n",
+    "            Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n",
+    "            Y_test = torch.cat((Y_test,Y_add_toTest),1)\n",
+    "            \n",
+    "            # Remove max value from training set\n",
+    "            X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n",
+    "            Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n",
+    "            \n",
+    "            # Update best observed value\n",
+    "            best_observed_value = Y_train.max()\n",
+    "        \n",
+    "        elif (best_observed_value.eq(optimal_solution)) and not model_input.maximization:\n",
+    "            print('Min in training set, removing it before training models.')\n",
+    "            optimal_position = torch.argmin(Y_train)\n",
+    "            \n",
+    "            # Add min value to test/exploration set\n",
+    "            X_add_toTest = torch.reshape(X_train[optimal_position,:],(1,X_train.shape[1]))\n",
+    "            X_test = torch.cat([X_test,X_add_toTest])\n",
+    "            Y_add_toTest = torch.reshape(optimal_solution,(1,1)) \n",
+    "            Y_test = torch.cat((Y_test,Y_add_toTest),1)\n",
+    "            \n",
+    "            # Remove min value from training set\n",
+    "            X_train = X_train[torch.arange(0, X_train.shape[0]) != optimal_position, ...]\n",
+    "            Y_train = Y_train[..., torch.arange(0, Y_train.shape[1]) != optimal_position]\n",
+    "            \n",
+    "            # Update best observed value\n",
+    "            best_observed_value = Y_train.min()\n",
+    "        \n",
+    "        # Initialize data for training gp-0 and gp-l models\n",
+    "        X_train0, Y_train0, X_test0, Y_test0 = X_train, Y_train, X_test, Y_test\n",
+    "        \n",
+    "        n_batch = model_input.n_batch_perTrial\n",
+    "        \n",
+    "        # Initialize likelihood, GP model and acquisition function for the models\n",
+    "        #--------------------------- GP-0 ---------------------------#\n",
+    "        likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()\n",
+    "        model_gp0 = surrogate_models.ExactGPModel(X_train0, Y_train0, likelihood_gp0) \n",
+    "        AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_observed_value, maximize=model_input.maximization)\n",
+    "        best_observed0.append(best_observed_value)  # Appending to best_observed list for the given trial\n",
+    "        \n",
+    "        # run N_BATCH rounds of BayesOpt after the initial random batch\n",
+    "        for iteration in range(1, n_batch + 1):\n",
+    "\n",
+    "            if ((iteration-1)%model_input.n_update==0):\n",
+    "                # re-fit the GP hyperparameters every n_update iterations\n",
+    "                model_gp0, likelihood_gp0 = surrogate_models.train_surrogate_gp0(X_train0, Y_train0)\n",
+    "            \n",
+    "            # optimize and get new observation using acquisition function\n",
+    "            new_x0, new_y0, index, X_test_new0, Y_test_new0 = optimize_acqf_and_get_observation(AcqFunc_0, X_test0, Y_test0)\n",
+    "            \n",
+    "            # Update remaining choices tensor\n",
+    "            X_test0 = X_test_new0\n",
+    "            Y_test0 = Y_test_new0\n",
+    "\n",
+    "            # Update training points\n",
+    "            X_train0 = torch.cat([X_train0, new_x0])\n",
+    "            Y_train0 = torch.cat([Y_train0[0], new_y0[0]])\n",
+    "            Y_train0 = torch.reshape(Y_train0,(1,Y_train0.shape[0]))\n",
+    "\n",
+    "            # update progress\n",
+    "            if model_input.maximization:\n",
+    "                best_value_ei0 = Y_train0.max()\n",
+    "            elif not model_input.maximization:\n",
+    "                best_value_ei0 = Y_train0.min()\n",
+    "            best_observed0.append(best_value_ei0)\n",
+    "\n",
+    "            # AcqFunc_0 = UpperConfidenceBound(model_gp0, beta=0.1) \n",
+    "            AcqFunc_0 = ExpectedImprovement(model=model_gp0, best_f=best_value_ei0, maximize=model_input.maximization)\n",
+    "            \n",
+    "            if model_input.verbose:\n",
+    "                print(\n",
+    "                    f\"\\nBatch {iteration:>2}: best_value (GP-0) = \",\n",
+    "                    f\"{best_value_ei0:>4.2f}\",\n",
+    "                    end=\"\",)\n",
+    "\n",
+    "        t1 = time.monotonic()\n",
+    "        \n",
+    "        print(f\"time = {t1-t0:>4.2f}.\")\n",
+    "        # Appending to common list of best observed values, with number of rows equal to number of trials\n",
+    "        best_observed_all_ei0.append(best_observed0) \n",
+    "    return best_observed_all_ei0"
+   ]
+  },
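+  {
+   "cell_type": "markdown",
+   "id": "note-convergence",
+   "metadata": {},
+   "source": [
+    "*Added note (sketch):* `train_gp` returns one best-observed trace per trial, each of length `n_batch + 1`, so the convergence of a single cluster's GP can be inspected with a plot like the following (names as defined above; the call to `train_gp(0)` is illustrative):\n",
+    "\n",
+    "```python\n",
+    "traces = train_gp(0)  # trials x iterations list of best-observed values\n",
+    "for t, trace in enumerate(traces):\n",
+    "    plt.plot([float(v) for v in trace], label=f'trial {t+1}')\n",
+    "plt.xlabel('BO iteration'); plt.ylabel('best observed'); plt.legend()\n",
+    "```"
+   ]
+  },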
+  {
+   "cell_type": "markdown",
+   "id": "17fbf21d",
+   "metadata": {},
+   "source": [
+    "#### Main Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "bfb03e8a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/maitreyeesharma/opt/anaconda3/envs/torch/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py:702: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n",
+      "  warnings.warn(\n",
+      "\n",
+      "KeyboardInterrupt\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Error in callback > (for post_execute):\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "KeyboardInterrupt\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "warnings.filterwarnings(\"ignore\", category=BadInitialCandidatesWarning)\n",
+    "warnings.filterwarnings(\"ignore\", category=RuntimeWarning)\n",
+    "\n",
+    "# Create a new directory if it does not exist\n",
+    "isExist = os.path.exists(model_input.output_folder)\n",
+    "if not isExist:\n",
+    "    os.makedirs(model_input.output_folder)\n",
+    "    print(\"The new directory is created!\", model_input.output_folder)\n",
+    "    \n",
+    "# Commented out by NKT\n",
+    "# # Copy input parameters file to output folder\n",
+    "# shutil.copy2('surrogate_model_inputs.py',model_input.output_folder)\n",
+    "# Copy surrogate model file to output folder\n",
+    "shutil.copy2('surrogate_models.py',model_input.output_folder)\n",
+    "\n",
+    "# Training a single GP for test\n",
+    "# a = train_gp(0)\n",
+    "\n",
+    "# Train the cluster of GP models in a parallel for loop\n",
+    "best_observed_all_ei0 = Parallel(n_jobs=-1)(\n",
+    "    delayed(train_gp)(i) for i in range(num_cluster)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ddae264",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Epsilon-greedy method (sketch): choose among the local GPs' BO recommendations.\n",
+    "# Assumes `best_observed_all_ei0` from the parallel loop above; `epsilon` is a\n",
+    "# placeholder exploration rate.\n",
+    "epsilon = 0.1\n",
+    "\n",
+    "# Best value found by each cluster's GP (last entry of each trial's trace)\n",
+    "cluster_best = [max(float(trace[-1]) for trace in cluster) for cluster in best_observed_all_ei0]\n",
+    "\n",
+    "random_number = np.random.rand()\n",
+    "# Explore using the epsilon-greedy exploration strategy\n",
+    "if random_number <= epsilon:\n",
+    "    # Explore: randomly pick one cluster's BO recommendation\n",
+    "    best_observed = cluster_best[np.random.randint(num_cluster)]\n",
+    "else:\n",
+    "    # Exploit best known action: the best output from the BO recommendations of the local GPs\n",
+    "    best_observed = max(cluster_best)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
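A follow-on sketch (not part of the diff): the per-cluster scalers dumped by `train_gp` (the `scalerY_{i}.joblib` binaries added above) can be loaded back to return standardized GP predictions to physical units. The fitted `model`, `likelihood`, and query tensor are assumed to come from the notebook's earlier cells.

```python
# Hedged sketch: undo the per-cluster label standardization on GP predictions
import os
from joblib import load
import code_inputs as model_input

cluster_idx = 0
scalerY = load(os.path.join(model_input.output_folder, f'scalerY_{cluster_idx}.joblib'))
# mean, var = surrogate_models.predict_surrogates(model, likelihood, X_query)
# y_physical = scalerY.inverse_transform(mean.reshape(-1, 1).numpy())
```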
diff --git a/src/code_inputs.py b/src/code_inputs.py
new file mode 100644
index 0000000..e726263
--- /dev/null
+++ b/src/code_inputs.py
@@ -0,0 +1,66 @@
+# This file contains surrogate model inputs
+import numpy as np
+import os
+import json
+import math
+
+# Torch specific module imports
+import torch
+import gpytorch
+from torch import nn
+from torch.utils.data import DataLoader, Dataset
+from torchvision import datasets, transforms
+from torch.nn import functional as F
+
+# botorch specific modules
+from botorch.fit import fit_gpytorch_model
+from botorch.models.gpytorch import GPyTorchModel
+
+# Plotting libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+# User defined python classes and files
+import sys
+
+import utils_dataset as utilsd
+import input_class
+
+np.random.seed(0)
+torch.manual_seed(0)
+
+# General inputs
+run_folder = '/Users/nikhilthota/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery'  # Folder where code is run and input json exist
+num_run = 3
+test_size = 0.01
+output_folder = run_folder+'/bo_output/'  # Folder where all outputs are stored
+output_folder = output_folder+'test'
+verbose = True
+deep_verbose = False
+
+# Reading and data processing inputs
+add_target_noise = False
+standardize_data = True
+
+# Feature selection inputs
+test_size_fs = 0.1
+select_features_otherModels = False
+
+# BO inputs
+n_trials = 5
+n_update = 1000
+GP_0_BO = True
+GP_L_BO = True
+GP_NN_BO = False
+random_seed = 'iteration'
+maximization = True
+new_values_predict_from_model = False
+n_batch_perTrial = 100
+
+# Surrogate training boolean inputs
+train_GP = True
+
+# GP Model parameters
+kernel = 'Matern'
+learning_rate_gp0 = 0.01
+epochs_GP0 = 500
diff --git a/src/feature_selection_methods.py b/src/feature_selection_methods.py
new file mode 100644
index 0000000..f0e4c37
--- /dev/null
+++ b/src/feature_selection_methods.py
@@ -0,0 +1,78 @@
+import numpy as np
+import csv
+import copy
+import random
+import pandas as pd
+import scipy
+import sklearn as sk
+
+import xgboost as xgb
+from xgboost.sklearn import XGBRegressor
+from sklearn.metrics import mean_squared_error
+
+# User defined files and classes
+import utils_dataset as utilsd
+
+# sklearn functions
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import Lasso
+
+
+class feature_selection_algorithms:
+
+    def __init__(self,XX,YY,test_size=0.33,random_state=42):
+
+        # Train Data
+        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(XX, YY, test_size=test_size, random_state=random_state)
+
+    # XGBoost
+    def xgboost(self, **kwargs):
+
+        clf = XGBRegressor(n_estimators=100, learning_rate=0.025, max_depth=20, verbosity=0, booster='gbtree',
+                           reg_alpha=np.exp(-6.788644799030888), reg_lambda=np.exp(-7.450413274554533),
+                           gamma=np.exp(-5.374463422208394), subsample=0.5, objective= 'reg:squarederror', n_jobs=1)
+
+        paras = clf.get_params()
+
+        clf.fit(self.X_train, self.y_train)
+        return clf
+
+    # Features selected by XGBoost
+    def selected_features_xgboost(self, descriptors, deep_verbose=False):
+
+        clf = self.xgboost()
+        score = clf.score(self.X_train, self.y_train)
+        if deep_verbose:
+            print("XGBoost Training score: ", score)
+
+        scores = cross_val_score(clf, self.X_train, self.y_train, cv=10)
+        if deep_verbose:
+            print("XGBoost Mean cross-validation score: %.2f" % scores.mean())
+
+        ypred = clf.predict(self.X_test)
+        mse = mean_squared_error(self.y_test, ypred)
+        if deep_verbose:
+            print("XGBoost MSE: %.2f" % mse)
+            print("XGBoost RMSE: %.2f" % (mse**(1/2.0)))
+
+        f_importance = clf.get_booster().get_score(importance_type='gain')
+        feature_importance_dict = {}
+
+        for f,value in f_importance.items():
+            feature_index = int(f.split('f')[1])
+            feature_importance_dict[descriptors[feature_index]] = value
+            if deep_verbose:
+                print(f"Column: {feature_index}, descriptor: {descriptors[feature_index]}")
+
+        return feature_importance_dict.keys()
+
+
+if __name__=="__main__":
+
+    print('Feature selection methods are in this class')
\ No newline at end of file
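A hedged follow-on sketch: `selected_features_xgboost` returns the gain-importance dict's keys in insertion order; if a ranked subset is wanted instead, the gains can be sorted explicitly. Here `XX`, `YY`, and `descriptors` are assumed to come from `input_class.inputs.read_inputs`, and the top-10 cut is an arbitrary illustration.

```python
# Sketch: rank descriptors by XGBoost gain importance (illustrative only)
import feature_selection_methods as feature_selection

fs = feature_selection.feature_selection_algorithms(XX, YY)
gain = fs.xgboost().get_booster().get_score(importance_type='gain')
ranked = sorted(gain.items(), key=lambda kv: kv[1], reverse=True)
top_descriptors = [descriptors[int(f[1:])] for f, _ in ranked[:10]]  # 'f12' -> column 12
```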
diff --git a/src/input_class.py b/src/input_class.py
new file mode 100644
index 0000000..51ee96d
--- /dev/null
+++ b/src/input_class.py
@@ -0,0 +1,84 @@
+import sklearn
+import numpy as np
+import csv
+import copy
+import random
+import pandas as pd
+import pickle
+import json
+import openpyxl
+import itertools
+
+# User defined files and classes
+import feature_selection_methods as feature_selection
+import utils_dataset as utilsd
+
+# Plotting libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+# Tick parameters
+plt.rcParams['xtick.labelsize'] = 15
+plt.rcParams['ytick.labelsize'] = 15
+plt.rcParams['xtick.major.size'] = 5
+plt.rcParams['xtick.major.width'] = 1
+plt.rcParams['xtick.minor.size'] = 5
+plt.rcParams['xtick.minor.width'] = 1
+plt.rcParams['ytick.major.size'] = 5
+plt.rcParams['ytick.major.width'] = 1
+plt.rcParams['ytick.minor.size'] = 5
+plt.rcParams['ytick.minor.width'] = 1
+
+plt.rcParams['axes.labelsize'] = 20
+plt.rcParams['axes.titlesize'] = 20
+plt.rcParams['legend.fontsize'] = 15
+
+
+class inputs:
+    def __init__(self,input_type='COF',input_path='../',input_file='properties.csv'):
+        self.input_type = input_type
+        self.input_path = input_path
+        self.input_file = input_file
+        self.filename = self.input_path + self.input_file
+        self.data = pd.read_csv(self.filename)
+
+    def read_inputs(self):
+        '''
+        This function reads the dataset from the COF paper: https://pubs.acs.org/doi/10.1021/acs.chemmater.8b01425
+        input_type='COF',
+        input_path='.',
+        input_file='properties.csv'
+        '''
+#        XX_comp = data.iloc[:,38:45]
+        descriptors = ['dimensions', ' bond type', ' void fraction [widom]', ' supercell volume [A^3]', ' density [kg/m^3]',
+                       ' heat desorption high P [kJ/mol]',' absolute methane uptake high P [molec/unit cell]',
+                       ' absolute methane uptake high P [mol/kg]', ' excess methane uptake high P [molec/unit cell]',
+                       ' excess methane uptake high P [mol/kg]', ' heat desorption low P [kJ/mol]',
+                       ' absolute methane uptake low P [molec/unit cell]',
+                       ' absolute methane uptake low P [mol/kg]',
+                       ' excess methane uptake low P [molec/unit cell]',
+                       ' excess methane uptake low P [mol/kg]', ' surface area [m^2/g]', ' linkerA', ' linkerB', ' net',
+                       ' cell_a [A]', ' cell_b [A]', ' cell_c [A]', ' alpha [deg]', ' beta [deg]', ' gamma [deg]',
+                       ' num carbon', ' num fluorine', ' num hydrogen', ' num nitrogen', ' num oxygen', ' num sulfur',
+                       ' num silicon', ' vertices', ' edges', ' genus', ' largest included sphere diameter [A]',
+                       ' largest free sphere diameter [A]', ' largest included sphere along free sphere path diameter [A]',
+                       ' absolute methane uptake high P [v STP/v]', ' absolute methane uptake low P [v STP/v]']
+        XX_prop = pd.DataFrame(self.data, columns=descriptors)
+        target = copy.deepcopy(self.data[' deliverable capacity [v STP/v]'].to_numpy())
+        YY = target.reshape(-1,1)
+
+        return XX_prop, YY, descriptors
+
+    def get_comp(self):
+        XX_comp_df = self.data.iloc[:,38:45]
+        YY = self.data.iloc[:, 27]
+        # YY_df = pd.DataFrame(YY)
+        # YY = copy.deepcopy(self.data[' deliverable capacity [v STP/v]'].to_numpy())
+        return XX_comp_df, YY
+
+
+
+if __name__=="__main__":
+
+    print('Reading inputs')
\ No newline at end of file
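A hedged sketch related to the clustering module that follows: `kmeans.py` imports `silhouette_samples` but fixes `k=3`, so the number of clusters could instead be picked from the silhouette score. `XX_comp_df` is assumed to be the composition DataFrame from `inputs.get_comp()`, and the `range(2, 8)` scan is an arbitrary illustration.

```python
# Sketch: choose k for k_means via silhouette score rather than hardcoding k=3
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

scores = {}
for k in range(2, 8):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(XX_comp_df)
    scores[k] = silhouette_score(XX_comp_df, labels)
best_k = max(scores, key=scores.get)  # k with the highest mean silhouette
```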
diff --git a/src/kmeans.py b/src/kmeans.py
new file mode 100644
index 0000000..faf0992
--- /dev/null
+++ b/src/kmeans.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn import metrics
+from sklearn.metrics import silhouette_samples
+
+def k_means(X, y, k=3):
+    """
+    Perform k-means clustering on the given input X and label y
+    """
+
+    # Perform k-means clustering
+    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
+    labels = kmeans.labels_
+    X['cluster'] = labels  # note: adds the label column to the caller's DataFrame in place
+    data = pd.concat([X, y], axis=1)
+    clustered_dfs = [data[data['cluster'] == i].drop('cluster', axis=1) for i in range(k)]
+
+    return clustered_dfs
+
+def draw_samples(clustered_dfs, sample_fraction = 0.01, random_state = 42):
+    sample_dfs = []
+    for df in clustered_dfs:
+        sample_size = int(len(df) * sample_fraction)
+        sample_dfs.append(df.sample(n=sample_size, random_state=random_state))
+    return sample_dfs
+
+def concat(dfs_list):
+    return pd.concat(dfs_list, ignore_index=True)
\ No newline at end of file
diff --git a/src/surrogate_models.py b/src/surrogate_models.py
new file mode 100644
index 0000000..a96e173
--- /dev/null
+++ b/src/surrogate_models.py
@@ -0,0 +1,95 @@
+# This file constructs surrogate models for the input datasets
+import numpy as np
+import os
+import json
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+import math
+
+# Torch specific module imports
+import torch
+import gpytorch
+
+# botorch specific modules
+from botorch.fit import fit_gpytorch_model
+from botorch.models.gpytorch import GPyTorchModel
+
+# Plotting libraries
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+# User defined python classes and files
+import utils_dataset as utilsd
+import input_class
+import code_inputs as model_input
+
+np.random.seed(0)
+torch.manual_seed(0)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using {device} device")
+
+# We will use the simplest form of GP model, exact inference
+class ExactGPModel(gpytorch.models.ExactGP, GPyTorchModel):
+    _num_outputs = 1  # to inform GPyTorchModel API
+    MIN_INFERRED_NOISE_LEVEL = 1e-5
+
+    def __init__(self, train_x, train_y, likelihood):
+        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
+        self.mean_module = gpytorch.means.ConstantMean()
+        if model_input.kernel == 'RBF':
+            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
+        elif model_input.kernel == 'Matern':
+            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=0.5))
+
+    def forward(self, x):
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+
+#--------------------------- GP-0 ---------------------------#
+def train_surrogate_gp0(X_train, Y_train):
+
+    mse_gp0 = 0.0
+    training_iter = model_input.epochs_GP0
+
+    # initialize likelihood and model
+    likelihood_gp0 = gpytorch.likelihoods.GaussianLikelihood()
+    model_gp0 = ExactGPModel(X_train, Y_train, likelihood_gp0)
+
+    # Find optimal model hyperparameters
+    model_gp0.train()
+    likelihood_gp0.train()
+
+    # Use the adam optimizer
+    optimizer = torch.optim.Adam(model_gp0.parameters(), lr=model_input.learning_rate_gp0)  # Includes GaussianLikelihood parameters
+
+    # "Loss" for GPs - the marginal log likelihood
+    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood_gp0, model_gp0)
+
+    for i in range(training_iter):
+        optimizer.zero_grad()          # Zero gradients from previous iteration
+        output = model_gp0(X_train)    # Output from model
+        loss = -mll(output, Y_train)   # Calc loss and backprop gradients
+        loss.backward()
+        optimizer.step()
+
+    return model_gp0, likelihood_gp0
+
+def predict_surrogates(model, likelihood, X):
+
+    # Get into evaluation (predictive posterior) mode
+    model.eval()
+    likelihood.eval()
+
+    # Make predictions by passing the model posterior through the likelihood
+    # (the likelihood adds the inferred observation noise to the variance)
+    with torch.no_grad(), gpytorch.settings.fast_pred_var():
+        prediction = likelihood(model(X))
+
+    observed_mean = prediction.mean
+    observed_var = prediction.variance
+
+    return observed_mean, 
observed_var + + \ No newline at end of file diff --git a/src/test.ipynb b/src/test.ipynb new file mode 100644 index 0000000..8d0862b --- /dev/null +++ b/src/test.ipynb @@ -0,0 +1,143 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dde8a292", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2234ceda", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from botorch.models import SingleTaskGP\n", + "from botorch.fit import fit_gpytorch_mll\n", + "from botorch.utils import standardize\n", + "from gpytorch.mlls import ExactMarginalLogLikelihood\n", + "\n", + "train_X = torch.rand(10, 2)\n", + "Y = 1 - torch.linalg.norm(train_X - 0.5, dim=-1, keepdim=True)\n", + "Y = Y + 0.1 * torch.randn_like(Y) # add some noise\n", + "train_Y = standardize(Y)\n", + "\n", + "gp = SingleTaskGP(train_X, train_Y)\n", + "mll = ExactMarginalLogLikelihood(gp.likelihood, gp)\n", + "fit_gpytorch_mll(mll)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4524f72", + "metadata": {}, + "outputs": [], + "source": [ + "from joblib import Parallel, delayed\n", + "\n", + "def compute_sq(x):\n", + " return x**2\n", + "\n", + "# Time the computation of the squares of the numbers 0 to 9\n", + "\n", + "%timeit -n 10 Parallel(n_jobs=-1, backend='loky', verbose=0)(delayed(compute_sq)(x) for x in range(10))\n", + "%timeit -n 10 [compute_sq(x) for x in range(10)]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92f541d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the inputs\n", + "\n", + "from utils_dataset import generate_training_data\n", + "from input_class import inputs\n", + "\n", + "# X_train, X_test, Y_train, Y_test, Var_train, Var_test, scalerX_transform, scalerY_transform = generate_training_data(0, 0.1)\n", + "\n", + "input_obj = inputs()\n", + "XX, YY, descriptors = input_obj.read_inputs()\n", + "\n", + "print(XX)\n", + "print(YY)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "97d372f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using cpu device\n", + "MultivariateNormal(loc: torch.Size([2]))\n", + "tensor([1.0886, 1.1504], dtype=torch.float64, grad_fn=)\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "grad can be implicitly created only for scalar outputs", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m XX \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mrand(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m2\u001b[39m))\n\u001b[1;32m 6\u001b[0m YY \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mtensor(np\u001b[38;5;241m.\u001b[39mrandom\u001b[38;5;241m.\u001b[39mrand(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m1\u001b[39m))\n\u001b[0;32m----> 8\u001b[0m \u001b[43mtrain_surrogate_gp0\u001b[49m\u001b[43m(\u001b[49m\u001b[43mXX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mYY\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/lab/projects/SPIRAL/codes_and_datasets/T-NIKHIL/project-sparse-gp-for-materials-discovery/src/surrogate_models.py:75\u001b[0m, in \u001b[0;36mtrain_surrogate_gp0\u001b[0;34m(X_train, Y_train)\u001b[0m\n\u001b[1;32m 
73\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39mmll(output, Y_train) \u001b[38;5;66;03m# Calc loss and backprop gradients \u001b[39;00m\n\u001b[1;32m 74\u001b[0m \u001b[38;5;28mprint\u001b[39m(loss) \n\u001b[0;32m---> 75\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 76\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model_gp0, likelihood_gp0\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/autograd/__init__.py:259\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 250\u001b[0m inputs \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 251\u001b[0m (inputs,)\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(inputs, (torch\u001b[38;5;241m.\u001b[39mTensor, graph\u001b[38;5;241m.\u001b[39mGradientEdge))\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m()\n\u001b[1;32m 256\u001b[0m )\n\u001b[1;32m 258\u001b[0m grad_tensors_ \u001b[38;5;241m=\u001b[39m _tensor_or_tensors_to_tuple(grad_tensors, \u001b[38;5;28mlen\u001b[39m(tensors))\n\u001b[0;32m--> 259\u001b[0m grad_tensors_ \u001b[38;5;241m=\u001b[39m \u001b[43m_make_grads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_grads_batched\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m retain_graph \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 261\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n", + "File \u001b[0;32m~/miniconda3/envs/bo-hackathon/lib/python3.12/site-packages/torch/autograd/__init__.py:132\u001b[0m, in 
\u001b[0;36m_make_grads\u001b[0;34m(outputs, grads, is_grads_batched)\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out\u001b[38;5;241m.\u001b[39mrequires_grad:\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out\u001b[38;5;241m.\u001b[39mnumel() \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 133\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgrad can be implicitly created only for scalar outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 134\u001b[0m )\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m out\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mis_floating_point:\n\u001b[1;32m 136\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 137\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgrad can be implicitly created only for real scalar outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mout\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 139\u001b[0m )\n", + "\u001b[0;31mRuntimeError\u001b[0m: grad can be implicitly created only for scalar outputs" + ] + } + ], + "source": [ + "from surrogate_models import train_surrogate_gp0\n", + "import numpy as np\n", + "import torch\n", + "\n", + "XX = torch.tensor(np.random.rand(2, 2))\n", + "YY = torch.tensor(np.random.rand(2, 1))\n", + "\n", + "train_surrogate_gp0(XX, YY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07d11c64", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/utils_dataset.py b/src/utils_dataset.py new file mode 100644 index 0000000..5015a41 --- /dev/null +++ b/src/utils_dataset.py @@ -0,0 +1,127 @@ +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split +from torch.utils.data import Dataset, random_split, DataLoader + +import torch +import json +import pandas as pd +import numpy as np + +# User defined python classes and files +import utils_dataset as utilsd +import input_class +import code_inputs as model_input +import feature_selection_methods as feature_selection + +# Add slicing of the input XX tensor with additional input for the columns picked out by XGBoost or other feature selection methods +class InputDataset(Dataset): + """ Input dataset used for training """ + + def __init__(self, XX, YY, Var=None, transform=None): + """ + Args: + XX: NN Input features vector as a torch tensor + YY: NN Labels vector as a torch tensor + descriptors(list of strings): Names of the input features + transform (callable, optional): Optional transform to be applied + on a sample. 
+ """ + self.XX = XX + self.YY = YY + self.var = Var + self.transform = transform + + def __len__(self): + return self.XX.shape[0] + + def __getitem__(self, idx): + if torch.is_tensor(idx): + idx = idx.tolist() + + x = self.XX[idx,:] + y = self.YY[:,idx] + if self.var != None: + var = self.var[idx] + item = {'in_features':x,'labels':y,'variance':var} + else: + item = {'in_features':x,'labels':y} + + return item + + +def standardize_data(x): + scalerX = StandardScaler().fit(x) + x_train = scalerX.transform(x) + return x_train, scalerX + +def standardize_test_data(x,scalerX): + x_test = scalerX.transform(x) + return x_test + +def generate_training_data(random_state,test_size): + + # todo: Modify below to read from code_inputs.py + # # Reading the input json file with dataset filename and path information + # with open(model_input.run_folder+'inputs.json', "r") as f: + # input_dict = json.load(f) + + # input_type = input_dict['InputType'] + # input_path = input_dict['InputPath'] + # input_file = input_dict['InputFile'] + # add_target_noise = input_dict['AddTargetNoise'] + + # input = input_class.inputs(input_type=input_type, + # input_path=input_path, + # input_file=input_file, + # add_target_noise=add_target_noise) + + input_obj = input_class.inputs() + XX, YY, descriptors = input_obj.read_inputs() + + # Transforming datasets by standardization + if model_input.standardize_data: + X_stand, scalerX_transform = utilsd.standardize_data(XX) + Y_stand, scalerY_transform = utilsd.standardize_data(YY) + else: + X_stand = XX.to_numpy() + Y_stand = YY + + # Checking if we should use xgboost recommended descriptors or all descriptors + if model_input.select_features_otherModels: + fs = feature_selection.feature_selection_algorithms(X_stand,Y_stand, + test_size=model_input.test_size_fs, + random_state=random_state) + xg_boost_descriptors = fs.selected_features_xgboost(descriptors) + if model_input.verbose: + print('Selected Features, ', xg_boost_descriptors) + else: + xg_boost_descriptors = descriptors + + XX = pd.DataFrame(XX, columns=xg_boost_descriptors) + if model_input.standardize_data: + X_stand, scalerX_transform = utilsd.standardize_data(XX) + else: + X_stand=XX.to_numpy() + + # Creating train-test split in data + X_train, X_test, Y_train, Y_test = train_test_split(X_stand, Y_stand, + test_size=test_size, + random_state=random_state) #,stratify=Y_stand) + + Var_train = torch.ones(len(Y_train)) + Var_test = torch.ones(len(Y_test)) + + # Converting data arrays to torch tensors + X_train = torch.tensor(X_train).to(torch.float32) + Y_train = np.transpose(Y_train) # Ytrain has to have only one row for GP training + Y_train = torch.tensor(Y_train).to(torch.float32) + + X_test = torch.tensor(X_test).to(torch.float32) + Y_test = np.transpose(Y_test) # Ytrain has to have only one row for GP training + Y_test = torch.tensor(Y_test).to(torch.float32) + + if model_input.standardize_data: + return X_train, X_test, Y_train, Y_test, Var_train, Var_test, scalerX_transform, scalerY_transform + else: + return X_train, X_test, Y_train, Y_test, Var_train, Var_test + diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 4356ad3..0000000 --- a/test.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "dde8a292", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - 
"name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}