Working on #18 : make autoencoders configurable through json, add option to train either single ae for all histograms or one ae per histogram (default)
sam-may committed Feb 3, 2022
1 parent 6f27045 commit 29124cb
Showing 4 changed files with 174 additions and 56 deletions.
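For context, a rough sketch of how the options introduced in this commit might be used (the config file name and keys below are hypothetical, and any other arguments expected by the MLAlgorithm base class are omitted):

import json
from autodqm_ml.algorithms.autoencoder import AutoEncoder

# hypothetical config file; any hyperparameters it does not set fall back to DEFAULT_OPT
with open("autoencoder_config.json", "r") as f_in:
    config = json.load(f_in)   # e.g. {"n_hidden_layers" : 3, "n_components" : 5}

# "individual" (default): one autoencoder per histogram
# "simultaneous": a single autoencoder trained on all histograms at once
autoencoder = AutoEncoder(config = config, autoencoder_mode = "simultaneous")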
6 changes: 4 additions & 2 deletions autodqm_ml/algorithms/anomaly_detection_algorithm.py
@@ -72,6 +72,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s

if histograms:
self.histograms = histograms
self.histogram_name_map = {} # we replace "/" and spaces in input histogram names to play nicely with other packages, this map lets you go back and forth between them

logger.debug("[AnomalyDetectionAlgorithm : load_data] Loading training data from file '%s'" % (self.input_file))

@@ -81,6 +82,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
# Set helpful metadata
for histogram, histogram_info in self.histograms.items():
self.histograms[histogram]["name"] = histogram.replace("/", "").replace(" ","")
self.histogram_name_map[self.histograms[histogram]["name"]] = histogram

a = awkward.to_numpy(df[histogram][0])
self.histograms[histogram]["shape"] = a.shape
@@ -134,9 +136,9 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
self.n_train = awkward.sum(df.train_label == 0)
self.n_test = awkward.sum(df.train_label == 1)
self.df = df
self.n_histograms = len(list(self.histograms.keys()))


logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (len(list(self.histograms.keys())), self.n_train, self.n_test))
logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (self.n_histograms, self.n_train, self.n_test))

self.data_is_loaded = True

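As an aside, the histogram_name_map added above simply inverts the name sanitization, so that model outputs (which are keyed by the sanitized name) can be matched back to the original histogram; a small sketch with a hypothetical histogram path:

histogram = "CSC/Occupancy/hORecHits"                     # hypothetical original name
sanitized = histogram.replace("/", "").replace(" ", "")   # "CSCOccupancyhORecHits"
histogram_name_map = { sanitized : histogram }
assert histogram_name_map[sanitized] == histogram         # round trip back to the original name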
214 changes: 161 additions & 53 deletions autodqm_ml/algorithms/autoencoder.py
@@ -3,6 +3,7 @@
import numpy
import json
import awkward
import copy

import logging
logger = logging.getLogger(__name__)
@@ -14,26 +15,46 @@
from autodqm_ml import utils

DEFAULT_OPT = {
"batch_size" : 16,
"val_batch_size" : 1024,
"n_epochs" : 1000,
"early_stopping" : True,
"early_stopping_rounds" : 3,
"n_hidden_layers" : 2,
"n_nodes" : 25,
"n_nodes" : 10,
"n_components" : 3,
"kernel_1d" : 3,
"kernel_2d" : 3,
"strides_1d" : 1,
"strides_2d" : 1,
"dropout" : 0.0,
"batch_norm" : False,
"n_filters" : 8
}

class AutoEncoder(MLAlgorithm):
"""
Autoencoder base class.
:param config: dictionary with hyperparameters for autoencoder training. Any hyperparameters not specified will be taken from the default values in `DEFAULT_OPT`
:type config: dict
:param mode: string to specify whether you want to train an autoencoder for each histogram ("individual") or a single autoencoder on all histograms ("simultaneous")
:type mode: str
"""
def __init__(self, **kwargs):
super(AutoEncoder, self).__init__(**kwargs)

self.config = utils.update_dict(
original = DEFAULT_OPT,
new = self.__dict__
new = kwargs.get('config', {})
)

self.mode = kwargs.get('autoencoder_mode', 'individual')
if not self.mode in ["individual", "simultaneous"]:
logger.exception("AutoEncoder : __init__] mode '%s' is not a recognized option for AutoEncoder. Currently available modes are 'individual' (default) and 'simultaneous'." % (self.mode))
raise ValueError()
self.models = {}
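
For illustration, the intended effect of the config merge above, assuming utils.update_dict overlays user-supplied keys onto the defaults (that behavior is an assumption, not shown in this diff):

from autodqm_ml import utils
from autodqm_ml.algorithms.autoencoder import DEFAULT_OPT

user_config = { "n_hidden_layers" : 3, "n_components" : 5 }   # e.g. parsed from a json file
merged = utils.update_dict(original = DEFAULT_OPT, new = user_config)
# merged["n_hidden_layers"] == 3 and merged["n_components"] == 5,
# while every other hyperparameter keeps its DEFAULT_OPT value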


def load_model(self, model_file):
"""
@@ -47,66 +68,129 @@ def save_model(self, model, model_file):
"""
"""
logger.debug("[AutoEncoder : save_model] Saving trained autoencoder to file '%s'." % (model_file))
model.save(model_file)


def train(self, n_epochs = 1000, batch_size = 128):
def train(self):
"""
"""
model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
if os.path.exists(model_file):
logger.warning("[AutoEncoder : train] A trained AutoEncoder alread exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
self.model = self.load_model(model_file)
return

inputs, outputs = self.make_inputs(split = "train")
inputs_val, outputs_val = self.make_inputs(split = "test")

self.model = AutoEncoder_DNN(self.histograms, **self.config).model()

self.model.compile(
optimizer = keras.optimizers.Adam(),
loss = keras.losses.MeanSquaredError()
)
if self.mode == "simultaneous":
self.models = { None : None }
logger.debug("[AutoEncoder : train] Mode selected as 'simultaneous', meaning a single autoencoder will be trained simultaneously on all histograms. Use 'individual' if you wish to train one autoencoder for each histogram.")
elif self.mode == "individual":
self.models = { k : None for k,v in self.histograms.items() } #copy.deepcopy(self.histograms)
logger.debug("[AutoEncoder : train] Mode selected as 'individual', meaning one autoencoder will be trained for each histogram. Use 'simultaneous' if you wish to train a single autoencoder for all histograms.")

for histogram, histogram_info in self.models.items():
if histogram is None:
model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
else:
model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag)

if os.path.exists(model_file):
logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
self.models[histogram] = self.load_model(model_file)
return

self.model.fit(
inputs,
outputs,
validation_data = (inputs_val, outputs_val),
callbacks = [keras.callbacks.EarlyStopping(patience = 3)],
epochs = n_epochs,
batch_size = batch_size
)
self.save_model(self.model, model_file)


def predict(self, batch_size = 1024):
inputs, outputs = self.make_inputs(split = "all")
pred = self.model.predict(inputs, batch_size = batch_size)

idx = 0
for histogram, histogram_info in self.histograms.items():
original_hist = self.df[histogram]
if len(self.histograms.items()) >= 2:
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred[idx]), axis = -1)
else:
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred), axis = -1)
inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram)
inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram)

sse = awkward.sum(
(original_hist - reconstructed_hist) ** 2,
axis = -1
if histogram is None:
hist_name = str(list(self.models.keys()))
else:
hist_name = histogram
logger.debug("[AutoEncoder : train] Training autoencoder with %d dimensions in latent space for histogram(s) '%s' with %d training examples." % (self.config["n_components"], hist_name, len(list(inputs.values())[0])))

if self.mode == "simultaneous":
histograms = self.histograms
elif self.mode == "individual":
histograms = { histogram : self.histograms[histogram] }

model = AutoEncoder_DNN(histograms, **self.config).model()

model.compile(
optimizer = keras.optimizers.Adam(),
loss = keras.losses.MeanSquaredError()
)

# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if histogram_info["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1)

self.add_prediction(histogram, sse, reconstructed_hist)
idx += 1
callbacks = []
if self.config["early_stopping"]:
callbacks.append(keras.callbacks.EarlyStopping(patience = self.config["early_stopping_rounds"]))

model.fit(
inputs,
outputs,
validation_data = (inputs_val, outputs_val),
callbacks = callbacks,
epochs = self.config["n_epochs"],
batch_size = self.config["batch_size"]
)

self.save_model(model, model_file)
self.models[histogram] = model


def predict(self, batch_size = 1024):
for histogram, model in self.models.items():
inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram)
predictions = model.predict(inputs, batch_size = batch_size)

def make_inputs(self, split = None):
if self.mode == "simultaneous" and self.n_histograms >= 2:
predictions = { name : pred for name, pred in zip(model.output_names, predictions) }
else:
predictions = { model.output_names[0] : predictions }

for name, pred in predictions.items():
hist_name = self.histogram_name_map[name.replace("output_", "")] # shape [n_runs, histogram dimensions, 1]
original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions]

reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions]
awkward.from_numpy(pred),
axis = -1
)

sse = awkward.sum( # perform sum along inner-most axis, i.e. first histogram dimension
(original_hist - reconstructed_hist) ** 2,
axis = -1
)

# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if self.histograms[hist_name]["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1) # second histogram dimension

self.add_prediction(hist_name, sse, reconstructed_hist)

"""
idx = 0
for histogram, histogram_info in self.histograms.items():
original_hist = self.df[histogram]
if self.n_histograms >= 2:
#reconstructed_hist = awkward.flatten(
# awkward.from_numpy(pred["output_" + histogram_info["name"]]),
# axis = -1
#)
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred[idx]), axis = -1)
else:
#reconstructed_hist = awkward.flatten(awkward.from_numpy(pred), axis = -1)
sse = awkward.sum(
(original_hist - reconstructed_hist) ** 2,
axis = -1
)
# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if histogram_info["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1)
self.add_prediction(histogram, sse, reconstructed_hist)
idx += 1
"""


def make_inputs(self, split = None, histogram_name = None):
"""
"""
@@ -123,21 +207,28 @@ def make_inputs(self, split = None):
df = self.df[cut]

for histogram, info in self.histograms.items():
if histogram_name is not None: # self.mode == "individual", i.e. separate autoencoder for each histogram
if not histogram == histogram_name: # only grab the relevant histogram for this autoencoder
continue

data = tf.convert_to_tensor(df[histogram])
inputs["input_" + info["name"]] = data
outputs["output_" + info["name"]] = data



return inputs, outputs


class AutoEncoder_DNN(keras.models.Model):
#class AutoEncoder_DNN(keras.models.Model):
class AutoEncoder_DNN():
"""
Model defined through the Keras Model Subclassing API: https://www.tensorflow.org/guide/keras/custom_layers_and_models
An AutoEncoder instance owns a single AutoEncoder_DNN, which is the actual implementation of the DNN.
"""
def __init__(self, histograms, **kwargs):
super(AutoEncoder_DNN, self).__init__()
#super(AutoEncoder_DNN, self).__init__()

self.n_histograms = len(histograms.keys())

@@ -179,7 +270,11 @@ def __init__(self, histograms, **kwargs):


def model(self):
model = keras.models.Model(inputs = self.inputs, outputs = self.outputs)
model = keras.models.Model(
inputs = self.inputs,
outputs = self.outputs,
name = "autoencoder"
)
model.summary()
return model

@@ -209,7 +304,12 @@ def build_encoder(self, histogram, info):
activation = "relu",
name = name
)(layer)

if self.batch_norm:
layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
if self.dropout > 0:
layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)


encoder = keras.layers.Flatten()(layer)
return input, encoder

@@ -229,10 +329,14 @@ def build_decoder(self, histogram, info, input):
activation = "relu"
n_filters = 1
name = "output_%s" % (info["name"])
batch_norm = False
dropout = 0
else:
activation = "relu"
n_filters = self.n_filters
name = "decoder_%d_%s" % (i, info["name"])
batch_norm = self.batch_norm
dropout = self.dropout

if info["n_dim"] == 1:
layer = keras.layers.Conv1DTranspose(
@@ -252,6 +356,10 @@
activation = activation,
name = name
)(layer)
if batch_norm:
layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
if dropout > 0:
layer = keras.layers.Dropout(dropout, name = name + "_dropout")(layer)

output = layer
return output
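For reference, the functional-API pattern AutoEncoder_DNN now relies on (named inputs and outputs wrapped in keras.models.Model rather than subclassing it); a toy single-histogram version with a made-up architecture:

from tensorflow import keras

# toy stand-in, not the convolutional encoder/decoder built above
input_hist = keras.layers.Input(shape = (50,), name = "input_some_histogram")
hidden = keras.layers.Dense(10, activation = "relu")(input_hist)
output_hist = keras.layers.Dense(50, activation = "relu", name = "output_some_histogram")(hidden)

model = keras.models.Model(inputs = [input_hist], outputs = [output_hist], name = "autoencoder")
model.compile(optimizer = keras.optimizers.Adam(), loss = keras.losses.MeanSquaredError())
model.summary()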
3 changes: 2 additions & 1 deletion autodqm_ml/algorithms/pca.py
@@ -64,6 +64,8 @@ def save_model(self, pca, model_file):
:param model_file: folder name to place trained PCA pickles
:type model_file: str
"""
logger.debug("[PCA : save_model] Saving trained PCA to file '%s'." % (model_file))

os.system("mkdir -p %s" % self.output_dir)
pcaParams = {
'name' : model_file.split("/")[-1].replace(".json", ""),
@@ -141,7 +143,6 @@ def train(self):
pca.fit(input)
self.model[histogram] = pca

logger.debug("[PCA : train] Saving trained PCA to file '%s'." % (model_file))
self.save_model(pca, model_file)


7 changes: 7 additions & 0 deletions scripts/train.py
@@ -58,6 +58,13 @@
required = False,
default = None
)
parser.add_argument(
"--autoencoder_mode",
help = "specify whether you want to train an autoencoder for each histogram ('individual') or a single autoencoder on all histograms ('simultaneous')",
type = str,
required = False,
default = None
)
parser.add_argument(
"--debug",
help = "run logger in DEBUG mode (INFO is default)",
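Presumably the parsed value is then forwarded to the AutoEncoder constructor elsewhere in train.py; that wiring is not part of this diff, so the sketch below is an assumption:

# hypothetical wiring, not shown in this commit
args = parser.parse_args()
autoencoder = AutoEncoder(
    config = config,                                           # dict loaded from a json config file (hypothetical)
    autoencoder_mode = args.autoencoder_mode or "individual"   # 'individual' (default) or 'simultaneous'
)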
