Merge pull request #27 from AutoDQM/autoencoder_dev_3Feb2022
Developments for autoencoders and assess script
chadfreer authored Feb 23, 2022
2 parents 850ba6a + 8f03834 commit 65968ac
Showing 11 changed files with 1,087 additions and 91 deletions.
7 changes: 1 addition & 6 deletions README.md
@@ -42,15 +42,10 @@ and then rerunning the command to create the `conda` env. The resulting `conda e

**3. Install autodqm-ml**

**Users** can install with:
```
python setup.py install
```
**Developers** are suggested to install with:
Install with:
```
pip install -e .
```
to avoid rerunning the whole installation every time there is a change.

Once your setup is installed, you can activate your python environment with
```
17 changes: 15 additions & 2 deletions autodqm_ml/algorithms/anomaly_detection_algorithm.py
@@ -2,6 +2,7 @@
import pandas
import numpy
import awkward
import json

from autodqm_ml import utils
from autodqm_ml.data_formats.histogram import Histogram
@@ -72,6 +73,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s

if histograms:
self.histograms = histograms
self.histogram_name_map = {} # we replace "/" and spaces in input histogram names to play nicely with other packages; this map lets you go back and forth between the sanitized and original names

logger.debug("[AnomalyDetectionAlgorithm : load_data] Loading training data from file '%s'" % (self.input_file))

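For illustration, the new `histogram_name_map` round-trips sanitized and original names like this (the histogram path below is invented for the example):

```
# Hypothetical example of the round trip; the histogram path is made up.
original = "CSC/Occupancy/hORecHits"
sanitized = original.replace("/", "").replace(" ", "")   # "CSCOccupancyhORecHits"
histogram_name_map = {sanitized : original}
assert histogram_name_map[sanitized] == original         # back to the original name
```
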
@@ -81,6 +83,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
# Set helpful metadata
for histogram, histogram_info in self.histograms.items():
self.histograms[histogram]["name"] = histogram.replace("/", "").replace(" ","")
self.histogram_name_map[self.histograms[histogram]["name"]] = histogram

a = awkward.to_numpy(df[histogram][0])
self.histograms[histogram]["shape"] = a.shape
@@ -134,9 +137,9 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
self.n_train = awkward.sum(df.train_label == 0)
self.n_test = awkward.sum(df.train_label == 1)
self.df = df
self.n_histograms = len(list(self.histograms.keys()))


logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (len(list(self.histograms.keys())), self.n_train, self.n_test))
logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (self.n_histograms, self.n_train, self.n_test))

self.data_is_loaded = True

@@ -160,3 +163,13 @@ def save(self):
self.output_file = "%s/%s.parquet" % (self.output_dir, self.input_file.split("/")[-1].replace(".parquet", ""))
logger.info("[AnomalyDetectionAlgorithm : save] Saving output with additional fields to file '%s'." % (self.output_file))
awkward.to_parquet(self.df, self.output_file)

self.config_file = "%s/%s_%s.json" % (self.output_dir, self.name, self.tag)
config = {}
for k,v in vars(self).items():
if utils.is_json_serializable(v):
config[k] = v

logger.info("[AnomalyDetectionAlgorithm : save] Saving AnomalyDetectionAlgorithm config to file '%s'." % (self.config_file))
with open(self.config_file, "w") as f_out:
json.dump(config, f_out, sort_keys = True, indent = 4)
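The `save` method above keeps only attributes that survive JSON encoding. A minimal sketch of what `utils.is_json_serializable` could look like (the real helper lives in `autodqm_ml.utils` and may be implemented differently):

```
import json

def is_json_serializable(value):
    """Return True if `value` can be written with json.dump."""
    try:
        json.dumps(value)
        return True
    except (TypeError, ValueError):
        return False
```
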
203 changes: 144 additions & 59 deletions autodqm_ml/algorithms/autoencoder.py
@@ -3,6 +3,7 @@
import numpy
import json
import awkward
import copy

import logging
logger = logging.getLogger(__name__)
@@ -14,26 +15,54 @@
from autodqm_ml import utils

DEFAULT_OPT = {
"batch_size" : 128,
"val_batch_size" : 1024,
"learning_rate" : 0.001,
"n_epochs" : 1000,
"early_stopping" : True,
"early_stopping_rounds" : 3,
"n_hidden_layers" : 2,
"n_nodes" : 25,
"n_nodes" : 50,
"n_components" : 3,
"kernel_1d" : 3,
"kernel_2d" : 3,
"n_filters" : 8
"strides_1d" : 1,
"strides_2d" : 1,
"dropout" : 0.0,
"batch_norm" : False,
"n_filters" : 12
}

class AutoEncoder(MLAlgorithm):
"""
Autoencoder base class.
:param config: dictionary with hyperparameters for autoencoder training. Any hyperparameters not specified will be taken from the default values in `DEFAULT_OPT`
:type config: dict
:param mode: string to specify whether you want to train an autoencoder for each histogram ("individual") or a single autoencoder on all histograms ("simultaneous")
:type mode: str
"""
def __init__(self, **kwargs):
super(AutoEncoder, self).__init__(**kwargs)

self.config = utils.update_dict(
original = DEFAULT_OPT,
new = self.__dict__
new = kwargs.get('config', {})
)

self.mode = kwargs.get('autoencoder_mode', 'individual')
if self.mode is None:
self.mode = "individual"

if self.mode not in ["individual", "simultaneous"]:
logger.exception("[AutoEncoder : __init__] mode '%s' is not a recognized option for AutoEncoder. Currently available modes are 'individual' (default) and 'simultaneous'." % (self.mode))
raise ValueError()
self.models = {}

logger.debug("[AutoEncoder : __init__] Constructing AutoEncoder with the following training options and hyperparameters:")
for param, value in self.config.items():
logger.debug("\t %s : %s" % (param, str(value)))


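As a usage sketch, only the keys passed in `config` override `DEFAULT_OPT`; `config` and `autoencoder_mode` are read directly in this `__init__`, while any other constructor details (tags, output directories) are assumed to be handled by the `MLAlgorithm` base class:

```
from autodqm_ml.algorithms.autoencoder import AutoEncoder

# Hedged sketch: unspecified hyperparameters fall back to DEFAULT_OPT.
ae = AutoEncoder(
    autoencoder_mode = "individual",   # one autoencoder per histogram
    config = {
        "n_epochs" : 200,              # instead of the default 1000
        "learning_rate" : 5e-4,        # instead of the default 0.001
    },
)
```
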
def load_model(self, model_file):
"""
@@ -47,66 +76,103 @@ def save_model(self, model, model_file):
"""
"""
logger.debug("[AutoEncoder : save_model] Saving trained autoencoder to file '%s'." % (model_file))
model.save(model_file)


def train(self, n_epochs = 1000, batch_size = 128):
def train(self):
"""
"""
model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
if os.path.exists(model_file):
logger.warning("[AutoEncoder : train] A trained AutoEncoder alread exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
self.model = self.load_model(model_file)
return

inputs, outputs = self.make_inputs(split = "train")
inputs_val, outputs_val = self.make_inputs(split = "test")

self.model = AutoEncoder_DNN(self.histograms, **self.config).model()

self.model.compile(
optimizer = keras.optimizers.Adam(),
loss = keras.losses.MeanSquaredError()
)
if self.mode == "simultaneous":
self.models = { None : None }
logger.debug("[AutoEncoder : train] Mode selected as 'simultaneous', meaning a single autoencoder will be trained simultaneously on all histograms. Use 'individual' if you wish to train one autoencoder for each histogram.")
elif self.mode == "individual":
self.models = { k : None for k in self.histograms }
logger.debug("[AutoEncoder : train] Mode selected as 'individual', meaning one autoencoder will be trained for each histogram. Use 'simultaneous' if you wish to train a single autoencoder for all histograms.")

for histogram, histogram_info in self.models.items():
if histogram is None:
model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
else:
model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag)

if os.path.exists(model_file):
logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
self.models[histogram] = self.load_model(model_file)
continue # skip to the next histogram rather than abandoning the rest of the training loop

self.model.fit(
inputs,
outputs,
validation_data = (inputs_val, outputs_val),
callbacks = [keras.callbacks.EarlyStopping(patience = 3)],
epochs = n_epochs,
batch_size = batch_size
)
self.save_model(self.model, model_file)


def predict(self, batch_size = 1024):
inputs, outputs = self.make_inputs(split = "all")
pred = self.model.predict(inputs, batch_size = batch_size)

idx = 0
for histogram, histogram_info in self.histograms.items():
original_hist = self.df[histogram]
if len(self.histograms.items()) >= 2:
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred[idx]), axis = -1)
else:
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred), axis = -1)
inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram)
inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram)

sse = awkward.sum(
(original_hist - reconstructed_hist) ** 2,
axis = -1
if histogram is None:
hist_name = str(list(self.models.keys()))
else:
hist_name = histogram
logger.debug("[AutoEncoder : train] Training autoencoder with %d dimensions in latent space for histogram(s) '%s' with %d training examples." % (self.config["n_components"], hist_name, len(list(inputs.values())[0])))

if self.mode == "simultaneous":
histograms = self.histograms
elif self.mode == "individual":
histograms = { histogram : self.histograms[histogram] }

model = AutoEncoder_DNN(histograms, **self.config).model()

model.compile(
optimizer = keras.optimizers.Adam(learning_rate = self.config["learning_rate"]),
loss = keras.losses.MeanSquaredError()
)

# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if histogram_info["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1)

self.add_prediction(histogram, sse, reconstructed_hist)
idx += 1
callbacks = []
if self.config["early_stopping"]:
callbacks.append(keras.callbacks.EarlyStopping(patience = self.config["early_stopping_rounds"]))

model.fit(
inputs,
outputs,
validation_data = (inputs_val, outputs_val),
callbacks = callbacks,
epochs = self.config["n_epochs"],
batch_size = self.config["batch_size"]
)

self.save_model(model, model_file)
self.models[histogram] = model


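One detail worth noting about the early-stopping setup in `train`: `keras.callbacks.EarlyStopping` monitors `val_loss` by default, so the `validation_data` passed to `fit` is what actually halts training. The callback built above, written out with its Keras defaults made explicit:

```
# Equivalent explicit form of the early-stopping callback used in train().
cb = keras.callbacks.EarlyStopping(
    monitor = "val_loss",          # Keras default: track the validation loss
    patience = 3,                  # DEFAULT_OPT["early_stopping_rounds"]
    restore_best_weights = False,  # Keras default: keep the final-epoch weights
)
```
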
def predict(self, batch_size = 1024):
for histogram, model in self.models.items():
inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram)
predictions = model.predict(inputs, batch_size = batch_size)

def make_inputs(self, split = None):
if self.mode == "simultaneous" and self.n_histograms >= 2:
predictions = { name : pred for name, pred in zip(model.output_names, predictions) }
else:
predictions = { model.output_names[0] : predictions }

for name, pred in predictions.items():
hist_name = self.histogram_name_map[name.replace("output_", "")] # shape [n_runs, histogram dimensions, 1]
original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions]

reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions]
awkward.from_numpy(pred),
axis = -1
)

sse = awkward.sum( # perform sum along inner-most axis, i.e. first histogram dimension
(original_hist - reconstructed_hist) ** 2,
axis = -1
)

# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if self.histograms[hist_name]["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1) # second histogram dimension

self.add_prediction(hist_name, sse, reconstructed_hist)


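The SSE reduction in `predict` collapses the histogram axes one at a time. A minimal numpy sketch of the same arithmetic (shapes invented for illustration; `awkward.sum` behaves the same way on these regular arrays):

```
import numpy as np

n_runs, n_bins = 5, 10
orig_1d = np.random.rand(n_runs, n_bins)             # [n_runs, n_bins]
reco_1d = np.random.rand(n_runs, n_bins)
sse_1d = np.sum((orig_1d - reco_1d) ** 2, axis = -1) # one score per run -> shape (5,)

orig_2d = np.random.rand(n_runs, n_bins, n_bins)     # [n_runs, n_x, n_y]
reco_2d = np.random.rand(n_runs, n_bins, n_bins)
sse_2d = np.sum((orig_2d - reco_2d) ** 2, axis = -1) # sum over y -> (5, 10)
sse_2d = np.sum(sse_2d, axis = -1)                   # sum over x -> one score per run
```
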
def make_inputs(self, split = None, histogram_name = None):
"""
"""
@@ -118,27 +184,29 @@ def make_inputs(self, split = None):
elif split == "test":
cut = self.df.train_label == 1
else:
cut = self.df.train_label >= 0
cut = self.df.run_number >= 0 # dummy all True cut

df = self.df[cut]

for histogram, info in self.histograms.items():
if histogram_name is not None: # self.mode == "individual", i.e. separate autoencoder for each histogram
if not histogram == histogram_name: # only grab the relevant histogram for this autoencoder
continue

data = tf.convert_to_tensor(df[histogram])
inputs["input_" + info["name"]] = data
outputs["output_" + info["name"]] = data



return inputs, outputs


class AutoEncoder_DNN(keras.models.Model):
class AutoEncoder_DNN():
"""
Model defined through the Keras Model Subclassing API: https://www.tensorflow.org/guide/keras/custom_layers_and_models
An AutoEncoder instance owns a single AutoEncoder_DNN, which is the actual implementation of the DNN.
An AutoEncoder instance owns one or more AutoEncoder_DNN objects, which hold the actual implementation of the DNN.
"""
def __init__(self, histograms, **kwargs):
super(AutoEncoder_DNN, self).__init__()

self.n_histograms = len(histograms.keys())

self.__dict__.update(kwargs)
@@ -179,7 +247,11 @@ def __init__(self, histograms, **kwargs):


def model(self):
model = keras.models.Model(inputs = self.inputs, outputs = self.outputs)
model = keras.models.Model(
inputs = self.inputs,
outputs = self.outputs,
name = "autoencoder"
)
model.summary()
return model

@@ -209,7 +281,12 @@ def build_encoder(self, histogram, info):
activation = "relu",
name = name
)(layer)

if self.batch_norm:
layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer) # apply to the running tensor
if self.dropout > 0:
layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)


encoder = keras.layers.Flatten()(layer)
return input, encoder

@@ -229,10 +306,14 @@ def build_decoder(self, histogram, info, input):
activation = "relu"
n_filters = 1
name = "output_%s" % (info["name"])
batch_norm = False
dropout = 0
else:
activation = "relu"
n_filters = self.n_filters
name = "decoder_%d_%s" % (i, info["name"])
batch_norm = self.batch_norm
dropout = self.dropout

if info["n_dim"] == 1:
layer = keras.layers.Conv1DTranspose(
Expand All @@ -252,6 +333,10 @@ def build_decoder(self, histogram, info, input):
activation = activation,
name = name
)(layer)
if batch_norm:
layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer) # apply to the running tensor
if dropout > 0:
layer = keras.layers.Dropout(dropout, name = name + "_dropout")(layer)

output = layer
return output
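To make the encoder/decoder pattern above concrete, here is a self-contained toy version of the same Conv/ConvTranspose layout for a single 1d histogram (layer counts, shapes, and the latent size are illustrative, not the repo's exact architecture):

```
from tensorflow import keras

n_bins = 50                                                   # toy 1d histogram length
inp = keras.Input(shape = (n_bins, 1), name = "input_toy")

# encoder: stacked 1d convolutions, then flatten into a small latent space
x = keras.layers.Conv1D(12, 3, activation = "relu")(inp)      # -> (48, 12)
x = keras.layers.Conv1D(12, 3, activation = "relu")(x)        # -> (46, 12)
latent = keras.layers.Dense(3, activation = "relu")(keras.layers.Flatten()(x))

# decoder: project back up and mirror the encoder with transposed convolutions
x = keras.layers.Dense(46 * 12, activation = "relu")(latent)
x = keras.layers.Reshape((46, 12))(x)
x = keras.layers.Conv1DTranspose(12, 3, activation = "relu")(x)              # -> (48, 12)
out = keras.layers.Conv1DTranspose(1, 3, activation = "relu",
                                   name = "output_toy")(x)                   # -> (50, 1)

model = keras.Model(inputs = inp, outputs = out, name = "toy_autoencoder")
model.compile(optimizer = keras.optimizers.Adam(), loss = keras.losses.MeanSquaredError())
model.summary()
```
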
3 changes: 2 additions & 1 deletion autodqm_ml/algorithms/pca.py
@@ -64,6 +64,8 @@ def save_model(self, pca, model_file):
:param model_file: folder name to place trained PCA pickles
:type model_file: str
"""
logger.debug("[PCA : save_model] Saving trained PCA to file '%s'." % (model_file))

os.system("mkdir -p %s" % self.output_dir)
pcaParams = {
'name' : model_file.split("/")[-1].replace(".json", ""),
@@ -141,7 +143,6 @@ def train(self):
pca.fit(input)
self.model[histogram] = pca

logger.debug("[PCA : train] Saving trained PCA to file '%s'." % (model_file))
self.save_model(pca, model_file)

