From 1c9aabf0ab4c992c331ab15929fc7b5125b375ff Mon Sep 17 00:00:00 2001 From: jocain Date: Fri, 12 Aug 2022 04:58:58 -0700 Subject: [PATCH 1/2] Autoencoder/General Training Features + Specialized Mirror AE --- autodqm_ml/algorithms/autoencoder.py | 240 +++++++++++++++-------- autodqm_ml/algorithms/pca.py | 44 +++-- autodqm_ml/evaluation/roc_tools.py | 2 +- autodqm_ml/plotting/plot_tools.py | 276 +++++++++++++++++++++++++-- scripts/assess.py | 112 ++++++++--- scripts/train.py | 21 +- 6 files changed, 559 insertions(+), 136 deletions(-) diff --git a/autodqm_ml/algorithms/autoencoder.py b/autodqm_ml/algorithms/autoencoder.py index b4226e6..5fd3004 100644 --- a/autodqm_ml/algorithms/autoencoder.py +++ b/autodqm_ml/algorithms/autoencoder.py @@ -11,26 +11,46 @@ import tensorflow as tf import tensorflow.keras as keras -from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm -from autodqm_ml import utils +from datetime import datetime +from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm +from autodqm_ml import utils +from autodqm_ml.constants import kANOMALOUS, kGOOD +from autodqm_ml.plotting.plot_tools import make_training_plots DEFAULT_OPT = { "batch_size" : 128, "val_batch_size" : 1024, "learning_rate" : 0.001, "n_epochs" : 1000, + "loss": "mse", "early_stopping" : True, "early_stopping_rounds" : 3, - "n_hidden_layers" : 2, + "n_conv_layers": 2, + "n_hidden_layers" : 1, "n_nodes" : 50, "n_components" : 3, "kernel_1d" : 3, "kernel_2d" : 3, "strides_1d" : 1, "strides_2d" : 1, + "n_filters" : 12, + "encoder_conv_padding" : "valid", + "pooling" : False, + "pooling_kernel_1d" : 2, + "pooling_kernel_2d" : 2, + "pooling_stride_1d" : None, + "pooling_stride_2d" : None, + "decoder_conv_layers" : True, "dropout" : 0.0, "batch_norm" : False, - "n_filters" : 12 + "retain_best":False, + "lr_plateau_decay": False, + "lr_plateau_decay_rounds": 5, + "lr_plateau_decay_factor": 1, + "lr_plateau_threshold": 0, + "overwrite":False, + "train_highest_only":False, + "low_stat_threshold": 10000 } class AutoEncoder(MLAlgorithm): @@ -49,7 +69,6 @@ def __init__(self, **kwargs): original = DEFAULT_OPT, new = kwargs.get('config', {}) ) - self.mode = kwargs.get('autoencoder_mode', 'individual') if self.mode is None: self.mode = "individual" @@ -98,11 +117,14 @@ def train(self): model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag) if os.path.exists(model_file): - logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file)) self.models[histogram] = self.load_model(model_file) - return - - + if self.config['overwrite']: + logger.warning("[AutoEncoder : train] Overwrite has been turned on and a trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file and continue to train it." % (self.tag, model_file)) + model = self.models[histogram] + else: + logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." 
% (self.tag, model_file)) + continue + inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram) inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram) @@ -117,18 +139,24 @@ def train(self): elif self.mode == "individual": histograms = { histogram : self.histograms[histogram] } - model = AutoEncoder_DNN(histograms, **self.config).model() - + if not os.path.exists(model_file): + model = AutoEncoder_DNN(histograms, **self.config).model() + if self.config["overwrite"]: + logger.warning("[AutoEncoder : train] Overwrite has been turned on but no existing model was found with tag '%s' at file '%s'. We will create and train a new model." % (self.tag, model_file)) + model.summary() model.compile( optimizer = keras.optimizers.Adam(learning_rate = self.config["learning_rate"]), - loss = keras.losses.MeanSquaredError() + loss = self.config['loss'], + metrics = ["mse"] ) callbacks = [] if self.config["early_stopping"]: - callbacks.append(keras.callbacks.EarlyStopping(patience = self.config["early_stopping_rounds"])) + callbacks.append(keras.callbacks.EarlyStopping(monitor = 'val_mse', patience = self.config["early_stopping_rounds"], restore_best_weights = self.config['retain_best'])) + if self.config["lr_plateau_decay"]: + callbacks.append(keras.callbacks.ReduceLROnPlateau(patience = self.config["lr_plateau_decay_rounds"], factor = self.config["lr_plateau_decay_factor"], min_delta = self.config["lr_plateau_threshold"])) - model.fit( + history = model.fit( inputs, outputs, validation_data = (inputs_val, outputs_val), @@ -136,7 +164,18 @@ def train(self): epochs = self.config["n_epochs"], batch_size = self.config["batch_size"] ) - + if self.stats_output_dir: + if not os.path.isdir(self.stats_output_dir): + os.mkdir(self.stats_output_dir) + history = pandas.DataFrame(history.history) + runlabel = 'run_' + datetime.now().strftime('%H%M%S%m%d') + logger.info("[AutoEncoder : train] Saving training statistics in '%s'." % (self.stats_output_dir)) + history.to_csv(self.stats_output_dir + runlabel + '_history.csv') + history.to_parquet(self.stats_output_dir + runlabel + '_history.parquet') + make_training_plots(history, histogram, self.stats_output_dir + runlabel + '_plots.png') + + + self.save_model(model, model_file) self.models[histogram] = model @@ -144,8 +183,8 @@ def train(self): def predict(self, batch_size = 1024): for histogram, model in self.models.items(): inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram) + predictions = model.predict(inputs, batch_size = batch_size) - if self.mode == "simultaneous" and self.n_histograms >= 2: predictions = { name : pred for name, pred in zip(model.output_names, predictions) } else: @@ -154,12 +193,10 @@ def predict(self, batch_size = 1024): for name, pred in predictions.items(): hist_name = self.histogram_name_map[name.replace("output_", "")] # shape [n_runs, histogram dimensions, 1] original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions] - reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions] awkward.from_numpy(pred), axis = -1 ) - sse = awkward.sum( # perform sum along inner-most axis, i.e. 
first histogram dimension
                (original_hist - reconstructed_hist) ** 2,
                axis = -1
@@ -179,20 +216,39 @@ def make_inputs(self, split = None, histogram_name = None):
         inputs = {}
         outputs = {}
 
-        if split == "train":
-            cut = self.df.train_label == 0
-        elif split == "test":
-            cut = self.df.train_label == 1
-        else:
-            cut = self.df.run_number >= 0 # dummy all True cut
-
-        df = self.df[cut]
-
         for histogram, info in self.histograms.items():
             if histogram_name is not None: # self.mode == "individual", i.e. separate autoencoder for each histogram
                 if not histogram == histogram_name: # only grab the relevant histogram for this autoencoder
                     continue
-
+            if 'CSC' in histogram_name:
+                label_field = 'CSC_label'
+            elif 'emtf' in histogram_name:
+                label_field = 'EMTF_label'
+            else:
+                label_field = None
+
+            if False: # disabled for now: label_field and len(numpy.unique(self.df[label_field])) > 1 -- don't include anomalous runs in training
+                if split == "train":
+                    message = ('[AutoEncoder : train] Histogram %s is labeled. %i/%i anomalous runs have been removed from the train set.' % (histogram_name, numpy.sum([self.df.train_label[i] == 0 and self.df[label_field][i] == kANOMALOUS for i in range(len(self.df))]), numpy.sum([self.df.train_label[i] == 0 for i in range(len(self.df))])))
+                    cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+                elif split == "test":
+                    message = ('[AutoEncoder : train] Histogram %s is labeled. %i/%i anomalous runs have been removed from the test set.' % (histogram_name, numpy.sum([self.df.train_label[i] == 1 and self.df[label_field][i] == kANOMALOUS for i in range(len(self.df))]), numpy.sum([self.df.train_label[i] == 1 for i in range(len(self.df))])))
+                    cut = [self.df.train_label[i] == 1 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+                elif split == "all":
+                    message = ("Data for histogram %s is labeled; however, 'all' was selected, so all runs will be used in partitioning." % (histogram_name))
+                    cut = self.df.run_number >= 0
+                else:
+                    cut = self.df[label_field] == kGOOD
+            else:
+                logger.debug("[AutoEncoder : train] Histogram %s has no labels, so all runs will be used in splitting." % (histogram_name))
+                if split == "train":
+                    cut = self.df.train_label == 0
+                elif split == "test":
+                    cut = self.df.train_label == 1
+                else:
+                    cut = self.df.run_number >= 0 # dummy all True cut
+
+            df = self.df[cut]
             data = tf.convert_to_tensor(df[histogram])
             inputs["input_" + info["name"]] = data
             outputs["output_" + info["name"]] = data
@@ -201,7 +257,6 @@ def make_inputs(self, split = None, histogram_name = None):
 
         return inputs, outputs
 
-
 class AutoEncoder_DNN():
     """
     An AutoEncoder instance owns AutoEncoder_DNN(s), which is the actual implementation of the DNN.
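
Note: the label-aware split introduced above is currently disabled in autoencoder.py (the `if False:` guard) but active in pca.py and mirrorAE.py later in this patch. As a standalone sketch, the cut logic reduces to the helper below; `label_aware_cut` is a hypothetical name for illustration, not part of the diff, and `df` is assumed to carry the `train_label`, `run_number`, and per-subsystem label columns used elsewhere in this patch.

    import numpy

    from autodqm_ml.constants import kGOOD

    def label_aware_cut(df, split, label_field = None):
        # Hypothetical helper (illustration only), mirroring the cut logic above:
        # if a label column exists and actually varies, drop labeled-anomalous runs
        # from the train/test splits; otherwise fall back to the plain train/test cut.
        if label_field is not None and len(numpy.unique(df[label_field])) > 1:
            if split == "train":
                return [df.train_label[i] == 0 and df[label_field][i] == kGOOD for i in range(len(df))]
            elif split == "test":
                return [df.train_label[i] == 1 and df[label_field][i] == kGOOD for i in range(len(df))]
            return df.run_number >= 0 # "all": keep every run
        if split == "train":
            return df.train_label == 0
        elif split == "test":
            return df.train_label == 1
        return df.run_number >= 0 # dummy all True cut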
@@ -252,7 +307,7 @@ def model(self): outputs = self.outputs, name = "autoencoder" ) - model.summary() + #model.summary() return model @@ -263,28 +318,41 @@ def build_encoder(self, histogram, info): ) layer = input - for i in range(self.n_hidden_layers): + for i in range(self.n_conv_layers): name = "encoder_%d_%s" % (i, info["name"]) if info["n_dim"] == 1: layer = keras.layers.Conv1D( filters = self.n_filters, kernel_size = self.kernel_1d, - strides = 1, + strides = self.strides_1d, activation = "relu", - name = name + name = name, + padding = self.encoder_conv_padding )(layer) elif info["n_dim"] == 2: layer = keras.layers.Conv2D( filters = self.n_filters, kernel_size = self.kernel_2d, - strides = 1, + strides = self.strides_2d, + padding = self.encoder_conv_padding, activation = "relu", name = name )(layer) + if self.pooling: + if info["n_dim"] == 1: + layer = keras.layers.MaxPooling1D( + pool_size = self.pooling_kernel_1d, + strides = self.pooling_stride_1d, + )(layer) + elif info["n_dim"] == 2: + layer = keras.layers.MaxPooling2D( + pool_size = (self.pooling_kernel_2d, self.pooling_kernel_2d), + strides = self.pooling_stride_2d, + )(layer) if self.batch_norm: - layer = keras.layers.BatchNormalization(name = name + "_batch_norm") + layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer) if self.dropout > 0: - layer = keras.layers.Dropout(self.dropout, name = name + "_dropout") + layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer) encoder = keras.layers.Flatten()(layer) @@ -292,51 +360,67 @@ def build_encoder(self, histogram, info): def build_decoder(self, histogram, info, input): - n_output_units = info["n_bins"] * self.n_filters - layer = keras.layers.Dense( + if self.decoder_conv_layers: + n_output_units = info["n_bins"] * self.n_filters + layer = keras.layers.Dense( units = n_output_units, activation = "relu", name = "decoder_input_%s" % info["name"] - )(input) - target_shape = info["shape"] + (self.n_filters,) - layer = keras.layers.Reshape(target_shape = target_shape)(layer) - - for i in range(self.n_hidden_layers): - if i == (self.n_hidden_layers - 1): - activation = "relu" - n_filters = 1 - name = "output_%s" % (info["name"]) - batch_norm = False - dropout = 0 - else: - activation = "relu" - n_filters = self.n_filters - name = "decoder_%d_%s" % (i, info["name"]) - batch_norm = self.batch_norm - dropout = self.dropout - - if info["n_dim"] == 1: - layer = keras.layers.Conv1DTranspose( - filters = n_filters, - kernel_size = self.kernel_1d, - strides = 1, - padding = "same", - activation = activation, - name = name - )(layer) - elif info["n_dim"] == 2: - layer = keras.layers.Conv2DTranspose( - filters = n_filters, - kernel_size = self.kernel_2d, - strides = 1, - padding = "same", - activation = activation, - name = name - )(layer) - if batch_norm: - layer = keras.layers.BatchNormalization(name = name + "_batch_norm") - if dropout > 0: - layer = keras.layers.Dropout(self.dropout, name = name + "_dropout") + )(input) + target_shape = info["shape"] + (self.n_filters,) + layer = keras.layers.Reshape(target_shape = target_shape)(layer) + + for i in range(self.n_conv_layers): + if i == (self.n_conv_layers - 1): + activation = "relu" + n_filters = 1 + name = "output_%s" % (info["name"]) + batch_norm = False + dropout = 0 + else: + activation = "relu" + n_filters = self.n_filters + name = "decoder_%d_%s" % (i, info["name"]) + batch_norm = self.batch_norm + dropout = self.dropout + + if info["n_dim"] == 1: + layer = keras.layers.Conv1DTranspose( + 
filters = n_filters,
+                    kernel_size = self.kernel_1d,
+                    strides = self.strides_1d,
+                    padding = "same",
+                    activation = activation,
+                    name = name
+                )(layer)
+            elif info["n_dim"] == 2:
+                layer = keras.layers.Conv2DTranspose(
+                    filters = n_filters,
+                    kernel_size = self.kernel_2d,
+                    strides = self.strides_2d,
+                    padding = "same",
+                    activation = activation,
+                    name = name
+                )(layer)
+            if batch_norm:
+                layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
+            if dropout > 0:
+                layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)
+        else:
+            n_output_units = info["n_bins"]
+            layer = keras.layers.Dense(
+                units = n_output_units,
+                activation = "relu"
+            )(input)
+            target_shape = info["shape"] + (1,)
+
+            layer = keras.layers.Reshape(
+                target_shape = target_shape,
+                name = "output_%s" % (info["name"])
+            )(layer)
 
         output = layer
         return output
+
+def mse_cutoff(y_true, y_pred):
+    # custom loss: MSE capped at 5e-6, so the loss (and hence the gradient) saturates above the cutoff
+    mse = keras.losses.MeanSquaredError()
+    return tf.math.minimum(mse(y_true, y_pred), 5e-6)
diff --git a/autodqm_ml/algorithms/pca.py b/autodqm_ml/algorithms/pca.py
index cbc60eb..64aab08 100644
--- a/autodqm_ml/algorithms/pca.py
+++ b/autodqm_ml/algorithms/pca.py
@@ -15,6 +15,7 @@
 from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
 from autodqm_ml.data_formats.histogram import Histogram
 from autodqm_ml.plotting.plot_tools import plot1D, plotMSESummary
+from autodqm_ml.constants import kGOOD, kANOMALOUS
 
 DEFAULT_OPT = {
     "n_components" : 2
@@ -27,8 +28,11 @@ class PCA(MLAlgorithm):
     def __init__(self, **kwargs):
         super(PCA, self).__init__(**kwargs)
 
+
         if not hasattr(self, "n_components"):
             self.n_components = DEFAULT_OPT["n_components"]
+
+        self.hist_shape = None
 
     def load_model(self, model_file):
         """
@@ -95,17 +99,35 @@ def get_histogram(self, histogram, split = "all"):
         :return: a 1d histogram (flattened if originally a 2d histogram)
         :rtype: awkward.Array
         """
-
-        if split == "train":
-            runs = self.df[self.df.train_label == 0]
-        elif split == "test":
-            runs = self.df[self.df.train_label == 1]
-        elif split == "all":
-            runs = self.df
-
-        h = runs[histogram]
+        if 'CSC' in histogram:
+            label_field = 'CSC_label'
+        elif 'emtf' in histogram:
+            label_field = 'EMTF_label'
+        else:
+            label_field = None
+
+        if label_field and len(numpy.unique(self.df[label_field])) > 1: # don't include anomalous runs in training
+            if split == "train":
+                cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+            elif split == "test":
+                cut = [self.df.train_label[i] == 1 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+            elif split == "all":
+                cut = self.df.run_number >= 0
+            else:
+                cut = self.df[label_field] == kGOOD
+        else:
+            if split == "train":
+                cut = self.df.train_label == 0
+            elif split == "test":
+                cut = self.df.train_label == 1
+            else:
+                cut = self.df.run_number >= 0 # dummy all True cut
+
+        df = self.df[cut]
+        h = df[histogram]
 
         n_dim = len(awkward.to_numpy(h[0]).shape)
+        self.hist_shape = awkward.to_numpy(h).shape
         if n_dim == 2:
             h = awkward.flatten(h, axis = 2)
@@ -165,6 +187,6 @@ def predict(self):
             (original_hist - reconstructed_hist) ** 2,
             axis = -1
         )
-
-        self.add_prediction(histogram, sse, reconstructed_hist)
+
+        self.add_prediction(histogram, sse, numpy.array(reconstructed_hist).reshape(self.hist_shape))
 
diff --git a/autodqm_ml/evaluation/roc_tools.py b/autodqm_ml/evaluation/roc_tools.py
index 729e3f8..b23ae91 100644
--- a/autodqm_ml/evaluation/roc_tools.py
+++ b/autodqm_ml/evaluation/roc_tools.py
@@ -49,7 +49,7 @@ def bootstrap_indices(x):
return numpy.random.randint(0, len(x), len(x)) -def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 100, interp = 10000): +def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 1000, interp = 10000): """ Calculates tpr and fpr arrays (with uncertainty for tpr) and auc and uncertainty Keyword arguments: diff --git a/autodqm_ml/plotting/plot_tools.py b/autodqm_ml/plotting/plot_tools.py index ee81c7f..23c863a 100644 --- a/autodqm_ml/plotting/plot_tools.py +++ b/autodqm_ml/plotting/plot_tools.py @@ -2,8 +2,15 @@ import numpy as np from pathlib import Path import awkward +import os -from yahist import Hist1D +import pandas as pd + +from yahist import Hist1D, Hist2D + +from matplotlib import colors + +from datetime import datetime import logging logger = logging.getLogger(__name__) @@ -43,22 +50,28 @@ def make_sse_plot(name, recos, save_name, **kwargs): plt.clf() -def make_original_vs_reconstructed_plot(name, original, recos, run, save_name, **kwargs): +def make_original_vs_reconstructed_plot(name, original, recos, run, save_name, hist_layout, **kwargs): n_dim = len(np.array(original).shape) if n_dim == 1: make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name, **kwargs) elif n_dim == 2: - original_flat = awkward.flatten(original, axis = -1) - recos_flat = {} - for algorithm, reco in recos.items(): - recos_flat[algorithm] = { - "reco" : awkward.flatten(reco["reco"], axis = -1), - "score" : reco["score"] - } - - make_original_vs_reconstructed_plot1d(name, original_flat, recos_flat, run, save_name, **kwargs) + if hist_layout == 'flatten': + original_flat = awkward.flatten(original, axis = -1) + recos_flat = {} + for algorithm, reco in recos.items(): + recos_flat[algorithm] = { + "reco" : awkward.flatten(reco["reco"], axis = -1), + "score" : reco["score"] + } + make_original_vs_reconstructed_plot1d(name, original_flat, recos_flat, run, save_name, **kwargs) + elif hist_layout == '2d': + make_original_vs_reconstructed_plot2d(name, original, recos, run, save_name, **kwargs) + else: + message = "[plot_tools.py : make_original_vs_reconstructed_plot] Please specify a valid histogram layout option: flatten (default), 2d" + logger.exception(message) + raise RuntimeError() else: message = "[plot_tools.py : make_original_vs_reconstructed_plot] Plotting not implemented for histograms with dimension %d." 
% (n_dim)
@@ -76,6 +89,7 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
     h_orig = Hist1D(original, bins = bins, label = "original")
     h_orig._counts = original
     h_reco = []
+
     for reco, info in recos.items():
         h = Hist1D(info["reco"], bins = bins, label = "%s [sse : %.2E]" % (reco, info["score"]))
         h._counts = info["reco"]
@@ -109,7 +123,68 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
     plt.savefig(save_name.replace(".pdf", ".png"))
     plt.clf()
 
+def make_original_vs_reconstructed_plot2d(name, original, recos, run, save_name, **kwargs):
+    x_label = name + " (a.u.)"
+    y_label = "Fraction of events"
+    extent = (0, 1, 0, 1)
+    color_map = plt.cm.Purples
+    rat_lim = kwargs.get("rat_lim", [0.0, 2.0])
+    log_y = kwargs.get("log_y", False)
+    h_reco = []
+    labels = []
+    base_vmax = awkward.max(original)
+    base_vmin = awkward.min(original)
+    ratio_vmax = -10
+    ratio_vmin = 10
+    ratios = []
+    for reco, info in recos.items():
+        h_reco.append(info["reco"])
+        ratio = np.abs(info["reco"] - original)
+        ratios.append(ratio)
+        base_vmax = np.max((base_vmax, awkward.max(info["reco"])))
+        base_vmin = np.min((base_vmin, awkward.min(info["reco"])))
+        ratio_vmax = np.max((ratio_vmax, awkward.max(ratio)))
+        ratio_vmin = np.min((ratio_vmin, awkward.min(ratio)))
+        labels.append("%s [sse : %.2E]" % (reco, info["score"]))
+
+    fig, axes = plt.subplots(2, len(h_reco) + 1, figsize=(5 + len(h_reco)*5, 6), gridspec_kw=dict(height_ratios=[3, 1]), sharey = True, sharex=True)
+
+    if log_y:
+        base_norm = colors.LogNorm(base_vmin, base_vmax)
+        ratio_norm = colors.LogNorm(ratio_vmin, ratio_vmax)
+    else:
+        base_norm = colors.Normalize(base_vmin, base_vmax)
+        ratio_norm = colors.Normalize(ratio_vmin, ratio_vmax)
+    #plt.grid()
+    axes[0][0].imshow(original, norm=base_norm, cmap=color_map, extent = extent, aspect = 'auto')
+    axes[0][0].set_title("Original")
+    #plt.colorbar(mesh, ax = axes[0][0])
+    axes[0][0].grid()
+    for idx, h in enumerate(h_reco):
+        if idx == len(h_reco) - 1:
+            pos = axes[0][idx+1].imshow(h, norm=base_norm, cmap=color_map, extent = extent, aspect = 'auto')
+            cax = axes[0][idx+1].inset_axes([1.1, 0, 0.1, 1])
+            plt.colorbar(pos, cax = cax, ax = axes[0][idx+1])
+            pos = axes[1][idx+1].imshow(ratios[idx], norm=ratio_norm, cmap=color_map, extent = extent, aspect = 'auto')
+            cax = axes[1][idx+1].inset_axes([1.1, 0, 0.1, 1])
+            plt.colorbar(pos, cax = cax, ax = axes[1][idx+1])
+        else:
+            pos = axes[0][idx+1].imshow(h, norm=base_norm, cmap=color_map, extent = extent, aspect = 'auto')
+            pos = axes[1][idx+1].imshow(ratios[idx], norm=ratio_norm, cmap=color_map, extent = extent, aspect = 'auto')
+        axes[0][idx+1].set_title(labels[idx])
+        axes[0][idx+1].grid()
+        axes[1][idx+1].grid()
+    axes[1][0].remove()
+    axes[0][0].set_ylabel(y_label)
+    axes[1][1].set_ylabel("|ML Reco - Original|")
+    axes[0][0].set_xlabel(x_label)
+
+    logger.debug("[plot_tools.py : make_original_vs_reconstructed_plot2d] Writing plot to file '%s'." % (save_name))
+    plt.savefig(save_name, bbox_inches='tight')
+    plt.savefig(save_name.replace(".pdf", ".png"), bbox_inches='tight')
+    plt.clf()
+
 def plot1D(original_hist, reconstructed_hist, run, hist_path, algo, threshold):
     """
     plots given original and reconstructed histogram. Will plot the MSE plot if the SSE is over the threshold.
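
Note: a minimal usage sketch of the new hist_layout dispatch above. The histogram name, shapes, and scores are dummies; the recos structure mirrors what scripts/assess.py builds further down in this patch.

    import numpy as np

    # Dummy 2d histogram and reconstructions, in the {algorithm : {"reco", "score"}} layout assess.py passes in.
    original = np.random.rand(20, 30)
    recos = {
        "autoencoder" : { "reco" : np.random.rand(20, 30), "score" : 1.2e-4 },
        "pca" : { "reco" : np.random.rand(20, 30), "score" : 3.4e-4 },
    }

    make_original_vs_reconstructed_plot(
        "emtfTrackOccupancy", original, recos, run = 355555,
        save_name = "plots/emtfTrackOccupancy_Run355555.pdf",
        hist_layout = "2d" # "flatten" (default) keeps the old flattened 1d overlay
    )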
@@ -255,3 +330,182 @@ def plot_roc_curve(h_name, results, save_name, **kwargs):
     plt.savefig(save_name.replace(".pdf", ".png"))
     plt.clf()
 
+def plot_rescaled_score_hist(data, hist, savename):
+
+    fig, axes = plt.subplots(len(data['score']), 1, figsize = (12, 4*len(data['score'])))
+    if len(data['score']) == 1:
+        axes = [axes]
+    for i in range(len(data['score'])):
+        score = data['score'][i]
+        score_min = awkward.min(score)
+        score_max = awkward.max(score) - score_min
+        score = score - awkward.min(score)
+        score = score/awkward.max(score)
+        axes[i].hist(score, bins = np.logspace(np.log10(1e-4),np.log10(1.0), 100), color = 'tab:blue', alpha = 0.8, label = 'All Runs')
+        axes[i].set_ylabel(data['algo'][i])
+        axes[i].set_yscale('log')
+        axes[i].set_xscale('log')
+        if 'bad' in data:
+            badax = axes[i].twinx()
+            bad = data['bad'][i]
+            bad = bad - score_min
+            bad = bad/score_max
+            badax.hist(bad, bins = np.logspace(np.log10(1e-4),np.log10(1.0), 100), range = (0, 1), color = 'tab:orange', alpha = 0.8, label ='Bad Runs')
+            axes[i].spines['left'].set_color('tab:blue')
+            badax.xaxis.label.set_color('tab:orange')
+            axes[i].xaxis.label.set_color('tab:blue')
+            if i == 0:
+                badax.set_ylabel('Anomalous Runs')
+            badax.spines['right'].set_color('tab:orange')
+            badax.set_xscale('log')
+    fig.suptitle(hist)
+    axes[0].legend()
+    axes[0].set_title('Min-Max Scaled Anomaly Scores')
+    fig.savefig(savename, bbox_inches = 'tight')
+    fig.savefig(savename.replace('.png', '.pdf'), bbox_inches = 'tight')
+
+def make_training_plots(history, hist, save_file):
+    epochs = range(len(history['loss']))
+    fig, axes = plt.subplots(1, len(history.columns), figsize = (len(history.columns)*9, 9))
+    if len(history.columns) == 1:
+        axes = [axes]
+    i = 0
+    fig.suptitle(hist, fontsize = 22)
+    for stat, y in history.items():
+        axes[i].plot(epochs, y)
+        axes[i].set_xlabel('Epoch', fontsize = 15)
+        axes[i].set_title(stat, fontsize = 18)
+        axes[i].set_yscale('log')
+        i += 1
+    plt.savefig(save_file, bbox_inches = 'tight')
+
+def multi_exp_plots(paths, xlabel, x, title, legend = None, logx = False, logy = False):
+    fig, axes = plt.subplots(1, 4, figsize = (36, 9))
+    fig.suptitle(title, fontsize = 22)
+    if not legend:
+        legend = [None]*len(x)
+    if type(paths) == list:
+        for i in range(len(paths)):
+            make_one_var_exp_plots(paths[i], xlabel, x, axes, legend[i], logx)
+        if paths[0][len(paths[0]) - 1] == '/':
+            savepath = paths[0][:len(paths[0]) - 1]
+        else:
+            savepath = paths[0]
+        filename = title.replace(' ', '_')
+        for c in '()[]{}/.,:;?!@#$^&*':
+            filename = filename.replace(c, '')
+        savepath = savepath[:str.rindex(savepath, '/')] + '/' + filename + '_plots.png'
+    else:
+        make_one_var_exp_plots(paths, xlabel, x, axes, legend, logx)
+        savepath = paths + 'plots.png'
+    logger.debug("[plot_tools.py : multi_exp_plots] Writing plot to file '%s'." % (savepath))
+    plt.savefig(savepath, bbox_inches = 'tight')
+
+def make_one_var_exp_plots(path, xlabel, x, axes, label = None, logx = False):
+    data = {'Epochs Trained':[], 'Epochs Trained Std':[],
+            'Best Train Loss':[], 'Best Train Loss Std':[],
+            'Best Validation Loss':[], 'Best Validation Loss Std':[],
+            'Ending Learning Rate':[], 'Ending Learning Rate Std':[]}
+    levels = [dir for dir in os.listdir(path) if not '.png' in dir]
+    if not label:
+        if path[len(path) - 1] == '/':
+            label = path[:len(path) - 1]
+            label = label[label.rindex('/') + 1:]
+        else:
+            label = path[path.rindex('/') + 1:]
+    for level in levels:
+        dirs = os.listdir(path+level)
+        files = [file for file in dirs if '.csv' in file]
+        subset = {'Epochs Trained':[], 'Best Train Loss':[], 'Best Validation Loss':[],'Ending Learning Rate':[]}
+        for file in files:
+            filepath = path + level + '/' + file
+            df = pd.read_csv(filepath)
+            subset['Epochs Trained'].append(len(df['loss']))
+            subset['Best Train Loss'].append(np.min(df['loss']))
+            subset['Best Validation Loss'].append(np.min(df['val_loss']))
+            subset['Ending Learning Rate'].append(df['lr'].iloc[len(df['loss']) - 1])
+        for item, values in subset.items():
+            data[item].append(np.mean(values))
+            data[item + ' Std'].append(np.std(values))
+    i = 0
+    for item in data:
+        if not 'Std' in item:
+            axes[i].errorbar(x, data[item], data[item + ' Std'], label = label)
+            axes[i].set_title(item, fontsize = 18)
+            if logx:
+                axes[i].set_xscale('log')
+            if i == 3:
+                axes[i].set_yscale('log')
+            axes[i].set_xlabel(xlabel, fontsize = 15)
+            i += 1
+
+
+def multi_exp_bar_plots(paths, xlabel, title, legend = None):
+    b = 9
+    s = b/(len(xlabel) + .5)
+    fig, axes = plt.subplots(1, 5, figsize = (45, 9))
+    fig.suptitle(title, fontsize = 22)
+    if not legend:
+        legend = [None]*len(paths)
+    if type(paths) == list:
+        for i in range(len(paths)):
+            make_one_var_exp_bar_plots(paths[i], xlabel, axes, i, b, s, len(paths), legend[i])
+        if paths[0][len(paths[0]) - 1] == '/':
+            savepath = paths[0][:len(paths[0]) - 1]
+        else:
+            savepath = paths[0]
+        filename = title.replace(' ', '_')
+        for c in '()[]{}/.,:;?!@#$^&*':
+            filename = filename.replace(c, '')
+        savepath = savepath[:str.rindex(savepath, '/')] + '/' + filename + '_plots.png'
+    else:
+        make_one_var_exp_bar_plots(paths, xlabel, axes, 0, b, s, 1, legend[0])
+        savepath = paths + 'plots.png'
+    logger.debug("[plot_tools.py : multi_exp_bar_plots] Writing plot to file '%s'." % (savepath))
+    plt.savefig(savepath, bbox_inches = 'tight')
+
+def make_one_var_exp_bar_plots(path, xlabel, axes, i, b, s, n, label = None):
+    data = {'Epochs Trained':[], 'Epochs Trained Std':[],
+            'Best Train Loss':[], 'Best Train Loss Std':[],
+            'Best Validation Loss':[], 'Best Validation Loss Std':[],
+            'Ending Learning Rate':[], 'Ending Learning Rate Std':[]}
+    levels = [dir for dir in os.listdir(path) if not '.png' in dir and not 'assess' in dir]
+    if not label:
+        if path[len(path) - 1] == '/':
+            label = path[:len(path) - 1]
+            label = label[label.rindex('/') + 1:]
+        else:
+            label = path[path.rindex('/') + 1:]
+    for level in levels:
+        dirs = os.listdir(path+level)
+        files = [file for file in dirs if '.csv' in file]
+        subset = {'Epochs Trained':[], 'Best Train Loss':[], 'Best Validation Loss':[],'Ending Learning Rate':[]}
+        for file in files:
+            filepath = path + level + '/' + file
+            df = pd.read_csv(filepath)
+            subset['Epochs Trained'].append(len(df['loss']))
+            subset['Best Train Loss'].append(np.min(df['loss']))
+            subset['Best Validation Loss'].append(np.min(df['val_loss']))
+            subset['Ending Learning Rate'].append(df['lr'].iloc[len(df['loss']) - 1])
+            if 'mse' in df.columns:
+                if 'Best MSE' not in data:
+                    data['Best MSE'] = []
+                    data['Best MSE Std'] = []
+                if 'Best MSE' not in subset:
+                    subset['Best MSE'] = []
+                subset['Best MSE'].append(np.min(df['mse']))
+        for item, values in subset.items():
+            data[item].append(np.mean(values))
+            data[item + ' Std'].append(np.std(values))
+    m = 0
+    for item in data:
+        if not 'Std' in item:
+            x = [s + 2*s*i + j*b for j in range(len(xlabel))]
+            axes[m].bar(x, data[item], yerr = data[item + ' Std'], width = (b-.5)/n, label = label)
+            axes[m].set_title(item, fontsize = 18)
+            if 'Loss' in item:
+                axes[m].set_yscale('log')
+            axes[m].set_xticks(x, xlabel, fontsize = 15)
+            m += 1
diff --git a/scripts/assess.py b/scripts/assess.py
index 8ad7c36..79efdd5 100644
--- a/scripts/assess.py
+++ b/scripts/assess.py
@@ -4,9 +4,11 @@
 import awkward
 import numpy
 
+import pandas
+
 from autodqm_ml.utils import setup_logger
 from autodqm_ml.utils import expand_path
-from autodqm_ml.plotting.plot_tools import make_original_vs_reconstructed_plot, make_sse_plot, plot_roc_curve
+from autodqm_ml.plotting.plot_tools import make_original_vs_reconstructed_plot, make_sse_plot, plot_roc_curve, plot_rescaled_score_hist
 from autodqm_ml.evaluation.roc_tools import calc_roc_and_unc, print_eff_table
 from autodqm_ml.constants import kANOMALOUS, kGOOD
 
@@ -60,6 +62,12 @@ def parse_arguments():
         action="store_true",
         help="make a nicely browsable web page"
     )
+    parser.add_argument(
+        "--hist_layout",
+        type = str,
+        required = False,
+        default = 'flatten',
+        help = "layout for plotting 2d histograms: 'flatten' (default) or '2d'"
+    )
     parser.add_argument(
         "--debug",
         help = "run logger in DEBUG mode (INFO is default)",
@@ -100,8 +108,15 @@ def main(args):
     logger_mode = "DEBUG" if args.debug else "INFO"
     log_file = "%s/fetch_data_log_%s.txt" % (args.output_dir, "assess")
     logger = setup_logger(logger_mode, log_file)
-
-
+
+    stats = {
+        'hist': [],
+        'dim_0': [],
+        'dim_1': [],
+        'algo': [],
+        'avg_an_score': [],
+        'std_an_score': []
+    }
     histograms = { x : {"algorithms" : {}} for x in args.histograms.split(",") }
 
     runs = awkward.from_parquet(args.input_file)
@@ -121,6 +136,7 @@ def main(args):
     # Print out runs with N highest sse scores for each histogram
     N = 5
     for h, info in histograms.items():
+        score_hist_data = {'algo':[], 'score':[], 'bad':[]} # track scores for this histogram
         for algorithm, algorithm_info in info["algorithms"].items():
             runs_sorted = runs[awkward.argsort(runs[algorithm_info["score"]], ascending=False)]
             logger.info("[assess.py] For histogram '%s', algorithm '%s', the mean +/- std anomaly score is: %.2e +/- %.2e." % (h, algorithm, awkward.mean(runs[algorithm_info["score"]]), awkward.std(runs[algorithm_info["score"]])))
@@ -128,18 +144,37 @@ def main(args):
             logger.info("\t The runs with the highest anomaly scores are:")
             for i in range(N):
                 logger.info("\t Run number : %d, Anomaly Score : %.2e" % (runs_sorted.run_number[i], runs_sorted[algorithm_info["score"]][i]))
-
+            stats['hist'].append(h)
+            stats['algo'].append(algorithm)
+            stats['avg_an_score'].append(awkward.mean(runs[algorithm_info["score"]]))
+            stats['std_an_score'].append(awkward.std(runs[algorithm_info["score"]]))
+            if 'CSC' in h:
+                label_field = 'CSC_label'
+            elif 'emtf' in h:
+                label_field = 'EMTF_label'
+            else:
+                label_field = 'label'
+            if len(numpy.unique(runs[label_field])) > 1:
+                score_hist_data['algo'].append(algorithm)
+                score_hist_data['score'].append(runs[algorithm_info["score"]][runs[label_field] == 0])
+                score_hist_data['bad'].append(runs[algorithm_info["score"]][runs[label_field] == 1])
+        if not os.path.isdir(args.output_dir + "/" + h.replace("/", "").replace(" ", "") + "/"):
+            os.mkdir(args.output_dir + "/" + h.replace("/", "").replace(" ", "") + "/")
+        if len(score_hist_data['algo']) != 0:
+            plot_rescaled_score_hist(score_hist_data, h, args.output_dir + "/" + h.replace("/", "").replace(" ", "") + "/" + "score_hist.png")
     # Histogram of sse for algorithms
     splits = {
         "train_label" : [("train", 0), ("test", 1)],
         "label" : [("anomalous", kANOMALOUS), ("good", kGOOD)]
     }
-
+
     for h, info in histograms.items():
         for split, split_info in splits.items():
             recos_by_label = { k : {} for k,v in info["algorithms"].items() }
             for name, id in split_info:
-                runs_set = runs[runs[split] == id]
+                if 'CSC' in h:
+                    label_field = 'CSC_label'
+                elif 'emtf' in h:
+                    label_field = 'EMTF_label'
+                else:
+                    label_field = 'label'
+                if split == "label":
+                    runs_set = runs[runs[label_field] == id]
+                else:
+                    runs_set = runs[runs[split] == id] # train/test split still cuts on train_label
                 if len(runs_set) == 0:
logger.warning("[assess.py] For histogram '%s', no runs belong to the set '%s', skipping making a histogram of SSE for this." % (h, name)) continue @@ -149,36 +184,43 @@ def main(args): recos_by_label[algorithm][name] = { "score" : runs_set[algorithm_info["score"]] } h_name = h.replace("/", "").replace(" ", "") - save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (split, name) + save_name = args.output_dir + "/" + h_name + "/sse_%s_%s.pdf" % (split, name) make_sse_plot(h_name, recos, save_name) for algorithm, recos_alg in recos_by_label.items(): if not recos_alg: continue - save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (algorithm, split) + save_name = args.output_dir + "/" + h_name + "/sse_%s_%s.pdf" % (algorithm, split) make_sse_plot(h_name, recos_alg, save_name) - - + # ROC curves (if there are labeled runs) - has_labeled_runs = True - labeled_runs_cut = runs.run_number < 0 # dummy all False cut - for name, id in splits["label"]: - cut = runs.label == id - labeled_runs_cut = labeled_runs_cut | cut - runs_set = runs[cut] - has_labeled_runs = has_labeled_runs and (len(runs_set) > 0) - - if has_labeled_runs: - labeled_runs = runs[labeled_runs_cut] - roc_results = {} - for h, info in histograms.items(): + has_labeled_runs = {h:True for h in histograms} + labeled_runs_cut = {h:runs.run_number < 0 for h in histograms} + for h, info in histograms.items(): + if 'CSC' in h: + label_field = 'CSC_label' + elif 'emtf' in h: + label_field = 'EMTF_label' + for name, id in splits["label"]: + cut = runs[label_field] == id + labeled_runs_cut[h] = labeled_runs_cut[h] | cut + runs_set = runs[cut] + has_labeled_runs[h] = has_labeled_runs[h] and (len(runs_set) > 0) + roc_results = {} + for h, info in histograms.items(): + if has_labeled_runs[h]: + labeled_runs = runs[labeled_runs_cut[h]] roc_results[h] = {} + if 'CSC' in h: + label_field = 'CSC_label' + elif 'emtf' in h: + label_field = 'EMTF_label' for algorithm, algorithm_info in info["algorithms"].items(): pred = labeled_runs[algorithm_info["score"]] - roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs.label, pred) + roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs[label_field], pred) h_name = h.replace("/", "").replace(" ", "") - save_name = args.output_dir + "/" + h_name + "_roc.pdf" + save_name = args.output_dir + "/" + h_name + "/roc.pdf" plot_roc_curve(h_name, roc_results[h], save_name) plot_roc_curve(h_name, roc_results[h], save_name.replace(".pdf", "_log.pdf"), log = True) print_eff_table(h_name, roc_results[h]) @@ -197,9 +239,10 @@ def main(args): for run in selected_runs: selected_runs_idx = selected_runs_idx | (runs.run_number == run) logger.debug("[assess.py] Will make plots for the %d specified runs: %s" % (len(selected_runs), str(selected_runs))) - + runs_trim = runs[selected_runs_idx] for h, info in histograms.items(): + stats_checked = False for i in range(len(runs_trim)): run = runs_trim[i] run_number = run.run_number @@ -209,12 +252,25 @@ def main(args): if algorithm_info["reco"] is None: continue recos[algorithm] = { "reco" : run[algorithm_info["reco"]], "score" : run[algorithm_info["score"]]} + + if not stats_checked: + if run[algorithm_info["reco"]].ndim > 1: + stats['dim_0'].append(len(run[algorithm_info["reco"]])) + stats['dim_1'].append(len(run[algorithm_info["reco"]][0])) + else: + stats['dim_0'].append(len(run[algorithm_info["reco"]])) + stats['dim_1'].append(numpy.nan) + stats_checked = True h_name = h.replace("/", "").replace(" ", "") - save_name = args.output_dir + "/" + h_name + 
"_Run%d.pdf" % run_number - make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name) + save_name = args.output_dir + "/" + h_name + "/Run%d.pdf" % run_number + make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name, hist_layout = args.hist_layout) logger.info("[assess.py] Plots written to directory '%s'." % (args.output_dir)) - + stat_parquet_dir = args.output_dir + "/assessment_stats.parquet" + stat_csv_dir = args.output_dir + "/assessment_stats.csv" + pandas.DataFrame(stats).to_csv(stat_csv_dir) + pandas.DataFrame(stats).to_parquet(stat_parquet_dir) + logger.info("[assess.py] Assessment statistics written to '%s' and '%s'" % (stat_parquet_dir, stat_csv_dir)) if args.make_webpage: os.system("cp web/index.php %s" % args.output_dir) os.system("chmod 755 %s" % args.output_dir) diff --git a/scripts/train.py b/scripts/train.py index b2d8dfb..b5d81a1 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -7,6 +7,7 @@ from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm from autodqm_ml.algorithms.pca import PCA from autodqm_ml.algorithms.autoencoder import AutoEncoder +from autodqm_ml.algorithms.mirrorAE import MirrorAE from autodqm_ml.utils import expand_path parser = argparse.ArgumentParser() @@ -51,10 +52,9 @@ ) parser.add_argument( "--train_highest_only", - help = "If True, only trains on the runs with the highest stats, or the highest number of entries. The test set becomes the remaining runs.", - type = bool, - required = False, - default = False + help = "If set, only trains on the runs with the highest stats, or the highest number of entries. The test set becomes the remaining runs.", + action = "store_true", + required = False ) @@ -72,7 +72,13 @@ # required = False, # default = 0.5, #) - +parser.add_argument( + "--stats_output_dir", + help = "Makes a series of plots and a table containing data taken during training progression, and saves them in the provided output directory. If not provided, will not save train stats.", + required = False, + type = str, + default = None +) parser.add_argument( "--reference", help = "reference run number to use for comparisons with StatisticalTester", @@ -127,7 +133,7 @@ config = vars(args) config["name"] = args.algorithm.lower() -if not config["name"] in ["autoencoder", "pca", "statistical_tester"]: +if not config["name"] in ["autoencoder", "pca", "statistical_tester", "mirrorae"]: message = "[train.py] Requested algorithm '%s' is not in the supported list of algorithms ['autoencoder', 'pca']." % (config["name"]) logger.exception(message) raise RuntimeError() @@ -138,7 +144,8 @@ algorithm = AutoEncoder(**config) elif config["name"] == "statistical_tester": algorithm = StatisticalTester(**config) - +elif config["name"] == "mirrorae": + algorithm = MirrorAE(**config) if args.input_file is None and "input_file" not in config.keys(): message = "[train.py] An input file for training the ML algorithm was not supplied through CLI nor found in the json config file for the algorithm." 
     logger.exception(message)

From 5d70e342ba35a70965b37cd8672a5d56f5aaa43d Mon Sep 17 00:00:00 2001
From: jocain
Date: Fri, 12 Aug 2022 05:09:14 -0700
Subject: [PATCH 2/2] Autoencoder/General Training Features + Specialized Mirror AE

---
 autodqm_ml/algorithms/mirrorAE.py | 520 ++++++++++++++++++++++++++++++
 1 file changed, 520 insertions(+)
 create mode 100644 autodqm_ml/algorithms/mirrorAE.py

diff --git a/autodqm_ml/algorithms/mirrorAE.py b/autodqm_ml/algorithms/mirrorAE.py
new file mode 100644
index 0000000..a2adfa2
--- /dev/null
+++ b/autodqm_ml/algorithms/mirrorAE.py
@@ -0,0 +1,520 @@
+import os
+import pandas
+import numpy
+import json
+import awkward
+import copy
+
+import logging
+logger = logging.getLogger(__name__)
+
+import tensorflow as tf
+import tensorflow.keras as keras
+
+from datetime import datetime
+from autodqm_ml.constants import kGOOD, kANOMALOUS
+from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
+from autodqm_ml import utils
+from autodqm_ml.plotting.plot_tools import make_training_plots
+
+DEFAULT_OPT = {
+    "batch_size" : 128,
+    "val_batch_size" : 1024,
+    "learning_rate" : 0.001,
+    "loss" : "mse",
+    "n_epochs" : 1000,
+    "early_stopping" : True,
+    "early_stopping_rounds" : 3,
+    "n_conv_layers" : 1,
+    "tied_dense" : True,
+    "n_hidden_layers" : 1,
+    "n_nodes" : 15,
+    "n_components" : 3,
+    "kernel_1d" : 3,
+    "kernel_2d" : 3,
+    "strides_1d" : 1,
+    "strides_2d" : 1,
+    "pooling" : False,
+    "pooling_kernel_1d" : 2,
+    "pooling_kernel_2d" : 2,
+    "pooling_stride_1d" : None,
+    "pooling_stride_2d" : None,
+    "decoder_conv_layers" : True,
+    "dropout" : 0.0,
+    "batch_norm" : False,
+    "n_filters" : 12,
+    "retain_best" : False,
+    "lr_plateau_decay" : False,
+    "lr_plateau_decay_rounds" : 5,
+    "lr_plateau_decay_factor" : 0.1,
+    "lr_plateau_threshold" : 0,
+    "overwrite" : False
+}
+
+class MirrorAE(MLAlgorithm):
+    """
+    Mirrored autoencoder: an autoencoder whose decoder mirrors the encoder architecture, with the dense decoder layers' weights tied to (the transposes of) the weights of the corresponding encoder layers via the DenseTied layer below.
+
+    :param config: dictionary with hyperparameters for autoencoder training. Any hyperparameters not specified will be taken from the default values in `DEFAULT_OPT`
+    :type config: dict
+    :param mode: string to specify whether you want to train an autoencoder for each histogram ("individual") or a single autoencoder on all histograms ("simultaneous")
+    :type mode: str
+    """
+    def __init__(self, **kwargs):
+        super(MirrorAE, self).__init__(**kwargs)
+
+        self.config = utils.update_dict(
+            original = DEFAULT_OPT,
+            new = kwargs.get('config', {})
+        )
+
+        self.mode = kwargs.get('autoencoder_mode', 'individual')
+        if self.mode is None:
+            self.mode = "individual"
+
+        if not self.mode in ["individual", "simultaneous"]:
+            logger.exception("[AutoEncoder : __init__] mode '%s' is not a recognized option for AutoEncoder. Currently available modes are 'individual' (default) and 'simultaneous'." % (self.mode))
+            raise ValueError()
+        self.models = {}
+
+        logger.debug("[AutoEncoder : __init__] Constructing AutoEncoder with the following training options and hyperparameters:")
+        for param, value in self.config.items():
+            logger.debug("\t %s : %s" % (param, str(value)))
+
+
+    def load_model(self, model_file):
+        """
+
+        """
+        custom_layers = {'DenseTied': DenseTied}
+        with keras.utils.custom_object_scope(custom_layers):
+            model = keras.models.load_model(model_file)
+        return model
+
+
+    def save_model(self, model, model_file):
+        """
+
+        """
+        logger.debug("[AutoEncoder : save_model] Saving trained autoencoder to file '%s'."
% (model_file)) + model.save(model_file) + + + def train(self): + """ + + """ + if self.mode == "simultaneous": + self.models = { None : None } + logger.debug("[AutoEncoder : train] Mode selected as 'simultaneous', meaning a single autoencoder will be trained simultaneously on all histograms. Use 'individual' if you wish to train one autoencoder for each histogram.") + elif self.mode == "individual": + self.models = { k : None for k,v in self.histograms.items() } #copy.deepcopy(self.histograms) + logger.debug("[AutoEncoder : train] Mode selected as 'individual', meaning one autoencoder will be trained for each histogram. Use 'simultaneous' if you wish to train a single autoencoder for all histograms.") + + for histogram, histogram_info in self.models.items(): + if histogram is None: + model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag) + else: + model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag) + + if os.path.exists(model_file): + self.models[histogram] = self.load_model(model_file) + if self.config['overwrite']: + logger.warning("[AutoEncoder : train] Overwrite has been turned on and a trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file and continue to train it." % (self.tag, model_file)) + model = self.models[histogram] + else: + logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file)) + continue + + inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram) + inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram) + + if histogram is None: + hist_name = str(list(self.models.keys())) + else: + hist_name = histogram + logger.debug("[AutoEncoder : train] Training autoencoder with %d dimensions in latent space for histogram(s) '%s' with %d training examples." % (self.config["n_components"], hist_name, len(list(inputs.values())[0]))) + + if self.mode == "simultaneous": + histograms = self.histograms + elif self.mode == "individual": + histograms = { histogram : self.histograms[histogram] } + + if not os.path.exists(model_file): + model = AutoEncoder_DNN(histograms, **self.config).model() + if self.config["overwrite"]: + logger.warning("[AutoEncoder : train] Overwrite has been turned on but no existing model was found with tag '%s' at file '%s'. We will create and train a new model." 
% (self.tag, model_file)) + model.summary() + model.compile( + optimizer = keras.optimizers.Adam(learning_rate = self.config["learning_rate"]), + loss = self.config["loss"], + metrics = ['mse'] + ) + + callbacks = [] + if self.config["early_stopping"]: + callbacks.append(keras.callbacks.EarlyStopping(monitor = 'val_mse', patience = self.config["early_stopping_rounds"], restore_best_weights = self.config['retain_best'])) + if self.config["lr_plateau_decay"]: + callbacks.append(keras.callbacks.ReduceLROnPlateau(patience = self.config["lr_plateau_decay_rounds"], factor = self.config["lr_plateau_decay_factor"], min_delta = self.config["lr_plateau_threshold"])) + + history = model.fit( + inputs, + outputs, + validation_data = (inputs_val, outputs_val), + callbacks = callbacks, + epochs = self.config["n_epochs"], + batch_size = self.config["batch_size"] + ) + if self.stats_output_dir: + if not os.path.isdir(self.stats_output_dir): + os.mkdir(self.stats_output_dir) + history = pandas.DataFrame(history.history) + runlabel = 'run_' + datetime.now().strftime('%H%M%S%m%d') + logger.info("[AutoEncoder : train] Saving training statistics in '%s'." % (self.stats_output_dir)) + history.to_csv(self.stats_output_dir + runlabel + '_history.csv') + history.to_parquet(self.stats_output_dir + runlabel + '_history.parquet') + make_training_plots(history, histogram, self.stats_output_dir + runlabel + '_plots.png') + + + + self.save_model(model, model_file) + self.models[histogram] = model + + + def predict(self, batch_size = 1024): + for histogram, model in self.models.items(): + inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram) + + predictions = model.predict(inputs, batch_size = batch_size) + if self.mode == "simultaneous" and self.n_histograms >= 2: + predictions = { name : pred for name, pred in zip(model.output_names, predictions) } + else: + predictions = { model.output_names[0] : predictions } + + for name, pred in predictions.items(): + hist_name = self.histogram_name_map[name.replace("output_", "")] # shape [n_runs, histogram dimensions, 1] + original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions] + + reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions] + awkward.from_numpy(pred), + axis = -1 + ) + #reconstructed_hist = pred + sse = awkward.sum( # perform sum along inner-most axis, i.e. first histogram dimension + (original_hist - reconstructed_hist) ** 2, + axis = -1 + ) + + # For 2d histograms, we need to sum over one more axis to get a single SSE score for each run + if self.histograms[hist_name]["n_dim"] == 2: + sse = awkward.sum(sse, axis = -1) # second histogram dimension + + self.add_prediction(hist_name, sse, reconstructed_hist) + + def make_inputs(self, split = None, histogram_name = None): + """ + + """ + inputs = {} + outputs = {} + + for histogram, info in self.histograms.items(): + if histogram_name is not None: # self.mode == "individual", i.e. 
separate autoencoder for each histogram
+                if not histogram == histogram_name: # only grab the relevant histogram for this autoencoder
+                    continue
+            if 'CSC' in histogram_name:
+                label_field = 'CSC_label'
+            elif 'emtf' in histogram_name:
+                label_field = 'EMTF_label'
+            else:
+                label_field = None
+
+            if label_field and len(numpy.unique(self.df[label_field])) > 1: # don't include anomalous runs in training
+                if split == "train":
+                    cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+                elif split == "test":
+                    cut = [self.df.train_label[i] == 1 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+                elif split == "all":
+                    cut = self.df.run_number >= 0
+                else:
+                    cut = [l == kGOOD for l in self.df[label_field]]
+            else:
+                if split == "train":
+                    cut = self.df.train_label == 0
+                elif split == "test":
+                    cut = self.df.train_label == 1
+                else:
+                    cut = self.df.run_number >= 0 # dummy all True cut
+
+            df = self.df[cut]
+            data = tf.convert_to_tensor(df[histogram])
+            inputs["input_" + info["name"]] = data
+            outputs["output_" + info["name"]] = data
+
+        return inputs, outputs
+
+
+class AutoEncoder_DNN():
+    """
+    An AutoEncoder instance owns AutoEncoder_DNN(s), which is the actual implementation of the DNN.
+    """
+    def __init__(self, histograms, **kwargs):
+        self.n_histograms = len(histograms.keys())
+
+        self.__dict__.update(kwargs)
+        self.inputs = []
+        self.outputs = []
+        self.encoders = []
+        self.decoders = []
+        self.coupled = {}
+        self.conv_sizes = {}
+        for histogram, info in histograms.items():
+            input, encoder = self.build_encoder(histogram, info)
+            self.inputs.append(input)
+            self.encoders.append(encoder)
+            output = self.build_decoder(histogram, info, encoder)
+            self.outputs.append(output)
+
+
+    def model(self):
+        model = keras.models.Model(
+            inputs = self.inputs,
+            outputs = self.outputs,
+            name = "autoencoder"
+        )
+        #model.summary()
+        return model
+
+
+    def build_encoder(self, histogram, info):
+        input = keras.layers.Input(
+            shape = info["shape"] + (1,),
+            name = "input_%s" % info["name"]
+        )
+        self.conv_sizes[histogram] = [tf.shape(input)._inferred_value[1:]]
+        layer = input
+        for i in range(self.n_conv_layers):
+            name = "encoder_%d_%s" % (i, info["name"])
+            if info["n_dim"] == 1:
+                layer = keras.layers.Conv1D(
+                    filters = self.n_filters,
+                    kernel_size = self.kernel_1d,
+                    strides = self.strides_1d,
+                    activation = "relu",
+                    name = name
+                )(layer)
+            elif info["n_dim"] == 2:
+                layer = keras.layers.Conv2D(
+                    filters = self.n_filters,
+                    kernel_size = self.kernel_2d,
+                    use_bias = False,
+                    strides = self.strides_2d,
+                    activation = "relu",
+                    name = name
+                )(layer)
+            self.conv_sizes[histogram].append(tf.shape(layer)._inferred_value[1:])
+            if self.batch_norm:
+                layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
+            if self.dropout > 0:
+                layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)
+
+        if self.n_hidden_layers >= 1:
+            layer = keras.layers.Flatten()(layer)
+            self.coupled[histogram] = []
+
+            for i in range(self.n_hidden_layers-1):
+                dense = keras.layers.Dense(
+                    units = self.n_nodes,
+                    activation = "relu",
+                    use_bias = False,
+                    name = "Encoder_Hidden_%d" % i,
+                )
+                if self.tied_dense:
+                    self.coupled[histogram].append(dense)
+                layer = dense(layer)
+
+            dense = keras.layers.Dense(
+                units = self.n_components,
+                activation = "relu",
+                use_bias = False,
+                name = "Encoder_Output"
+            )
+            if self.tied_dense:
+                self.coupled[histogram].append(dense)
+            encoder = dense(layer)
+
+        else:
+            encoder = layer
+
+        return input, encoder
+
+
+    def build_decoder(self, histogram, info, input):
+        layer = input
+        if not self.tied_dense:
+            self.coupled[histogram] = [None]*self.n_hidden_layers
+        if self.n_hidden_layers >= 1:
+            self.coupled[histogram].reverse()
+            for i in range(len(self.coupled[histogram]) - 1):
+                layer = DenseTied(
+                    units = self.n_nodes,
+                    activation = "relu",
+                    use_bias = False,
+                    name = "Decoder_Hidden_%d" % i,
+                    tied_to = self.coupled[histogram][i]
+                )(layer)
+            # final dense decoder layer expands the latent vector back to the size of the last encoder feature map
+            units = 1
+            for m in self.conv_sizes[histogram][len(self.conv_sizes[histogram]) - 1]:
+                units *= m
+            layer = DenseTied(
+                units = units,
+                activation = "relu",
+                use_bias = False,
+                name = "Decoder_Hidden_%d" % (len(self.coupled[histogram]) - 1),
+                tied_to = self.coupled[histogram][len(self.coupled[histogram]) - 1]
+            )(layer)
+
+        self.conv_sizes[histogram].reverse()
+        if self.n_conv_layers >= 1:
+            name = "decoder_reshape"
+            target_shape = self.conv_sizes[histogram][0]
+        else:
+            target_shape = info["shape"] + (1,)
+            name = "output_%s" % (info["name"])
+        if self.n_hidden_layers >= 1:
+            layer = keras.layers.Reshape(target_shape = target_shape, name = name)(layer)
+        for i in range(self.n_conv_layers):
+            if i == (self.n_conv_layers - 1):
+                activation = "relu"
+                n_filters = 1
+                name = "output_%s" % (info["name"])
+                batch_norm = False
+                dropout = 0
+            else:
+                activation = "relu"
+                n_filters = self.n_filters
+                name = "decoder_%d_%s" % (i, info["name"])
+                batch_norm = self.batch_norm
+                dropout = self.dropout
+            cur_size = tf.shape(layer)._inferred_value[1:]
+            if info["n_dim"] == 1:
+                k = self.kernel_1d
+                s = self.strides_1d
+            elif info["n_dim"] == 2:
+                k = self.kernel_2d
+                s = self.strides_2d
+            # choose output_padding so each transposed conv exactly reproduces the size of the matching encoder feature map
+            l = self.conv_sizes[histogram][i+1]
+            out_pads = [l[j] - (cur_size[j] - 1)*s - k for j in range(info["n_dim"])]
+            if info["n_dim"] == 1:
+                layer = keras.layers.Conv1DTranspose(
+                    filters = n_filters,
+                    kernel_size = self.kernel_1d,
+                    strides = self.strides_1d,
+                    use_bias = False,
+                    output_padding = out_pads,
+                    activation = activation,
+                    name = name
+                )(layer)
+            elif info["n_dim"] == 2:
+                layer = keras.layers.Conv2DTranspose(
+                    filters = n_filters,
+                    kernel_size = self.kernel_2d,
+                    strides = self.strides_2d,
+                    output_padding = out_pads,
+                    activation = activation,
+                    use_bias = False,
+                    name = name
+                )(layer)
+            if batch_norm:
+                layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
+            if dropout > 0:
+                layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)
+
+        output = layer
+        return output
+
+class DenseTied(keras.layers.Layer):
+    """
+    Dense layer whose kernel can be tied to (the transpose of) another Dense layer's kernel.
+    """
+    def __init__(self, units,
+                 activation=None,
+                 use_bias=True,
+                 kernel_initializer='glorot_uniform',
+                 bias_initializer='zeros',
+                 kernel_regularizer=None,
+                 bias_regularizer=None,
+                 activity_regularizer=None,
+                 kernel_constraint=None,
+                 bias_constraint=None,
+                 tied_to=None,
+                 **kwargs):
+        self.tied_to = tied_to
+        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
+            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
+        super().__init__(**kwargs)
+        self.config_terms = {
+            'units':units,
+            'activation':activation,
+            'use_bias':use_bias,
+            'kernel_initializer':kernel_initializer,
+            'bias_initializer':bias_initializer,
+            'kernel_regularizer':kernel_regularizer,
+            'bias_regularizer':bias_regularizer,
+            'activity_regularizer':activity_regularizer,
+            'kernel_constraint':kernel_constraint,
+            'bias_constraint':bias_constraint,
+            'tied_to':tied_to
+        }
+        self.units = units
+        self.activation = keras.activations.get(activation)
+        self.use_bias = use_bias
+        self.kernel_initializer = keras.initializers.get(kernel_initializer)
+        self.bias_initializer = keras.initializers.get(bias_initializer)
+        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
+        self.bias_regularizer = keras.regularizers.get(bias_regularizer)
+        self.activity_regularizer = keras.regularizers.get(activity_regularizer)
+        self.kernel_constraint = keras.constraints.get(kernel_constraint)
+        self.bias_constraint = keras.constraints.get(bias_constraint)
+        self.input_spec = keras.layers.InputSpec(min_ndim=2)
+        self.supports_masking = True
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(self.config_terms)
+        return config
+
+    def build(self, input_shape):
+        assert len(input_shape) >= 2
+        input_dim = input_shape[-1]
+
+        if self.tied_to is not None:
+            # tied case: the kernel is a non-trainable copy of the transposed
+            # kernel of the partner layer, taken once at build time
+            self.kernel = tf.Variable(tf.transpose(self.tied_to.kernel), trainable = False)
+            self._non_trainable_weights.append(self.kernel)
+        else:
+            self.kernel = self.add_weight(shape=(input_dim, self.units),
+                                          initializer=self.kernel_initializer,
+                                          name='kernel',
+                                          regularizer=self.kernel_regularizer,
+                                          constraint=self.kernel_constraint)
+        if self.use_bias:
+            self.bias = self.add_weight(shape=(self.units,),
+                                        initializer=self.bias_initializer,
+                                        name='bias',
+                                        regularizer=self.bias_regularizer,
+                                        constraint=self.bias_constraint)
+        else:
+            self.bias = None
+        self.input_spec = keras.layers.InputSpec(min_ndim=2, axes={-1: input_dim})
+        self.built = True
+
+    def compute_output_shape(self, input_shape):
+        assert input_shape and len(input_shape) >= 2
+        output_shape = list(input_shape)
+        output_shape[-1] = self.units
+        return tuple(output_shape)
+
+    def call(self, inputs):
+        # y = x . kernel (+ bias), where kernel is the tied transpose in the tied case
+        output = tf.tensordot(inputs, self.kernel, axes = 1)
+        if self.use_bias:
+            output = tf.nn.bias_add(output, self.bias, data_format='channels_last')
+        if self.activation is not None:
+            output = self.activation(output)
+        return output
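
Note: a minimal standalone sketch of how DenseTied pairs with an ordinary Dense layer (toy sizes, not part of the patch). As written in build() above, the tied kernel is a non-trainable copy of the transposed partner kernel, taken once at build time.

    import tensorflow as tf
    import tensorflow.keras as keras

    # Toy tied-weight autoencoder: the decoder kernel is the transpose of the encoder kernel.
    inp = keras.layers.Input(shape = (64,))
    enc = keras.layers.Dense(units = 8, activation = "relu", use_bias = False, name = "enc")
    code = enc(inp) # calling the layer builds it, so enc.kernel exists before DenseTied builds
    out = DenseTied(units = 64, activation = "relu", use_bias = False, tied_to = enc, name = "dec")(code)
    toy = keras.models.Model(inp, out)
    toy.compile(optimizer = keras.optimizers.Adam(learning_rate = 0.001), loss = "mse")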