From 1c9aabf0ab4c992c331ab15929fc7b5125b375ff Mon Sep 17 00:00:00 2001 From: jocain Date: Fri, 12 Aug 2022 04:58:58 -0700 Subject: [PATCH 1/2] Autoencoder/General Training Features + Specialized Mirror AE --- autodqm_ml/algorithms/autoencoder.py | 240 +++++++++++++++-------- autodqm_ml/algorithms/pca.py | 44 +++-- autodqm_ml/evaluation/roc_tools.py | 2 +- autodqm_ml/plotting/plot_tools.py | 276 +++++++++++++++++++++++++-- scripts/assess.py | 112 ++++++++--- scripts/train.py | 21 +- 6 files changed, 559 insertions(+), 136 deletions(-) diff --git a/autodqm_ml/algorithms/autoencoder.py b/autodqm_ml/algorithms/autoencoder.py index b4226e6..5fd3004 100644 --- a/autodqm_ml/algorithms/autoencoder.py +++ b/autodqm_ml/algorithms/autoencoder.py @@ -11,26 +11,46 @@ import tensorflow as tf import tensorflow.keras as keras -from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm -from autodqm_ml import utils +from datetime import datetime +from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm +from autodqm_ml import utils +from autodqm_ml.constants import kANOMALOUS, kGOOD +from autodqm_ml.plotting.plot_tools import make_training_plots DEFAULT_OPT = { "batch_size" : 128, "val_batch_size" : 1024, "learning_rate" : 0.001, "n_epochs" : 1000, + "loss": "mse", "early_stopping" : True, "early_stopping_rounds" : 3, - "n_hidden_layers" : 2, + "n_conv_layers": 2, + "n_hidden_layers" : 1, "n_nodes" : 50, "n_components" : 3, "kernel_1d" : 3, "kernel_2d" : 3, "strides_1d" : 1, "strides_2d" : 1, + "n_filters" : 12, + "encoder_conv_padding" : "valid", + "pooling" : False, + "pooling_kernel_1d" : 2, + "pooling_kernel_2d" : 2, + "pooling_stride_1d" : None, + "pooling_stride_2d" : None, + "decoder_conv_layers" : True, "dropout" : 0.0, "batch_norm" : False, - "n_filters" : 12 + "retain_best":False, + "lr_plateau_decay": False, + "lr_plateau_decay_rounds": 5, + "lr_plateau_decay_factor": 1, + "lr_plateau_threshold": 0, + "overwrite":False, + "train_highest_only":False, + "low_stat_threshold": 10000 } class AutoEncoder(MLAlgorithm): @@ -49,7 +69,6 @@ def __init__(self, **kwargs): original = DEFAULT_OPT, new = kwargs.get('config', {}) ) - self.mode = kwargs.get('autoencoder_mode', 'individual') if self.mode is None: self.mode = "individual" @@ -98,11 +117,14 @@ def train(self): model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag) if os.path.exists(model_file): - logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file)) self.models[histogram] = self.load_model(model_file) - return - - + if self.config['overwrite']: + logger.warning("[AutoEncoder : train] Overwrite has been turned on and a trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file and continue to train it." % (self.tag, model_file)) + model = self.models[histogram] + else: + logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." 
% (self.tag, model_file)) + continue + inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram) inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram) @@ -117,18 +139,24 @@ def train(self): elif self.mode == "individual": histograms = { histogram : self.histograms[histogram] } - model = AutoEncoder_DNN(histograms, **self.config).model() - + if not os.path.exists(model_file): + model = AutoEncoder_DNN(histograms, **self.config).model() + if self.config["overwrite"]: + logger.warning("[AutoEncoder : train] Overwrite has been turned on but no existing model was found with tag '%s' at file '%s'. We will create and train a new model." % (self.tag, model_file)) + model.summary() model.compile( optimizer = keras.optimizers.Adam(learning_rate = self.config["learning_rate"]), - loss = keras.losses.MeanSquaredError() + loss = self.config['loss'], + metrics = ["mse"] ) callbacks = [] if self.config["early_stopping"]: - callbacks.append(keras.callbacks.EarlyStopping(patience = self.config["early_stopping_rounds"])) + callbacks.append(keras.callbacks.EarlyStopping(monitor = 'val_mse', patience = self.config["early_stopping_rounds"], restore_best_weights = self.config['retain_best'])) + if self.config["lr_plateau_decay"]: + callbacks.append(keras.callbacks.ReduceLROnPlateau(patience = self.config["lr_plateau_decay_rounds"], factor = self.config["lr_plateau_decay_factor"], min_delta = self.config["lr_plateau_threshold"])) - model.fit( + history = model.fit( inputs, outputs, validation_data = (inputs_val, outputs_val), @@ -136,7 +164,18 @@ def train(self): epochs = self.config["n_epochs"], batch_size = self.config["batch_size"] ) - + if self.stats_output_dir: + if not os.path.isdir(self.stats_output_dir): + os.mkdir(self.stats_output_dir) + history = pandas.DataFrame(history.history) + runlabel = 'run_' + datetime.now().strftime('%H%M%S%m%d') + logger.info("[AutoEncoder : train] Saving training statistics in '%s'." % (self.stats_output_dir)) + history.to_csv(self.stats_output_dir + runlabel + '_history.csv') + history.to_parquet(self.stats_output_dir + runlabel + '_history.parquet') + make_training_plots(history, histogram, self.stats_output_dir + runlabel + '_plots.png') + + + self.save_model(model, model_file) self.models[histogram] = model @@ -144,8 +183,8 @@ def train(self): def predict(self, batch_size = 1024): for histogram, model in self.models.items(): inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram) + predictions = model.predict(inputs, batch_size = batch_size) - if self.mode == "simultaneous" and self.n_histograms >= 2: predictions = { name : pred for name, pred in zip(model.output_names, predictions) } else: @@ -154,12 +193,10 @@ def predict(self, batch_size = 1024): for name, pred in predictions.items(): hist_name = self.histogram_name_map[name.replace("output_", "")] # shape [n_runs, histogram dimensions, 1] original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions] - reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions] awkward.from_numpy(pred), axis = -1 ) - sse = awkward.sum( # perform sum along inner-most axis, i.e. 
first histogram dimension
                (original_hist - reconstructed_hist) ** 2,
                axis = -1
@@ -179,20 +216,39 @@ def make_inputs(self, split = None, histogram_name = None):
         inputs = {}
         outputs = {}
 
-        if split == "train":
-            cut = self.df.train_label == 0
-        elif split == "test":
-            cut = self.df.train_label == 1
-        else:
-            cut = self.df.run_number >= 0 # dummy all True cut
-
-        df = self.df[cut]
-
         for histogram, info in self.histograms.items():
             if histogram_name is not None: # self.mode == "individual", i.e. separate autoencoder for each histogram
                 if not histogram == histogram_name: # only grab the relevant histogram for this autoencoder
                     continue
-
+            if 'CSC' in histogram_name:
+                label_field = 'CSC_label'
+            elif 'emtf' in histogram_name:
+                label_field = 'EMTF_label'
+            else:
+                label_field = None
+
+            if False: # disabled for now: label_field and len(numpy.unique(self.df[label_field])) > 1 -- don't include anomalous runs in training
+                if split == "train":
+                    message = ('[AutoEncoder : train] Histogram %s is labeled. %i/%i anomalous runs have been removed from the train set.' % (histogram_name, numpy.sum([self.df.train_label[i] == 0 and self.df[label_field][i] == kANOMALOUS for i in range(len(self.df))]), numpy.sum([self.df.train_label[i] == 0 for i in range(len(self.df))])))
+                    cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+                elif split == "test":
+                    message = ('[AutoEncoder : train] Histogram %s is labeled. %i/%i anomalous runs have been removed from the test set.' % (histogram_name, numpy.sum([self.df.train_label[i] == 1 and self.df[label_field][i] == kANOMALOUS for i in range(len(self.df))]), numpy.sum([self.df.train_label[i] == 1 for i in range(len(self.df))])))
+                    cut = [self.df.train_label[i] == 1 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+                elif split == "all":
+                    message = ("Data for histogram %s is labeled; however, 'all' was selected, so all runs will be used in partitioning." % (histogram_name))
+                    cut = self.df.run_number >= 0
+                else:
+                    cut = self.df[label_field] == kGOOD
+            else:
+                logger.debug("[AutoEncoder : train] Histogram %s has no labels, so all runs will be used in splitting." % (histogram_name))
+                if split == "train":
+                    cut = self.df.train_label == 0
+                elif split == "test":
+                    cut = self.df.train_label == 1
+                else:
+                    cut = self.df.run_number >= 0 # dummy all True cut
+
+            df = self.df[cut]
             data = tf.convert_to_tensor(df[histogram])
             inputs["input_" + info["name"]] = data
             outputs["output_" + info["name"]] = data
@@ -201,7 +257,6 @@ def make_inputs(self, split = None, histogram_name = None):
 
         return inputs, outputs
 
-
 class AutoEncoder_DNN():
     """
     An AutoEncoder instance owns AutoEncoder_DNN(s), which is the actual implementation of the DNN.
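
Note: the label-aware split introduced above is currently disabled in autoencoder.py (the `if False:` guard) but active in pca.py and mirrorAE.py later in this patch. As a standalone sketch, the cut logic reduces to the helper below; `label_aware_cut` is a hypothetical name for illustration, not part of the diff, and `df` is assumed to carry the `train_label`, `run_number`, and per-subsystem label columns used elsewhere in this patch.

    import numpy

    from autodqm_ml.constants import kGOOD

    def label_aware_cut(df, split, label_field = None):
        # Hypothetical helper (illustration only), mirroring the cut logic above:
        # if a label column exists and actually varies, drop labeled-anomalous runs
        # from the train/test splits; otherwise fall back to the plain train/test cut.
        if label_field is not None and len(numpy.unique(df[label_field])) > 1:
            if split == "train":
                return [df.train_label[i] == 0 and df[label_field][i] == kGOOD for i in range(len(df))]
            elif split == "test":
                return [df.train_label[i] == 1 and df[label_field][i] == kGOOD for i in range(len(df))]
            return df.run_number >= 0 # "all": keep every run
        if split == "train":
            return df.train_label == 0
        elif split == "test":
            return df.train_label == 1
        return df.run_number >= 0 # dummy all True cut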
@@ -252,7 +307,7 @@ def model(self): outputs = self.outputs, name = "autoencoder" ) - model.summary() + #model.summary() return model @@ -263,28 +318,41 @@ def build_encoder(self, histogram, info): ) layer = input - for i in range(self.n_hidden_layers): + for i in range(self.n_conv_layers): name = "encoder_%d_%s" % (i, info["name"]) if info["n_dim"] == 1: layer = keras.layers.Conv1D( filters = self.n_filters, kernel_size = self.kernel_1d, - strides = 1, + strides = self.strides_1d, activation = "relu", - name = name + name = name, + padding = self.encoder_conv_padding )(layer) elif info["n_dim"] == 2: layer = keras.layers.Conv2D( filters = self.n_filters, kernel_size = self.kernel_2d, - strides = 1, + strides = self.strides_2d, + padding = self.encoder_conv_padding, activation = "relu", name = name )(layer) + if self.pooling: + if info["n_dim"] == 1: + layer = keras.layers.MaxPooling1D( + pool_size = self.pooling_kernel_1d, + strides = self.pooling_stride_1d, + )(layer) + elif info["n_dim"] == 2: + layer = keras.layers.MaxPooling2D( + pool_size = (self.pooling_kernel_2d, self.pooling_kernel_2d), + strides = self.pooling_stride_2d, + )(layer) if self.batch_norm: - layer = keras.layers.BatchNormalization(name = name + "_batch_norm") + layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer) if self.dropout > 0: - layer = keras.layers.Dropout(self.dropout, name = name + "_dropout") + layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer) encoder = keras.layers.Flatten()(layer) @@ -292,51 +360,67 @@ def build_encoder(self, histogram, info): def build_decoder(self, histogram, info, input): - n_output_units = info["n_bins"] * self.n_filters - layer = keras.layers.Dense( + if self.decoder_conv_layers: + n_output_units = info["n_bins"] * self.n_filters + layer = keras.layers.Dense( units = n_output_units, activation = "relu", name = "decoder_input_%s" % info["name"] - )(input) - target_shape = info["shape"] + (self.n_filters,) - layer = keras.layers.Reshape(target_shape = target_shape)(layer) - - for i in range(self.n_hidden_layers): - if i == (self.n_hidden_layers - 1): - activation = "relu" - n_filters = 1 - name = "output_%s" % (info["name"]) - batch_norm = False - dropout = 0 - else: - activation = "relu" - n_filters = self.n_filters - name = "decoder_%d_%s" % (i, info["name"]) - batch_norm = self.batch_norm - dropout = self.dropout - - if info["n_dim"] == 1: - layer = keras.layers.Conv1DTranspose( - filters = n_filters, - kernel_size = self.kernel_1d, - strides = 1, - padding = "same", - activation = activation, - name = name - )(layer) - elif info["n_dim"] == 2: - layer = keras.layers.Conv2DTranspose( - filters = n_filters, - kernel_size = self.kernel_2d, - strides = 1, - padding = "same", - activation = activation, - name = name - )(layer) - if batch_norm: - layer = keras.layers.BatchNormalization(name = name + "_batch_norm") - if dropout > 0: - layer = keras.layers.Dropout(self.dropout, name = name + "_dropout") + )(input) + target_shape = info["shape"] + (self.n_filters,) + layer = keras.layers.Reshape(target_shape = target_shape)(layer) + + for i in range(self.n_conv_layers): + if i == (self.n_conv_layers - 1): + activation = "relu" + n_filters = 1 + name = "output_%s" % (info["name"]) + batch_norm = False + dropout = 0 + else: + activation = "relu" + n_filters = self.n_filters + name = "decoder_%d_%s" % (i, info["name"]) + batch_norm = self.batch_norm + dropout = self.dropout + + if info["n_dim"] == 1: + layer = keras.layers.Conv1DTranspose( + 
filters = n_filters,
+                    kernel_size = self.kernel_1d,
+                    strides = self.strides_1d,
+                    padding = "same",
+                    activation = activation,
+                    name = name
+                )(layer)
+            elif info["n_dim"] == 2:
+                layer = keras.layers.Conv2DTranspose(
+                    filters = n_filters,
+                    kernel_size = self.kernel_2d,
+                    strides = self.strides_2d,
+                    padding = "same",
+                    activation = activation,
+                    name = name
+                )(layer)
+            if batch_norm:
+                layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
+            if dropout > 0:
+                layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)
+        else:
+            n_output_units = info["n_bins"]
+            layer = keras.layers.Dense(
+                units = n_output_units,
+                activation = "relu"
+            )(input)
+            target_shape = info["shape"] + (1,)
+
+            layer = keras.layers.Reshape(
+                target_shape = target_shape,
+                name = "output_%s" % (info["name"])
+            )(layer)
 
         output = layer
         return output
+
+def mse_cutoff(y_true, y_pred):
+    # custom loss: MSE capped at 5e-6, so the loss (and hence the gradient) saturates above the cutoff
+    mse = keras.losses.MeanSquaredError()
+    return tf.math.minimum(mse(y_true, y_pred), 5e-6)
diff --git a/autodqm_ml/algorithms/pca.py b/autodqm_ml/algorithms/pca.py
index cbc60eb..64aab08 100644
--- a/autodqm_ml/algorithms/pca.py
+++ b/autodqm_ml/algorithms/pca.py
@@ -15,6 +15,7 @@
 from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
 from autodqm_ml.data_formats.histogram import Histogram
 from autodqm_ml.plotting.plot_tools import plot1D, plotMSESummary
+from autodqm_ml.constants import kGOOD, kANOMALOUS
 
 DEFAULT_OPT = {
     "n_components" : 2
@@ -27,8 +28,11 @@ class PCA(MLAlgorithm):
     def __init__(self, **kwargs):
         super(PCA, self).__init__(**kwargs)
 
+
         if not hasattr(self, "n_components"):
             self.n_components = DEFAULT_OPT["n_components"]
+
+        self.hist_shape = None
 
     def load_model(self, model_file):
         """
@@ -95,17 +99,35 @@ def get_histogram(self, histogram, split = "all"):
         :return: a 1d histogram (flattened if originally a 2d histogram)
         :rtype: awkward.Array
         """
-
-        if split == "train":
-            runs = self.df[self.df.train_label == 0]
-        elif split == "test":
-            runs = self.df[self.df.train_label == 1]
-        elif split == "all":
-            runs = self.df
-
-        h = runs[histogram]
+        if 'CSC' in histogram:
+            label_field = 'CSC_label'
+        elif 'emtf' in histogram:
+            label_field = 'EMTF_label'
+        else:
+            label_field = None
+
+        if label_field and len(numpy.unique(self.df[label_field])) > 1: # don't include anomalous runs in training
+            if split == "train":
+                cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+            elif split == "test":
+                cut = [self.df.train_label[i] == 1 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+            elif split == "all":
+                cut = self.df.run_number >= 0
+            else:
+                cut = self.df[label_field] == kGOOD
+        else:
+            if split == "train":
+                cut = self.df.train_label == 0
+            elif split == "test":
+                cut = self.df.train_label == 1
+            else:
+                cut = self.df.run_number >= 0 # dummy all True cut
+
+        df = self.df[cut]
+        h = df[histogram]
 
         n_dim = len(awkward.to_numpy(h[0]).shape)
+        self.hist_shape = awkward.to_numpy(h).shape
         if n_dim == 2:
             h = awkward.flatten(h, axis = 2)
@@ -165,6 +187,6 @@ def predict(self):
             (original_hist - reconstructed_hist) ** 2,
             axis = -1
         )
-
-        self.add_prediction(histogram, sse, reconstructed_hist)
+
+        self.add_prediction(histogram, sse, numpy.array(reconstructed_hist).reshape(self.hist_shape))
 
diff --git a/autodqm_ml/evaluation/roc_tools.py b/autodqm_ml/evaluation/roc_tools.py
index 729e3f8..b23ae91 100644
--- a/autodqm_ml/evaluation/roc_tools.py
+++ b/autodqm_ml/evaluation/roc_tools.py
@@ -49,7 +49,7 @@ def bootstrap_indices(x):
return numpy.random.randint(0, len(x), len(x)) -def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 100, interp = 10000): +def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 1000, interp = 10000): """ Calculates tpr and fpr arrays (with uncertainty for tpr) and auc and uncertainty Keyword arguments: diff --git a/autodqm_ml/plotting/plot_tools.py b/autodqm_ml/plotting/plot_tools.py index ee81c7f..23c863a 100644 --- a/autodqm_ml/plotting/plot_tools.py +++ b/autodqm_ml/plotting/plot_tools.py @@ -2,8 +2,15 @@ import numpy as np from pathlib import Path import awkward +import os -from yahist import Hist1D +import pandas as pd + +from yahist import Hist1D, Hist2D + +from matplotlib import colors + +from datetime import datetime import logging logger = logging.getLogger(__name__) @@ -43,22 +50,28 @@ def make_sse_plot(name, recos, save_name, **kwargs): plt.clf() -def make_original_vs_reconstructed_plot(name, original, recos, run, save_name, **kwargs): +def make_original_vs_reconstructed_plot(name, original, recos, run, save_name, hist_layout, **kwargs): n_dim = len(np.array(original).shape) if n_dim == 1: make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name, **kwargs) elif n_dim == 2: - original_flat = awkward.flatten(original, axis = -1) - recos_flat = {} - for algorithm, reco in recos.items(): - recos_flat[algorithm] = { - "reco" : awkward.flatten(reco["reco"], axis = -1), - "score" : reco["score"] - } - - make_original_vs_reconstructed_plot1d(name, original_flat, recos_flat, run, save_name, **kwargs) + if hist_layout == 'flatten': + original_flat = awkward.flatten(original, axis = -1) + recos_flat = {} + for algorithm, reco in recos.items(): + recos_flat[algorithm] = { + "reco" : awkward.flatten(reco["reco"], axis = -1), + "score" : reco["score"] + } + make_original_vs_reconstructed_plot1d(name, original_flat, recos_flat, run, save_name, **kwargs) + elif hist_layout == '2d': + make_original_vs_reconstructed_plot2d(name, original, recos, run, save_name, **kwargs) + else: + message = "[plot_tools.py : make_original_vs_reconstructed_plot] Please specify a valid histogram layout option: flatten (default), 2d" + logger.exception(message) + raise RuntimeError() else: message = "[plot_tools.py : make_original_vs_reconstructed_plot] Plotting not implemented for histograms with dimension %d." 
% (n_dim)
@@ -76,6 +89,7 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
     h_orig = Hist1D(original, bins = bins, label = "original")
     h_orig._counts = original
     h_reco = []
+
     for reco, info in recos.items():
         h = Hist1D(info["reco"], bins = bins, label = "%s [sse : %.2E]" % (reco, info["score"]))
         h._counts = info["reco"]
@@ -109,7 +123,68 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
     plt.savefig(save_name.replace(".pdf", ".png"))
     plt.clf()
 
+def make_original_vs_reconstructed_plot2d(name, original, recos, run, save_name, **kwargs):
+    x_label = name + " (a.u.)"
+    y_label = "Fraction of events"
+    extent = (0, 1, 0, 1)
+    color_map = plt.cm.Purples
+    rat_lim = kwargs.get("rat_lim", [0.0, 2.0])
+    log_y = kwargs.get("log_y", False)
+    h_reco = []
+    labels = []
+    base_vmax = awkward.max(original)
+    base_vmin = awkward.min(original)
+    ratio_vmax = -10
+    ratio_vmin = 10
+    ratios = []
+    for reco, info in recos.items():
+        h_reco.append(info["reco"])
+        ratio = np.abs(info["reco"] - original)
+        ratios.append(ratio)
+        base_vmax = np.max((base_vmax, awkward.max(info["reco"])))
+        base_vmin = np.min((base_vmin, awkward.min(info["reco"])))
+        ratio_vmax = np.max((ratio_vmax, awkward.max(ratio)))
+        ratio_vmin = np.min((ratio_vmin, awkward.min(ratio)))
+        labels.append("%s [sse : %.2E]" % (reco, info["score"]))
+
+    fig, axes = plt.subplots(2, len(h_reco) + 1, figsize=(5 + len(h_reco)*5, 6), gridspec_kw=dict(height_ratios=[3, 1]), sharey = True, sharex=True)
+
+    if log_y:
+        base_norm = colors.LogNorm(base_vmin, base_vmax)
+        ratio_norm = colors.LogNorm(ratio_vmin, ratio_vmax)
+    else:
+        base_norm = colors.Normalize(base_vmin, base_vmax)
+        ratio_norm = colors.Normalize(ratio_vmin, ratio_vmax)
+    #plt.grid()
+    axes[0][0].imshow(original, norm=base_norm, cmap=color_map, extent = extent, aspect = 'auto')
+    axes[0][0].set_title("Original")
+    #plt.colorbar(mesh, ax = axes[0][0])
+    axes[0][0].grid()
+    for idx, h in enumerate(h_reco):
+        if idx == len(h_reco) - 1:
+            pos = axes[0][idx+1].imshow(h, norm=base_norm, cmap=color_map, extent = extent, aspect = 'auto')
+            cax = axes[0][idx+1].inset_axes([1.1, 0, 0.1, 1])
+            plt.colorbar(pos, cax = cax, ax = axes[0][idx+1])
+            pos = axes[1][idx+1].imshow(ratios[idx], norm=ratio_norm, cmap=color_map, extent = extent, aspect = 'auto')
+            cax = axes[1][idx+1].inset_axes([1.1, 0, 0.1, 1])
+            plt.colorbar(pos, cax = cax, ax = axes[1][idx+1])
+        else:
+            pos = axes[0][idx+1].imshow(h, norm=base_norm, cmap=color_map, extent = extent, aspect = 'auto')
+            pos = axes[1][idx+1].imshow(ratios[idx], norm=ratio_norm, cmap=color_map, extent = extent, aspect = 'auto')
+        axes[0][idx+1].set_title(labels[idx])
+        axes[0][idx+1].grid()
+        axes[1][idx+1].grid()
+    axes[1][0].remove()
+    axes[0][0].set_ylabel(y_label)
+    axes[1][1].set_ylabel("|ML Reco - Original|")
+    axes[0][0].set_xlabel(x_label)
+
+    logger.debug("[plot_tools.py : make_original_vs_reconstructed_plot2d] Writing plot to file '%s'." % (save_name))
+    plt.savefig(save_name, bbox_inches='tight')
+    plt.savefig(save_name.replace(".pdf", ".png"), bbox_inches='tight')
+    plt.clf()
+
 def plot1D(original_hist, reconstructed_hist, run, hist_path, algo, threshold):
     """
     plots given original and reconstructed histogram. Will plot the MSE plot if the SSE is over the threshold.
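
Note: a minimal usage sketch of the new hist_layout dispatch above. The histogram name, shapes, and scores are dummies; the recos structure mirrors what scripts/assess.py builds further down in this patch.

    import numpy as np

    # Dummy 2d histogram and reconstructions, in the {algorithm : {"reco", "score"}} layout assess.py passes in.
    original = np.random.rand(20, 30)
    recos = {
        "autoencoder" : { "reco" : np.random.rand(20, 30), "score" : 1.2e-4 },
        "pca" : { "reco" : np.random.rand(20, 30), "score" : 3.4e-4 },
    }

    make_original_vs_reconstructed_plot(
        "emtfTrackOccupancy", original, recos, run = 355555,
        save_name = "plots/emtfTrackOccupancy_Run355555.pdf",
        hist_layout = "2d" # "flatten" (default) keeps the old flattened 1d overlay
    )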
@@ -255,3 +330,182 @@ def plot_roc_curve(h_name, results, save_name, **kwargs):
     plt.savefig(save_name.replace(".pdf", ".png"))
     plt.clf()
 
+def plot_rescaled_score_hist(data, hist, savename):
+
+    fig, axes = plt.subplots(len(data['score']), 1, figsize = (12, 4*len(data['score'])))
+    if len(data['score']) == 1:
+        axes = [axes]
+    for i in range(len(data['score'])):
+        score = data['score'][i]
+        score_min = awkward.min(score)
+        score_max = awkward.max(score) - score_min
+        score = score - awkward.min(score)
+        score = score/awkward.max(score)
+        axes[i].hist(score, bins = np.logspace(np.log10(1e-4),np.log10(1.0), 100), color = 'tab:blue', alpha = 0.8, label = 'All Runs')
+        axes[i].set_ylabel(data['algo'][i])
+        axes[i].set_yscale('log')
+        axes[i].set_xscale('log')
+        if 'bad' in data:
+            badax = axes[i].twinx()
+            bad = data['bad'][i]
+            bad = bad - score_min
+            bad = bad/score_max
+            badax.hist(bad, bins = np.logspace(np.log10(1e-4),np.log10(1.0), 100), range = (0, 1), color = 'tab:orange', alpha = 0.8, label ='Bad Runs')
+            axes[i].spines['left'].set_color('tab:blue')
+            badax.xaxis.label.set_color('tab:orange')
+            axes[i].xaxis.label.set_color('tab:blue')
+            if i == 0:
+                badax.set_ylabel('Anomalous Runs')
+            badax.spines['right'].set_color('tab:orange')
+            badax.set_xscale('log')
+    fig.suptitle(hist)
+    axes[0].legend()
+    axes[0].set_title('Min-Max Scaled Anomaly Scores')
+    fig.savefig(savename, bbox_inches = 'tight')
+    fig.savefig(savename.replace('.png', '.pdf'), bbox_inches = 'tight')
+
+def make_training_plots(history, hist, save_file):
+    epochs = range(len(history['loss']))
+    fig, axes = plt.subplots(1, len(history.columns), figsize = (len(history.columns)*9, 9))
+    if len(history.columns) == 1:
+        axes = [axes]
+    i = 0
+    fig.suptitle(hist, fontsize = 22)
+    for stat, y in history.items():
+        axes[i].plot(epochs, y)
+        axes[i].set_xlabel('Epoch', fontsize = 15)
+        axes[i].set_title(stat, fontsize = 18)
+        axes[i].set_yscale('log')
+        i += 1
+    plt.savefig(save_file, bbox_inches = 'tight')
+
+def multi_exp_plots(paths, xlabel, x, title, legend = None, logx = False, logy = False):
+    fig, axes = plt.subplots(1, 4, figsize = (36, 9))
+    fig.suptitle(title, fontsize = 22)
+    if not legend:
+        legend = [None]*len(x)
+    if type(paths) == list:
+        for i in range(len(paths)):
+            make_one_var_exp_plots(paths[i], xlabel, x, axes, legend[i], logx)
+        if paths[0][len(paths[0]) - 1] == '/':
+            savepath = paths[0][:len(paths[0]) - 1]
+        else:
+            savepath = paths[0]
+        filename = title.replace(' ', '_')
+        for c in '()[]{}/.,:;?!@#$^&*':
+            filename = filename.replace(c, '')
+        savepath = savepath[:str.rindex(savepath, '/')] + '/' + filename + '_plots.png'
+    else:
+        make_one_var_exp_plots(paths, xlabel, x, axes, legend, logx)
+        savepath = paths + 'plots.png'
+    logger.debug("[plot_tools.py : multi_exp_plots] Writing plot to file '%s'." % (savepath))
+    plt.savefig(savepath, bbox_inches = 'tight')
+
+def make_one_var_exp_plots(path, xlabel, x, axes, label = None, logx = False):
+    data = {'Epochs Trained':[], 'Epochs Trained Std':[],
+            'Best Train Loss':[], 'Best Train Loss Std':[],
+            'Best Validation Loss':[], 'Best Validation Loss Std':[],
+            'Ending Learning Rate':[], 'Ending Learning Rate Std':[]}
+    levels = [dir for dir in os.listdir(path) if not '.png' in dir]
+    if not label:
+        if path[len(path) - 1] == '/':
+            label = path[:len(path) - 1]
+            label = label[label.rindex('/') + 1:]
+        else:
+            label = path[path.rindex('/') + 1:]
+    for level in levels:
+        dirs = os.listdir(path+level)
+        files = [file for file in dirs if '.csv' in file]
+        subset = {'Epochs Trained':[], 'Best Train Loss':[], 'Best Validation Loss':[],'Ending Learning Rate':[]}
+        for file in files:
+            filepath = path + level + '/' + file
+            df = pd.read_csv(filepath)
+            subset['Epochs Trained'].append(len(df['loss']))
+            subset['Best Train Loss'].append(np.min(df['loss']))
+            subset['Best Validation Loss'].append(np.min(df['val_loss']))
+            subset['Ending Learning Rate'].append(df['lr'].iloc[len(df['loss']) - 1])
+        for item, values in subset.items():
+            data[item].append(np.mean(values))
+            data[item + ' Std'].append(np.std(values))
+    i = 0
+    for item in data:
+        if not 'Std' in item:
+            axes[i].errorbar(x, data[item], data[item + ' Std'], label = label)
+            axes[i].set_title(item, fontsize = 18)
+            if logx:
+                axes[i].set_xscale('log')
+            if i == 3:
+                axes[i].set_yscale('log')
+            axes[i].set_xlabel(xlabel, fontsize = 15)
+            i += 1
+
+
+def multi_exp_bar_plots(paths, xlabel, title, legend = None):
+    b = 9
+    s = b/(len(xlabel) + .5)
+    fig, axes = plt.subplots(1, 5, figsize = (45, 9))
+    fig.suptitle(title, fontsize = 22)
+    if not legend:
+        legend = [None]*len(paths)
+    if type(paths) == list:
+        for i in range(len(paths)):
+            make_one_var_exp_bar_plots(paths[i], xlabel, axes, i, b, s, len(paths), legend[i])
+        if paths[0][len(paths[0]) - 1] == '/':
+            savepath = paths[0][:len(paths[0]) - 1]
+        else:
+            savepath = paths[0]
+        filename = title.replace(' ', '_')
+        for c in '()[]{}/.,:;?!@#$^&*':
+            filename = filename.replace(c, '')
+        savepath = savepath[:str.rindex(savepath, '/')] + '/' + filename + '_plots.png'
+    else:
+        make_one_var_exp_bar_plots(paths, xlabel, axes, 0, b, s, 1, legend[0])
+        savepath = paths + 'plots.png'
+    logger.debug("[plot_tools.py : multi_exp_bar_plots] Writing plot to file '%s'." % (savepath))
+    plt.savefig(savepath, bbox_inches = 'tight')
+
+def make_one_var_exp_bar_plots(path, xlabel, axes, i, b, s, n, label = None):
+    data = {'Epochs Trained':[], 'Epochs Trained Std':[],
+            'Best Train Loss':[], 'Best Train Loss Std':[],
+            'Best Validation Loss':[], 'Best Validation Loss Std':[],
+            'Ending Learning Rate':[], 'Ending Learning Rate Std':[]}
+    levels = [dir for dir in os.listdir(path) if not '.png' in dir and not 'assess' in dir]
+    if not label:
+        if path[len(path) - 1] == '/':
+            label = path[:len(path) - 1]
+            label = label[label.rindex('/') + 1:]
+        else:
+            label = path[path.rindex('/') + 1:]
+    for level in levels:
+        dirs = os.listdir(path+level)
+        files = [file for file in dirs if '.csv' in file]
+        subset = {'Epochs Trained':[], 'Best Train Loss':[], 'Best Validation Loss':[],'Ending Learning Rate':[]}
+        for file in files:
+            filepath = path + level + '/' + file
+            df = pd.read_csv(filepath)
+            subset['Epochs Trained'].append(len(df['loss']))
+            subset['Best Train Loss'].append(np.min(df['loss']))
+            subset['Best Validation Loss'].append(np.min(df['val_loss']))
+            subset['Ending Learning Rate'].append(df['lr'].iloc[len(df['loss']) - 1])
+            if 'mse' in df.columns:
+                if 'Best MSE' not in data:
+                    data['Best MSE'] = []
+                    data['Best MSE Std'] = []
+                if 'Best MSE' not in subset:
+                    subset['Best MSE'] = []
+                subset['Best MSE'].append(np.min(df['mse']))
+        for item, values in subset.items():
+            data[item].append(np.mean(values))
+            data[item + ' Std'].append(np.std(values))
+    m = 0
+    for item in data:
+        if not 'Std' in item:
+            x = [s + 2*s*i + j*b for j in range(len(xlabel))]
+            axes[m].bar(x, data[item], yerr = data[item + ' Std'], width = (b-.5)/n, label = label)
+            axes[m].set_title(item, fontsize = 18)
+            if 'Loss' in item:
+                axes[m].set_yscale('log')
+            axes[m].set_xticks(x, xlabel, fontsize = 15)
+            m += 1
diff --git a/scripts/assess.py b/scripts/assess.py
index 8ad7c36..79efdd5 100644
--- a/scripts/assess.py
+++ b/scripts/assess.py
@@ -4,9 +4,11 @@
 import awkward
 import numpy
 
+import pandas
+
 from autodqm_ml.utils import setup_logger
 from autodqm_ml.utils import expand_path
-from autodqm_ml.plotting.plot_tools import make_original_vs_reconstructed_plot, make_sse_plot, plot_roc_curve
+from autodqm_ml.plotting.plot_tools import make_original_vs_reconstructed_plot, make_sse_plot, plot_roc_curve, plot_rescaled_score_hist
 from autodqm_ml.evaluation.roc_tools import calc_roc_and_unc, print_eff_table
 from autodqm_ml.constants import kANOMALOUS, kGOOD
 
@@ -60,6 +62,12 @@ def parse_arguments():
         action="store_true",
         help="make a nicely browsable web page"
     )
+    parser.add_argument(
+        "--hist_layout",
+        type = str,
+        required = False,
+        default = 'flatten',
+        help = "layout for plotting 2d histograms: 'flatten' (default) or '2d'"
+    )
     parser.add_argument(
         "--debug",
         help = "run logger in DEBUG mode (INFO is default)",
@@ -100,8 +108,15 @@ def main(args):
     logger_mode = "DEBUG" if args.debug else "INFO"
     log_file = "%s/fetch_data_log_%s.txt" % (args.output_dir, "assess")
     logger = setup_logger(logger_mode, log_file)
-
-
+
+    stats = {
+        'hist': [],
+        'dim_0': [],
+        'dim_1': [],
+        'algo': [],
+        'avg_an_score': [],
+        'std_an_score': []
+    }
     histograms = { x : {"algorithms" : {}} for x in args.histograms.split(",") }
 
     runs = awkward.from_parquet(args.input_file)
@@ -121,6 +136,7 @@ def main(args):
     # Print out runs with N highest sse scores for each histogram
     N = 5
     for h, info in histograms.items():
+        score_hist_data = {'algo':[], 'score':[], 'bad':[]} # track scores for this histogram
         for algorithm, algorithm_info in info["algorithms"].items():
             runs_sorted = runs[awkward.argsort(runs[algorithm_info["score"]], ascending=False)]
             logger.info("[assess.py] For histogram '%s', algorithm '%s', the mean +/- std anomaly score is: %.2e +/- %.2e." % (h, algorithm, awkward.mean(runs[algorithm_info["score"]]), awkward.std(runs[algorithm_info["score"]])))
@@ -128,18 +144,37 @@ def main(args):
             logger.info("\t The runs with the highest anomaly scores are:")
             for i in range(N):
                 logger.info("\t Run number : %d, Anomaly Score : %.2e" % (runs_sorted.run_number[i], runs_sorted[algorithm_info["score"]][i]))
-
+            stats['hist'].append(h)
+            stats['algo'].append(algorithm)
+            stats['avg_an_score'].append(awkward.mean(runs[algorithm_info["score"]]))
+            stats['std_an_score'].append(awkward.std(runs[algorithm_info["score"]]))
+            if 'CSC' in h:
+                label_field = 'CSC_label'
+            elif 'emtf' in h:
+                label_field = 'EMTF_label'
+            else:
+                label_field = 'label'
+            if len(numpy.unique(runs[label_field])) > 1:
+                score_hist_data['algo'].append(algorithm)
+                score_hist_data['score'].append(runs[algorithm_info["score"]][runs[label_field] == 0])
+                score_hist_data['bad'].append(runs[algorithm_info["score"]][runs[label_field] == 1])
+        if not os.path.isdir(args.output_dir + "/" + h.replace("/", "").replace(" ", "") + "/"):
+            os.mkdir(args.output_dir + "/" + h.replace("/", "").replace(" ", "") + "/")
+        if len(score_hist_data['algo']) != 0:
+            plot_rescaled_score_hist(score_hist_data, h, args.output_dir + "/" + h.replace("/", "").replace(" ", "") + "/" + "score_hist.png")
     # Histogram of sse for algorithms
     splits = {
         "train_label" : [("train", 0), ("test", 1)],
         "label" : [("anomalous", kANOMALOUS), ("good", kGOOD)]
     }
-
+
     for h, info in histograms.items():
         for split, split_info in splits.items():
             recos_by_label = { k : {} for k,v in info["algorithms"].items() }
             for name, id in split_info:
-                runs_set = runs[runs[split] == id]
+                if 'CSC' in h:
+                    label_field = 'CSC_label'
+                elif 'emtf' in h:
+                    label_field = 'EMTF_label'
+                else:
+                    label_field = 'label'
+                if split == "label":
+                    runs_set = runs[runs[label_field] == id]
+                else:
+                    runs_set = runs[runs[split] == id] # train/test split still cuts on train_label
                 if len(runs_set) == 0:
logger.warning("[assess.py] For histogram '%s', no runs belong to the set '%s', skipping making a histogram of SSE for this." % (h, name)) continue @@ -149,36 +184,43 @@ def main(args): recos_by_label[algorithm][name] = { "score" : runs_set[algorithm_info["score"]] } h_name = h.replace("/", "").replace(" ", "") - save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (split, name) + save_name = args.output_dir + "/" + h_name + "/sse_%s_%s.pdf" % (split, name) make_sse_plot(h_name, recos, save_name) for algorithm, recos_alg in recos_by_label.items(): if not recos_alg: continue - save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (algorithm, split) + save_name = args.output_dir + "/" + h_name + "/sse_%s_%s.pdf" % (algorithm, split) make_sse_plot(h_name, recos_alg, save_name) - - + # ROC curves (if there are labeled runs) - has_labeled_runs = True - labeled_runs_cut = runs.run_number < 0 # dummy all False cut - for name, id in splits["label"]: - cut = runs.label == id - labeled_runs_cut = labeled_runs_cut | cut - runs_set = runs[cut] - has_labeled_runs = has_labeled_runs and (len(runs_set) > 0) - - if has_labeled_runs: - labeled_runs = runs[labeled_runs_cut] - roc_results = {} - for h, info in histograms.items(): + has_labeled_runs = {h:True for h in histograms} + labeled_runs_cut = {h:runs.run_number < 0 for h in histograms} + for h, info in histograms.items(): + if 'CSC' in h: + label_field = 'CSC_label' + elif 'emtf' in h: + label_field = 'EMTF_label' + for name, id in splits["label"]: + cut = runs[label_field] == id + labeled_runs_cut[h] = labeled_runs_cut[h] | cut + runs_set = runs[cut] + has_labeled_runs[h] = has_labeled_runs[h] and (len(runs_set) > 0) + roc_results = {} + for h, info in histograms.items(): + if has_labeled_runs[h]: + labeled_runs = runs[labeled_runs_cut[h]] roc_results[h] = {} + if 'CSC' in h: + label_field = 'CSC_label' + elif 'emtf' in h: + label_field = 'EMTF_label' for algorithm, algorithm_info in info["algorithms"].items(): pred = labeled_runs[algorithm_info["score"]] - roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs.label, pred) + roc_results[h][algorithm] = calc_roc_and_unc(labeled_runs[label_field], pred) h_name = h.replace("/", "").replace(" ", "") - save_name = args.output_dir + "/" + h_name + "_roc.pdf" + save_name = args.output_dir + "/" + h_name + "/roc.pdf" plot_roc_curve(h_name, roc_results[h], save_name) plot_roc_curve(h_name, roc_results[h], save_name.replace(".pdf", "_log.pdf"), log = True) print_eff_table(h_name, roc_results[h]) @@ -197,9 +239,10 @@ def main(args): for run in selected_runs: selected_runs_idx = selected_runs_idx | (runs.run_number == run) logger.debug("[assess.py] Will make plots for the %d specified runs: %s" % (len(selected_runs), str(selected_runs))) - + runs_trim = runs[selected_runs_idx] for h, info in histograms.items(): + stats_checked = False for i in range(len(runs_trim)): run = runs_trim[i] run_number = run.run_number @@ -209,12 +252,25 @@ def main(args): if algorithm_info["reco"] is None: continue recos[algorithm] = { "reco" : run[algorithm_info["reco"]], "score" : run[algorithm_info["score"]]} + + if not stats_checked: + if run[algorithm_info["reco"]].ndim > 1: + stats['dim_0'].append(len(run[algorithm_info["reco"]])) + stats['dim_1'].append(len(run[algorithm_info["reco"]][0])) + else: + stats['dim_0'].append(len(run[algorithm_info["reco"]])) + stats['dim_1'].append(numpy.nan) + stats_checked = True h_name = h.replace("/", "").replace(" ", "") - save_name = args.output_dir + "/" + h_name + 
"_Run%d.pdf" % run_number - make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name) + save_name = args.output_dir + "/" + h_name + "/Run%d.pdf" % run_number + make_original_vs_reconstructed_plot(h_name, original, recos, run_number, save_name, hist_layout = args.hist_layout) logger.info("[assess.py] Plots written to directory '%s'." % (args.output_dir)) - + stat_parquet_dir = args.output_dir + "/assessment_stats.parquet" + stat_csv_dir = args.output_dir + "/assessment_stats.csv" + pandas.DataFrame(stats).to_csv(stat_csv_dir) + pandas.DataFrame(stats).to_parquet(stat_parquet_dir) + logger.info("[assess.py] Assessment statistics written to '%s' and '%s'" % (stat_parquet_dir, stat_csv_dir)) if args.make_webpage: os.system("cp web/index.php %s" % args.output_dir) os.system("chmod 755 %s" % args.output_dir) diff --git a/scripts/train.py b/scripts/train.py index b2d8dfb..b5d81a1 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -7,6 +7,7 @@ from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm from autodqm_ml.algorithms.pca import PCA from autodqm_ml.algorithms.autoencoder import AutoEncoder +from autodqm_ml.algorithms.mirrorAE import MirrorAE from autodqm_ml.utils import expand_path parser = argparse.ArgumentParser() @@ -51,10 +52,9 @@ ) parser.add_argument( "--train_highest_only", - help = "If True, only trains on the runs with the highest stats, or the highest number of entries. The test set becomes the remaining runs.", - type = bool, - required = False, - default = False + help = "If set, only trains on the runs with the highest stats, or the highest number of entries. The test set becomes the remaining runs.", + action = "store_true", + required = False ) @@ -72,7 +72,13 @@ # required = False, # default = 0.5, #) - +parser.add_argument( + "--stats_output_dir", + help = "Makes a series of plots and a table containing data taken during training progression, and saves them in the provided output directory. If not provided, will not save train stats.", + required = False, + type = str, + default = None +) parser.add_argument( "--reference", help = "reference run number to use for comparisons with StatisticalTester", @@ -127,7 +133,7 @@ config = vars(args) config["name"] = args.algorithm.lower() -if not config["name"] in ["autoencoder", "pca", "statistical_tester"]: +if not config["name"] in ["autoencoder", "pca", "statistical_tester", "mirrorae"]: message = "[train.py] Requested algorithm '%s' is not in the supported list of algorithms ['autoencoder', 'pca']." % (config["name"]) logger.exception(message) raise RuntimeError() @@ -138,7 +144,8 @@ algorithm = AutoEncoder(**config) elif config["name"] == "statistical_tester": algorithm = StatisticalTester(**config) - +elif config["name"] == "mirrorae": + algorithm = MirrorAE(**config) if args.input_file is None and "input_file" not in config.keys(): message = "[train.py] An input file for training the ML algorithm was not supplied through CLI nor found in the json config file for the algorithm." 
     logger.exception(message)

From 5d70e342ba35a70965b37cd8672a5d56f5aaa43d Mon Sep 17 00:00:00 2001
From: jocain
Date: Fri, 12 Aug 2022 05:09:14 -0700
Subject: [PATCH 2/2] Autoencoder/General Training Features + Specialized Mirror AE

---
 autodqm_ml/algorithms/mirrorAE.py | 520 ++++++++++++++++++++++++++++++
 1 file changed, 520 insertions(+)
 create mode 100644 autodqm_ml/algorithms/mirrorAE.py

diff --git a/autodqm_ml/algorithms/mirrorAE.py b/autodqm_ml/algorithms/mirrorAE.py
new file mode 100644
index 0000000..a2adfa2
--- /dev/null
+++ b/autodqm_ml/algorithms/mirrorAE.py
@@ -0,0 +1,520 @@
+import os
+import pandas
+import numpy
+import json
+import awkward
+import copy
+
+import logging
+logger = logging.getLogger(__name__)
+
+import tensorflow as tf
+import tensorflow.keras as keras
+
+from datetime import datetime
+from autodqm_ml.constants import kGOOD, kANOMALOUS
+from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
+from autodqm_ml import utils
+from autodqm_ml.plotting.plot_tools import make_training_plots
+
+DEFAULT_OPT = {
+    "batch_size" : 128,
+    "val_batch_size" : 1024,
+    "learning_rate" : 0.001,
+    "loss" : "mse",
+    "n_epochs" : 1000,
+    "early_stopping" : True,
+    "early_stopping_rounds" : 3,
+    "n_conv_layers" : 1,
+    "tied_dense" : True,
+    "n_hidden_layers" : 1,
+    "n_nodes" : 15,
+    "n_components" : 3,
+    "kernel_1d" : 3,
+    "kernel_2d" : 3,
+    "strides_1d" : 1,
+    "strides_2d" : 1,
+    "pooling" : False,
+    "pooling_kernel_1d" : 2,
+    "pooling_kernel_2d" : 2,
+    "pooling_stride_1d" : None,
+    "pooling_stride_2d" : None,
+    "decoder_conv_layers" : True,
+    "dropout" : 0.0,
+    "batch_norm" : False,
+    "n_filters" : 12,
+    "retain_best" : False,
+    "lr_plateau_decay" : False,
+    "lr_plateau_decay_rounds" : 5,
+    "lr_plateau_decay_factor" : 0.1,
+    "lr_plateau_threshold" : 0,
+    "overwrite" : False
+}
+
+class MirrorAE(MLAlgorithm):
+    """
+    Mirrored autoencoder: an autoencoder whose decoder mirrors the encoder architecture, with the dense decoder layers' weights tied to (the transposes of) the weights of the corresponding encoder layers via the DenseTied layer below.
+
+    :param config: dictionary with hyperparameters for autoencoder training. Any hyperparameters not specified will be taken from the default values in `DEFAULT_OPT`
+    :type config: dict
+    :param mode: string to specify whether you want to train an autoencoder for each histogram ("individual") or a single autoencoder on all histograms ("simultaneous")
+    :type mode: str
+    """
+    def __init__(self, **kwargs):
+        super(MirrorAE, self).__init__(**kwargs)
+
+        self.config = utils.update_dict(
+            original = DEFAULT_OPT,
+            new = kwargs.get('config', {})
+        )
+
+        self.mode = kwargs.get('autoencoder_mode', 'individual')
+        if self.mode is None:
+            self.mode = "individual"
+
+        if not self.mode in ["individual", "simultaneous"]:
+            logger.exception("[AutoEncoder : __init__] mode '%s' is not a recognized option for AutoEncoder. Currently available modes are 'individual' (default) and 'simultaneous'." % (self.mode))
+            raise ValueError()
+        self.models = {}
+
+        logger.debug("[AutoEncoder : __init__] Constructing AutoEncoder with the following training options and hyperparameters:")
+        for param, value in self.config.items():
+            logger.debug("\t %s : %s" % (param, str(value)))
+
+
+    def load_model(self, model_file):
+        """
+
+        """
+        custom_layers = {'DenseTied': DenseTied}
+        with keras.utils.custom_object_scope(custom_layers):
+            model = keras.models.load_model(model_file)
+        return model
+
+
+    def save_model(self, model, model_file):
+        """
+
+        """
+        logger.debug("[AutoEncoder : save_model] Saving trained autoencoder to file '%s'."
% (model_file)) + model.save(model_file) + + + def train(self): + """ + + """ + if self.mode == "simultaneous": + self.models = { None : None } + logger.debug("[AutoEncoder : train] Mode selected as 'simultaneous', meaning a single autoencoder will be trained simultaneously on all histograms. Use 'individual' if you wish to train one autoencoder for each histogram.") + elif self.mode == "individual": + self.models = { k : None for k,v in self.histograms.items() } #copy.deepcopy(self.histograms) + logger.debug("[AutoEncoder : train] Mode selected as 'individual', meaning one autoencoder will be trained for each histogram. Use 'simultaneous' if you wish to train a single autoencoder for all histograms.") + + for histogram, histogram_info in self.models.items(): + if histogram is None: + model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag) + else: + model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag) + + if os.path.exists(model_file): + self.models[histogram] = self.load_model(model_file) + if self.config['overwrite']: + logger.warning("[AutoEncoder : train] Overwrite has been turned on and a trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file and continue to train it." % (self.tag, model_file)) + model = self.models[histogram] + else: + logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file)) + continue + + inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram) + inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram) + + if histogram is None: + hist_name = str(list(self.models.keys())) + else: + hist_name = histogram + logger.debug("[AutoEncoder : train] Training autoencoder with %d dimensions in latent space for histogram(s) '%s' with %d training examples." % (self.config["n_components"], hist_name, len(list(inputs.values())[0]))) + + if self.mode == "simultaneous": + histograms = self.histograms + elif self.mode == "individual": + histograms = { histogram : self.histograms[histogram] } + + if not os.path.exists(model_file): + model = AutoEncoder_DNN(histograms, **self.config).model() + if self.config["overwrite"]: + logger.warning("[AutoEncoder : train] Overwrite has been turned on but no existing model was found with tag '%s' at file '%s'. We will create and train a new model." 
% (self.tag, model_file)) + model.summary() + model.compile( + optimizer = keras.optimizers.Adam(learning_rate = self.config["learning_rate"]), + loss = self.config["loss"], + metrics = ['mse'] + ) + + callbacks = [] + if self.config["early_stopping"]: + callbacks.append(keras.callbacks.EarlyStopping(monitor = 'val_mse', patience = self.config["early_stopping_rounds"], restore_best_weights = self.config['retain_best'])) + if self.config["lr_plateau_decay"]: + callbacks.append(keras.callbacks.ReduceLROnPlateau(patience = self.config["lr_plateau_decay_rounds"], factor = self.config["lr_plateau_decay_factor"], min_delta = self.config["lr_plateau_threshold"])) + + history = model.fit( + inputs, + outputs, + validation_data = (inputs_val, outputs_val), + callbacks = callbacks, + epochs = self.config["n_epochs"], + batch_size = self.config["batch_size"] + ) + if self.stats_output_dir: + if not os.path.isdir(self.stats_output_dir): + os.mkdir(self.stats_output_dir) + history = pandas.DataFrame(history.history) + runlabel = 'run_' + datetime.now().strftime('%H%M%S%m%d') + logger.info("[AutoEncoder : train] Saving training statistics in '%s'." % (self.stats_output_dir)) + history.to_csv(self.stats_output_dir + runlabel + '_history.csv') + history.to_parquet(self.stats_output_dir + runlabel + '_history.parquet') + make_training_plots(history, histogram, self.stats_output_dir + runlabel + '_plots.png') + + + + self.save_model(model, model_file) + self.models[histogram] = model + + + def predict(self, batch_size = 1024): + for histogram, model in self.models.items(): + inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram) + + predictions = model.predict(inputs, batch_size = batch_size) + if self.mode == "simultaneous" and self.n_histograms >= 2: + predictions = { name : pred for name, pred in zip(model.output_names, predictions) } + else: + predictions = { model.output_names[0] : predictions } + + for name, pred in predictions.items(): + hist_name = self.histogram_name_map[name.replace("output_", "")] # shape [n_runs, histogram dimensions, 1] + original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions] + + reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions] + awkward.from_numpy(pred), + axis = -1 + ) + #reconstructed_hist = pred + sse = awkward.sum( # perform sum along inner-most axis, i.e. first histogram dimension + (original_hist - reconstructed_hist) ** 2, + axis = -1 + ) + + # For 2d histograms, we need to sum over one more axis to get a single SSE score for each run + if self.histograms[hist_name]["n_dim"] == 2: + sse = awkward.sum(sse, axis = -1) # second histogram dimension + + self.add_prediction(hist_name, sse, reconstructed_hist) + + def make_inputs(self, split = None, histogram_name = None): + """ + + """ + inputs = {} + outputs = {} + + for histogram, info in self.histograms.items(): + if histogram_name is not None: # self.mode == "individual", i.e. 
separate autoencoder for each histogram
+                if not histogram == histogram_name: # only grab the relevant histogram for this autoencoder
+                    continue
+            if 'CSC' in histogram_name:
+                label_field = 'CSC_label'
+            elif 'emtf' in histogram_name:
+                label_field = 'EMTF_label'
+            else:
+                label_field = None
+
+            if label_field and len(numpy.unique(self.df[label_field])) > 1: # don't include anomalous runs in training
+                if split == "train":
+                    cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+                elif split == "test":
+                    cut = [self.df.train_label[i] == 1 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+                elif split == "all":
+                    cut = self.df.run_number >= 0
+                else:
+                    cut = [l == kGOOD for l in self.df[label_field]]
+            else:
+                if split == "train":
+                    cut = self.df.train_label == 0
+                elif split == "test":
+                    cut = self.df.train_label == 1
+                else:
+                    cut = self.df.run_number >= 0 # dummy all True cut
+
+            df = self.df[cut]
+            data = tf.convert_to_tensor(df[histogram])
+            inputs["input_" + info["name"]] = data
+            outputs["output_" + info["name"]] = data
+
+        return inputs, outputs
+
+
+class AutoEncoder_DNN():
+    """
+    An AutoEncoder instance owns AutoEncoder_DNN(s), which is the actual implementation of the DNN.
+    """
+    def __init__(self, histograms, **kwargs):
+        self.n_histograms = len(histograms.keys())
+
+        self.__dict__.update(kwargs)
+        self.inputs = []
+        self.outputs = []
+        self.encoders = []
+        self.decoders = []
+        self.coupled = {}
+        self.conv_sizes = {}
+        for histogram, info in histograms.items():
+            input, encoder = self.build_encoder(histogram, info)
+            self.inputs.append(input)
+            self.encoders.append(encoder)
+            output = self.build_decoder(histogram, info, encoder)
+            self.outputs.append(output)
+
+
+    def model(self):
+        model = keras.models.Model(
+            inputs = self.inputs,
+            outputs = self.outputs,
+            name = "autoencoder"
+        )
+        #model.summary()
+        return model
+
+
+    def build_encoder(self, histogram, info):
+        input = keras.layers.Input(
+            shape = info["shape"] + (1,),
+            name = "input_%s" % info["name"]
+        )
+        self.conv_sizes[histogram] = [tf.shape(input)._inferred_value[1:]]
+        layer = input
+        for i in range(self.n_conv_layers):
+            name = "encoder_%d_%s" % (i, info["name"])
+            if info["n_dim"] == 1:
+                layer = keras.layers.Conv1D(
+                    filters = self.n_filters,
+                    kernel_size = self.kernel_1d,
+                    strides = self.strides_1d,
+                    activation = "relu",
+                    name = name
+                )(layer)
+            elif info["n_dim"] == 2:
+                layer = keras.layers.Conv2D(
+                    filters = self.n_filters,
+                    kernel_size = self.kernel_2d,
+                    use_bias = False,
+                    strides = self.strides_2d,
+                    activation = "relu",
+                    name = name
+                )(layer)
+            self.conv_sizes[histogram].append(tf.shape(layer)._inferred_value[1:])
+            if self.batch_norm:
+                layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
+            if self.dropout > 0:
+                layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)
+
+        if self.n_hidden_layers >= 1:
+            layer = keras.layers.Flatten()(layer)
+            self.coupled[histogram] = []
+
+            for i in range(self.n_hidden_layers-1):
+                dense = keras.layers.Dense(
+                    units = self.n_nodes,
+                    activation = "relu",
+                    use_bias = False,
+                    name = "Encoder_Hidden_%d" % i,
+                )
+                if self.tied_dense:
+                    self.coupled[histogram].append(dense)
+                layer = dense(layer)
+
+            dense = keras.layers.Dense(
+                units = self.n_components,
+                activation = "relu",
+                use_bias = False,
+                name = "Encoder_Output"
+            )
+            if self.tied_dense:
+                self.coupled[histogram].append(dense)
+            encoder = dense(layer)
+
+        else:
+            encoder = layer
+
+        return input, encoder
+
+
+    def build_decoder(self, histogram, info, input):
+        layer = input
+        if not self.tied_dense:
+            self.coupled[histogram] = [None]*self.n_hidden_layers
+        if self.n_hidden_layers >= 1:
+            self.coupled[histogram].reverse()
+            for i in range(len(self.coupled[histogram]) - 1):
+                layer = DenseTied(
+                    units = self.n_nodes,
+                    activation = "relu",
+                    use_bias = False,
+                    name = "Decoder_Hidden_%d" % i,
+                    tied_to = self.coupled[histogram][i]
+                )(layer)
+            # final dense decoder layer expands the latent vector back to the size of the last encoder feature map
+            units = 1
+            for m in self.conv_sizes[histogram][len(self.conv_sizes[histogram]) - 1]:
+                units *= m
+            layer = DenseTied(
+                units = units,
+                activation = "relu",
+                use_bias = False,
+                name = "Decoder_Hidden_%d" % (len(self.coupled[histogram]) - 1),
+                tied_to = self.coupled[histogram][len(self.coupled[histogram]) - 1]
+            )(layer)
+
+        self.conv_sizes[histogram].reverse()
+        if self.n_conv_layers >= 1:
+            name = "decoder_reshape"
+            target_shape = self.conv_sizes[histogram][0]
+        else:
+            target_shape = info["shape"] + (1,)
+            name = "output_%s" % (info["name"])
+        if self.n_hidden_layers >= 1:
+            layer = keras.layers.Reshape(target_shape = target_shape, name = name)(layer)
+        for i in range(self.n_conv_layers):
+            if i == (self.n_conv_layers - 1):
+                activation = "relu"
+                n_filters = 1
+                name = "output_%s" % (info["name"])
+                batch_norm = False
+                dropout = 0
+            else:
+                activation = "relu"
+                n_filters = self.n_filters
+                name = "decoder_%d_%s" % (i, info["name"])
+                batch_norm = self.batch_norm
+                dropout = self.dropout
+            cur_size = tf.shape(layer)._inferred_value[1:]
+            if info["n_dim"] == 1:
+                k = self.kernel_1d
+                s = self.strides_1d
+            elif info["n_dim"] == 2:
+                k = self.kernel_2d
+                s = self.strides_2d
+            # choose output_padding so each transposed conv exactly reproduces the size of the matching encoder feature map
+            l = self.conv_sizes[histogram][i+1]
+            out_pads = [l[j] - (cur_size[j] - 1)*s - k for j in range(info["n_dim"])]
+            if info["n_dim"] == 1:
+                layer = keras.layers.Conv1DTranspose(
+                    filters = n_filters,
+                    kernel_size = self.kernel_1d,
+                    strides = self.strides_1d,
+                    use_bias = False,
+                    output_padding = out_pads,
+                    activation = activation,
+                    name = name
+                )(layer)
+            elif info["n_dim"] == 2:
+                layer = keras.layers.Conv2DTranspose(
+                    filters = n_filters,
+                    kernel_size = self.kernel_2d,
+                    strides = self.strides_2d,
+                    output_padding = out_pads,
+                    activation = activation,
+                    use_bias = False,
+                    name = name
+                )(layer)
+            if batch_norm:
+                layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
+            if dropout > 0:
+                layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)
+
+        output = layer
+        return output
+
+class DenseTied(keras.layers.Layer):
+    """
+    Dense layer whose kernel can be tied to (the transpose of) another Dense layer's kernel.
+    """
+    def __init__(self, units,
+                 activation=None,
+                 use_bias=True,
+                 kernel_initializer='glorot_uniform',
+                 bias_initializer='zeros',
+                 kernel_regularizer=None,
+                 bias_regularizer=None,
+                 activity_regularizer=None,
+                 kernel_constraint=None,
+                 bias_constraint=None,
+                 tied_to=None,
+                 **kwargs):
+        self.tied_to = tied_to
+        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
+            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
+        super().__init__(**kwargs)
+        self.config_terms = {
+            'units':units,
+            'activation':activation,
+            'use_bias':use_bias,
+            'kernel_initializer':kernel_initializer,
+            'bias_initializer':bias_initializer,
+            'kernel_regularizer':kernel_regularizer,
+            'bias_regularizer':bias_regularizer,
+            'activity_regularizer':activity_regularizer,
+            'kernel_constraint':kernel_constraint,
+            'bias_constraint':bias_constraint,
+            'tied_to':tied_to
+        }
+        self.units = units
+        self.activation = keras.activations.get(activation)
+        self.use_bias = use_bias
+        self.kernel_initializer = keras.initializers.get(kernel_initializer)
+        self.bias_initializer = keras.initializers.get(bias_initializer)
+        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
+        self.bias_regularizer = keras.regularizers.get(bias_regularizer)
+        self.activity_regularizer = keras.regularizers.get(activity_regularizer)
+        self.kernel_constraint = keras.constraints.get(kernel_constraint)
+        self.bias_constraint = keras.constraints.get(bias_constraint)
+        self.input_spec = keras.layers.InputSpec(min_ndim=2)
+        self.supports_masking = True
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(self.config_terms)
+        return config
+
+    def build(self, input_shape):
+        assert len(input_shape) >= 2
+        input_dim = input_shape[-1]
+
+        if self.tied_to is not None:
+            # tied case: the kernel is a non-trainable copy of the transposed
+            # kernel of the partner layer, taken once at build time
+            self.kernel = tf.Variable(tf.transpose(self.tied_to.kernel), trainable = False)
+            self._non_trainable_weights.append(self.kernel)
+        else:
+            self.kernel = self.add_weight(shape=(input_dim, self.units),
+                                          initializer=self.kernel_initializer,
+                                          name='kernel',
+                                          regularizer=self.kernel_regularizer,
+                                          constraint=self.kernel_constraint)
+        if self.use_bias:
+            self.bias = self.add_weight(shape=(self.units,),
+                                        initializer=self.bias_initializer,
+                                        name='bias',
+                                        regularizer=self.bias_regularizer,
+                                        constraint=self.bias_constraint)
+        else:
+            self.bias = None
+        self.input_spec = keras.layers.InputSpec(min_ndim=2, axes={-1: input_dim})
+        self.built = True
+
+    def compute_output_shape(self, input_shape):
+        assert input_shape and len(input_shape) >= 2
+        output_shape = list(input_shape)
+        output_shape[-1] = self.units
+        return tuple(output_shape)
+
+    def call(self, inputs):
+        # y = x . kernel (+ bias), where kernel is the tied transpose in the tied case
+        output = tf.tensordot(inputs, self.kernel, axes = 1)
+        if self.use_bias:
+            output = tf.nn.bias_add(output, self.bias, data_format='channels_last')
+        if self.activation is not None:
+            output = self.activation(output)
+        return output
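
Note: a minimal standalone sketch of how DenseTied pairs with an ordinary Dense layer (toy sizes, not part of the patch). As written in build() above, the tied kernel is a non-trainable copy of the transposed partner kernel, taken once at build time.

    import tensorflow as tf
    import tensorflow.keras as keras

    # Toy tied-weight autoencoder: the decoder kernel is the transpose of the encoder kernel.
    inp = keras.layers.Input(shape = (64,))
    enc = keras.layers.Dense(units = 8, activation = "relu", use_bias = False, name = "enc")
    code = enc(inp) # calling the layer builds it, so enc.kernel exists before DenseTied builds
    out = DenseTied(units = 64, activation = "relu", use_bias = False, tied_to = enc, name = "dec")(code)
    toy = keras.models.Model(inp, out)
    toy.compile(optimizer = keras.optimizers.Adam(learning_rate = 0.001), loss = "mse")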