Merge pull request #30 from AutoDQM/autoencoder_training

Autoencoder Training Features
AutoDQM · Aug 22, 2022 · dea6a7d · dea6a7d
2 parents 046bc9a + 5d70e34
commit dea6a7d
Show file tree

Hide file tree

Showing 7 changed files with 1,079 additions and 136 deletions.
diff --git a/autodqm_ml/algorithms/autoencoder.py b/autodqm_ml/algorithms/autoencoder.py
diff --git a/autodqm_ml/algorithms/mirrorAE.py b/autodqm_ml/algorithms/mirrorAE.py
diff --git a/autodqm_ml/algorithms/pca.py b/autodqm_ml/algorithms/pca.py
@@ -15,6 +15,7 @@
 from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
 from autodqm_ml.data_formats.histogram import Histogram
 from autodqm_ml.plotting.plot_tools import plot1D, plotMSESummary
+from autodqm_ml.constants import kGOOD, kANOMALOUS
 
 DEFAULT_OPT = {
         "n_components" : 2
@@ -27,8 +28,11 @@ class PCA(MLAlgorithm):
     def __init__(self, **kwargs):
+        """
+        Construct the PCA algorithm and fill in defaults for unset options.
+
+        :param kwargs: options forwarded unchanged to the parent constructor
+        """
         super(PCA, self).__init__(**kwargs)
 
+
+        # Fall back to the module default when no "n_components" attribute
+        # exists after the parent constructor ran (presumably the parent
+        # turns kwargs into attributes -- TODO confirm in MLAlgorithm).
         if not hasattr(self, "n_components"):
             self.n_components = DEFAULT_OPT["n_components"]
+
+        # Shape of the most recently loaded histogram array; stays None until
+        # get_histogram() records it, and predict() uses it to reshape
+        # reconstructions.
+        self.hist_shape = None
 
     def load_model(self, model_file):
         """
@@ -95,17 +99,35 @@ def get_histogram(self, histogram, split = "all"):
         :return: a 1d histogram (flattened if originally a 2d histogram)
         :rtype: awkward.Array
         """
-
-        if split == "train":
-            runs = self.df[self.df.train_label == 0]
-        elif split == "test":
-            runs = self.df[self.df.train_label == 1]
-        elif split == "all":
-            runs = self.df
-
-        h = runs[histogram]
+        if 'CSC' in histogram:
+            label_field = 'CSC_label'
+        elif 'emtf' in histogram:
+            label_field = 'EMTF_label'
+        else:
+            label_field = None
+
+        if label_field and len(numpy.unique(self.df[label_field])) > 1: #Don't Include Anomalous Runs in Training
+           if split == "train":
+               cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+           elif split == "test":
+               cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
+           elif split == "all":
+               cut = self.df.run_number >= 0
+           else:
+               cut = self.df[label_field] == kGOOD
+        else:
+           if split == "train":
+               cut = self.df.train_label == 0
+           elif split == "test":
+               cut = self.df.train_label == 1
+           else:
+               cut = self.df.run_number >= 0 # dummy all True cut
+
+        df = self.df[cut]
+        h = df[histogram]
 
         n_dim = len(awkward.to_numpy(h[0]).shape)
+        self.hist_shape = awkward.to_numpy(h).shape
 
         if n_dim == 2:
             h = awkward.flatten(h, axis = 2)
@@ -165,6 +187,6 @@ def predict(self):
                     (original_hist - reconstructed_hist) ** 2,
                     axis = -1
             )
-
-            self.add_prediction(histogram, sse, reconstructed_hist)
+           
+            self.add_prediction(histogram, sse, numpy.array(reconstructed_hist).reshape(self.hist_shape))
 
diff --git a/autodqm_ml/evaluation/roc_tools.py b/autodqm_ml/evaluation/roc_tools.py
@@ -49,7 +49,7 @@ def bootstrap_indices(x):
     return numpy.random.randint(0, len(x), len(x))
 
 
-def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 100, interp = 10000):
+def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 1000, interp = 10000):
     """
     Calculates tpr and fpr arrays (with uncertainty for tpr) and auc and uncertainty
     Keyword arguments:

diff --git a/autodqm_ml/plotting/plot_tools.py b/autodqm_ml/plotting/plot_tools.py
@@ -2,8 +2,15 @@
 import numpy as np 
 from pathlib import Path
 import awkward
+import os
 
-from yahist import Hist1D
+import pandas as pd
+
+from yahist import Hist1D, Hist2D
+
+from matplotlib import colors
+
+from datetime import datetime
 
 import logging
 logger = logging.getLogger(__name__)
@@ -43,22 +50,28 @@ def make_sse_plot(name, recos, save_name, **kwargs):
     plt.clf()
 
 
-def make_original_vs_reconstructed_plot(name, original, recos, run, save_name, **kwargs): 
+def make_original_vs_reconstructed_plot(name, original, recos, run, save_name, hist_layout, **kwargs): 
     n_dim = len(np.array(original).shape)
 
     if n_dim == 1:
         make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name, **kwargs)
 
     elif n_dim == 2:
-        original_flat = awkward.flatten(original, axis = -1)
-        recos_flat = {}
-        for algorithm, reco in recos.items():
-            recos_flat[algorithm] = {
-                    "reco" : awkward.flatten(reco["reco"], axis = -1),
-                    "score" : reco["score"]
-            }
-
-        make_original_vs_reconstructed_plot1d(name, original_flat, recos_flat, run, save_name, **kwargs)
+        if hist_layout == 'flatten':
+            original_flat = awkward.flatten(original, axis = -1)
+            recos_flat = {}
+            for algorithm, reco in recos.items():
+                recos_flat[algorithm] = {
+                          "reco" : awkward.flatten(reco["reco"], axis = -1),
+                          "score" : reco["score"]
+                }
+            make_original_vs_reconstructed_plot1d(name, original_flat, recos_flat, run, save_name, **kwargs)     
+        elif hist_layout == '2d':
+            make_original_vs_reconstructed_plot2d(name, original, recos, run, save_name, **kwargs)
+        else:
+            message = "[plot_tools.py : make_original_vs_reconstructed_plot] Please specify a valid histogram layout option: flatten (default), 2d"
+            logger.exception(message)
+            raise RuntimeError()
 
     else:
         message = "[plot_tools.py : make_original_vs_reconstructed_plot] Plotting not implemented for histograms with dimension %d." % (n_dim)
@@ -76,6 +89,7 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
     h_orig = Hist1D(original, bins = bins, label = "original")
     h_orig._counts = original
     h_reco = []
+
     for reco, info in recos.items():
         h = Hist1D(info["reco"], bins = bins, label = "%s [sse : %.2E]" % (reco, info["score"]))
         h._counts = info["reco"]
@@ -109,7 +123,68 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
     plt.savefig(save_name.replace(".pdf", ".png"))
     plt.clf()
 
+def make_original_vs_reconstructed_plot2d(name, original, recos, run, save_name, **kwargs):
+    """
+    Draw a 2d histogram beside each of its reconstructions and save the figure.
+
+    The top row shows the original followed by one panel per reconstruction;
+    the bottom row shows |reconstruction - original| under each reconstruction.
+    All top-row panels share one color scale and all bottom-row panels share another.
+
+    :param name: histogram name, used to build the x-axis label
+    :type name: str
+    :param original: contents of the original 2d histogram
+    :param recos: for each algorithm name, a dict with the keys "reco" (reconstructed 2d histogram) and "score" (shown in the panel title as the sse)
+    :type recos: dict
+    :param run: not used by this function
+    :param save_name: output file name; a second copy is written with ".pdf" replaced by ".png"
+    :type save_name: str
+    :param kwargs: "log_y" (default False) switches both color scales to logarithmic
+    """
+    x_label = name + " (a.u.)"
+    y_label = "Fraction of events"
+    extent = (0, 1, 0, 1)
+    color_map = plt.cm.Purples
+    log_y = kwargs.get("log_y", False)
+
+    h_reco = []
+    labels = []
+    ratios = []
+    base_vmax = awkward.max(original)
+    base_vmin = awkward.min(original)
+    # Starting values for the running extrema of the difference panels.
+    ratio_vmax = -10
+    ratio_vmin = 10
+    for reco, info in recos.items():
+        h_reco.append(info["reco"])
+        ratio = np.abs(info["reco"] - original)
+        ratios.append(ratio)
+        base_vmax = np.max((base_vmax, awkward.max(info["reco"])))
+        base_vmin = np.min((base_vmin, awkward.min(info["reco"])))
+        ratio_vmax = np.max((ratio_vmax, awkward.max(ratio)))
+        ratio_vmin = np.min((ratio_vmin, awkward.min(ratio)))
+        labels.append("%s [sse : %.2E]" % (reco, info["score"]))
+
+    fig, axes = plt.subplots(2, len(h_reco) + 1, figsize=(5 + len(h_reco)*5, 6), gridspec_kw=dict(height_ratios=[3, 1]), sharey = True, sharex=True)
+
+    if log_y:
+        base_norm = colors.LogNorm(base_vmin, base_vmax)
+        ratio_norm = colors.LogNorm(ratio_vmin, ratio_vmax)
+    else:
+        base_norm = colors.Normalize(base_vmin, base_vmax)
+        ratio_norm = colors.Normalize(ratio_vmin, ratio_vmax)
+
+    axes[0][0].imshow(original, norm=base_norm, cmap=color_map, extent = extent, aspect = 'auto')
+    axes[0][0].set_title("Original")
+    axes[0][0].grid()
+
+    last = len(h_reco) - 1
+    for idx, h in enumerate(h_reco):
+        top = axes[0][idx+1]
+        bottom = axes[1][idx+1]
+        top_img = top.imshow(h, norm=base_norm, cmap=color_map, extent = extent, aspect = 'auto')
+        bottom_img = bottom.imshow(ratios[idx], norm=ratio_norm, cmap=color_map, extent = extent, aspect = 'auto')
+        if idx == last:
+            # The color scales are shared per row, so one color bar per row,
+            # attached to the right-most column, is enough.
+            plt.colorbar(top_img, cax = top.inset_axes([1.1, 0, 0.1, 1]), ax = top)
+            plt.colorbar(bottom_img, cax = bottom.inset_axes([1.1, 0, 0.1, 1]), ax = bottom)
+        top.set_title(labels[idx])
+        top.grid()
+        bottom.grid()
+
+    # There is no difference panel for the original itself.
+    axes[1][0].remove()
+    axes[0][0].set_ylabel(y_label)
+    axes[1][1].set_ylabel("ML Reco - Original")
+    axes[0][0].set_xlabel(x_label)
+
 
+    logger.debug("[plot_tools.py : make_original_vs_reconstructed_plot2d] Writing plot to file '%s'. " % (save_name))
+    plt.savefig(save_name, bbox_inches='tight')
+    plt.savefig(save_name.replace(".pdf", ".png"), bbox_inches='tight')
+    # Close instead of clear: every call creates a new figure, and pyplot
+    # keeps cleared figures alive until they are explicitly closed.
+    plt.close(fig)
+
 def plot1D(original_hist, reconstructed_hist, run, hist_path, algo, threshold):    
     """
     plots given original and recontructed histogram. Will plot the MSE plot if the SSE is over the threshold. 
@@ -255,3 +330,182 @@ def plot_roc_curve(h_name, results, save_name, **kwargs):
     plt.savefig(save_name.replace(".pdf", ".png"))
     plt.clf()
 
+def plot_rescaled_score_hist(data, hist, savename):
+    """
+    Histogram the min-max scaled anomaly scores of each algorithm, one panel per algorithm.
+
+    Scores are shifted and scaled per algorithm so that they span [0, 1] and
+    are drawn on log-log axes. If ``data`` has a 'bad' entry, those scores are
+    scaled with the same offset and range and overlaid on a twin y-axis.
+
+    :param data: holds 'score' (one score array per algorithm), 'algo' (one name per algorithm) and optionally 'bad' (one array of scores per algorithm, drawn as 'Bad Runs')
+    :type data: dict
+    :param hist: figure title
+    :type hist: str
+    :param savename: output file name; a second copy is written with '.png' replaced by '.pdf'
+    :type savename: str
+    """
+    n_algos = len(data['score'])
+    fig, axes = plt.subplots(n_algos, 1, figsize = (12, 4*n_algos))
+    if n_algos == 1:
+        axes = [axes]
+
+    # Same logarithmic binning for every panel and for both populations.
+    bins = np.logspace(np.log10(1e-4),np.log10(1.0), 100)
+    first_bad_axis = None
+    for i in range(n_algos):
+        score = data['score'][i]
+        score_min = awkward.min(score)
+        score_range = awkward.max(score) - score_min
+        score = (score - score_min)/score_range
+        axes[i].hist(score, bins = bins, color = 'tab:blue', alpha = 0.8, label = 'All Runs')
+        axes[i].set_ylabel(data['algo'][i])
+        axes[i].set_yscale('log')
+        axes[i].set_xscale('log')
+        if 'bad' in data:
+            badax = axes[i].twinx()
+            if first_bad_axis is None:
+                first_bad_axis = badax
+            # Use the offset and range of the full score set so both
+            # populations are on the same scale.
+            bad = (data['bad'][i] - score_min)/score_range
+            badax.hist(bad, bins = bins, range = (0, 1), color = 'tab:orange', alpha = 0.8, label ='Bad Runs')
+            axes[i].spines['left'].set_color('tab:blue')
+            badax.xaxis.label.set_color('tab:orange')
+            axes[i].xaxis.label.set_color('tab:blue')
+            if i == 0:
+               badax.set_ylabel('Anomalous Runs')
+            badax.spines['right'].set_color('tab:orange')
+            badax.set_xscale('log')
+    fig.suptitle(hist)
+    # The bad-run histogram lives on a twin axis, so its legend entry has to
+    # be collected explicitly or it never shows up.
+    handles, labels = axes[0].get_legend_handles_labels()
+    if first_bad_axis is not None:
+        bad_handles, bad_labels = first_bad_axis.get_legend_handles_labels()
+        handles = handles + bad_handles
+        labels = labels + bad_labels
+    axes[0].legend(handles, labels)
+    axes[0].set_title('Min-Max Scaled Anomaly Scores')
+    fig.savefig(savename, bbox_inches = 'tight')
+    fig.savefig(savename.replace('.png', '.pdf'), bbox_inches = 'tight')
+    # Release the figure; pyplot keeps it alive until it is closed.
+    plt.close(fig)
+
+
+def make_training_plots(history, hist, save_file):
+    """
+    Plot every column of a training history against the epoch index.
+
+    One panel is drawn per column, each with a logarithmic y-axis.
+
+    :param history: per-epoch training statistics with a 'loss' column (accessed through ``.columns`` and ``.items()``, presumably a pandas DataFrame -- TODO confirm against caller)
+    :param hist: figure title
+    :type hist: str
+    :param save_file: output file name
+    :type save_file: str
+    """
+    n_stats = len(history.columns)
+    epochs = range(len(history['loss']))
+    logger.debug("[plot_tools.py : make_training_plots] Plotting %d training statistics for '%s'." % (n_stats, hist))
+    # squeeze = False keeps ``axes`` a 2d array even for a single column, so
+    # the loop below also works when only the loss was recorded.
+    fig, axes = plt.subplots(1, n_stats, figsize = (n_stats*9, 9), squeeze = False)
+    fig.suptitle(hist, fontsize = 22)
+    for ax, (stat, y) in zip(axes[0], history.items()):
+        ax.plot(epochs, y)
+        ax.set_xlabel('Epoch', fontsize = 15)
+        ax.set_title(stat, fontsize = 18)
+        ax.set_yscale('log')
+    fig.savefig(save_file, bbox_inches = 'tight')
+    # Release the figure; pyplot keeps it alive until it is closed.
+    plt.close(fig)
+def multi_exp_plots(paths, xlabel, x, title, legend = None, logx = False, logy = False):
+    """
+    Draw the four summary panels of one or several experiments and save the figure.
+
+    The per-experiment drawing is delegated to make_one_var_exp_plots.
+
+    :param paths: one experiment directory (str, '<paths>plots.png' is written) or a list of them (the figure is written next to the first directory, named after the title)
+    :param xlabel: x-axis label of every panel
+    :type xlabel: str
+    :param x: x positions, forwarded to make_one_var_exp_plots
+    :param title: figure title; also the base of the output file name when ``paths`` is a list
+    :type title: str
+    :param legend: one label per entry of ``paths`` (list), or a single label for a single path; labels left as None are derived from the directory name
+    :param logx: forwarded to make_one_var_exp_plots
+    :type logx: bool
+    :param logy: accepted but currently not used
+    :type logy: bool
+    """
+    fig, axes = plt.subplots(1, 4, figsize = (36, 9))
+    fig.suptitle(title, fontsize = 22)
+    if isinstance(paths, list):
+        if not legend:
+            # One (empty) label per experiment, not per x value.
+            legend = [None]*len(paths)
+        for i, path in enumerate(paths):
+            make_one_var_exp_plots(path, xlabel, x, axes, legend[i], logx)
+        # Strip punctuation so the title can serve as a file name.
+        filename = title.replace(' ', '_').translate(str.maketrans('', '', '()[]{}/.,:;?!@#$^&*'))
+        # The figure goes into the directory that contains the first experiment.
+        parent = os.path.dirname(paths[0].rstrip('/'))
+        savepath = os.path.join(parent, filename + '_plots.png')
+        logger.debug("[plot_tools.py : multi_exp_plots] Writing plot to file '%s'. " % (savepath))
+        fig.savefig(savepath, bbox_inches = 'tight')
+    else:
+        # ``legend`` is passed through untouched: None lets the helper derive
+        # the label from the directory name.
+        make_one_var_exp_plots(paths, xlabel, x, axes, legend, logx)
+        fig.savefig(paths + 'plots.png', bbox_inches = 'tight')
+
+
+def make_one_var_exp_plots(path, xlabel, x, axes, label = None, logx = False):
+    """
+    Add one experiment to the four summary panels as error-bar curves.
+
+    Every sub-directory of ``path`` (entries containing '.png' are skipped) is
+    one level of the scanned variable. The '.csv' files inside a level are
+    read with pandas; their 'loss', 'val_loss' and 'lr' columns are reduced to
+    epochs trained, best train loss, best validation loss and ending learning
+    rate, and the mean and standard deviation over the files are plotted.
+
+    :param path: experiment directory
+    :type path: str
+    :param xlabel: x-axis label of every panel
+    :type xlabel: str
+    :param x: x positions, one per level
+    :param axes: at least four matplotlib axes, filled in the order listed above
+    :param label: curve label; defaults to the last component of ``path``
+    :type label: str
+    :param logx: use a logarithmic x-axis
+    :type logx: bool
+    """
+    stats = ['Epochs Trained', 'Best Train Loss', 'Best Validation Loss', 'Ending Learning Rate']
+    means = {stat: [] for stat in stats}
+    stds = {stat: [] for stat in stats}
+    # NOTE(review): os.listdir returns entries in arbitrary order, while the
+    # results are plotted against ``x`` by position -- confirm that callers
+    # do not rely on a particular level order.
+    levels = [entry for entry in os.listdir(path) if '.png' not in entry]
+    if not label:
+        # Last path component, with or without a trailing slash.
+        label = os.path.basename(path.rstrip('/'))
+    for level in levels:
+        level_dir = os.path.join(path, level)
+        csv_files = [entry for entry in os.listdir(level_dir) if '.csv' in entry]
+        subset = {stat: [] for stat in stats}
+        for csv_file in csv_files:
+            df = pd.read_csv(os.path.join(level_dir, csv_file))
+            subset['Epochs Trained'].append(len(df['loss']))
+            subset['Best Train Loss'].append(np.min(df['loss']))
+            subset['Best Validation Loss'].append(np.min(df['val_loss']))
+            subset['Ending Learning Rate'].append(df['lr'].iloc[-1])
+        for stat, values in subset.items():
+            means[stat].append(np.mean(values))
+            stds[stat].append(np.std(values))
+    for i, stat in enumerate(stats):
+        axes[i].errorbar(x, means[stat], stds[stat], label = label)
+        axes[i].set_title(stat, fontsize = 18)
+        if logx:
+            axes[i].set_xscale('log')
+        if i == 3:
+            # Only the learning-rate panel gets a logarithmic y-axis.
+            axes[i].set_yscale('log')
+        axes[i].set_xlabel(xlabel, fontsize = 15)
+
+
+
+def multi_exp_bar_plots(paths, xlabel, title, legend = None):
+    """
+    Draw grouped bar charts for one or several experiments and save the figure.
+
+    The per-experiment drawing is delegated to make_one_var_exp_bar_plots.
+
+    :param paths: one experiment directory (str, '<paths>plots.png' is written) or a list of them (the figure is written next to the first directory, named after the title)
+    :param xlabel: tick labels, one per level of the scanned variable
+    :param title: figure title; also the base of the output file name when ``paths`` is a list
+    :type title: str
+    :param legend: one label per entry of ``paths`` (list), or a single label for a single path; labels left as None are derived from the directory name
+    """
+    b = 9                          # horizontal space reserved per tick group
+    s = b/(len(xlabel) + .5)       # offset unit for the bars inside a group
+    fig, axes = plt.subplots(1, 5, figsize = (45, 9))
+    fig.suptitle(title, fontsize = 22)
+    if isinstance(paths, list):
+        if not legend:
+            legend = [None]*len(paths)
+        for i, path in enumerate(paths):
+            make_one_var_exp_bar_plots(path, xlabel, axes, i, b, s, len(paths), legend[i])
+        # Strip punctuation so the title can serve as a file name.
+        filename = title.replace(' ', '_').translate(str.maketrans('', '', '()[]{}/.,:;?!@#$^&*'))
+        # The figure goes into the directory that contains the first experiment.
+        parent = os.path.dirname(paths[0].rstrip('/'))
+        savepath = os.path.join(parent, filename + '_plots.png')
+        logger.debug("[plot_tools.py : multi_exp_bar_plots] Writing plot to file '%s'. " % (savepath))
+        fig.savefig(savepath, bbox_inches = 'tight')
+    else:
+        # Accept a plain string as well as a one-element list; indexing a
+        # string would keep only its first character.
+        if isinstance(legend, (list, tuple)):
+            label = legend[0] if legend else None
+        else:
+            label = legend
+        make_one_var_exp_bar_plots(paths, xlabel, axes, 0, b, s, 1, label)
+        fig.savefig(paths + 'plots.png', bbox_inches = 'tight')
+
+
+def make_one_var_exp_bar_plots(path, xlabel, axes, i, b, s, n, label = None):
+    """
+    Add one experiment to the summary panels as a series of bars with error bars.
+
+    Every sub-directory of ``path`` (entries containing '.png' or 'assess' are
+    skipped) is one level of the scanned variable. The '.csv' files inside a
+    level are read with pandas; their 'loss', 'val_loss' and 'lr' columns (and
+    'mse', when present) are reduced to epochs trained, best train loss, best
+    validation loss, ending learning rate and best mse, and the mean and
+    standard deviation over the files are drawn.
+
+    :param path: experiment directory
+    :type path: str
+    :param xlabel: tick labels, one per level
+    :param axes: matplotlib axes, one per plotted quantity (four, or five when an 'mse' column exists)
+    :param i: index of this experiment among those sharing the axes; shifts its bars inside each tick group
+    :type i: int
+    :param b: horizontal space reserved per tick group
+    :param s: offset unit for the bars inside a group
+    :param n: number of experiments sharing the axes; sets the bar width
+    :type n: int
+    :param label: series label; defaults to the last component of ``path``
+    :type label: str
+    """
+    data = {'Epochs Trained':[], 'Epochs Trained Std':[],
+            'Best Train Loss':[], 'Best Train Loss Std':[],
+            'Best Validation Loss':[], 'Best Validation Loss Std':[],
+            'Ending Learning Rate':[], 'Ending Learning Rate Std':[]}
+    # NOTE(review): os.listdir returns entries in arbitrary order, while the
+    # bars are matched to ``xlabel`` by position -- confirm that callers do
+    # not rely on a particular level order.
+    levels = [entry for entry in os.listdir(path) if '.png' not in entry and 'assess' not in entry]
+    if not label:
+        # Last path component, with or without a trailing slash.
+        label = os.path.basename(path.rstrip('/'))
+    for level in levels:
+        level_dir = os.path.join(path, level)
+        csv_files = [entry for entry in os.listdir(level_dir) if '.csv' in entry]
+        subset = {'Epochs Trained':[], 'Best Train Loss':[], 'Best Validation Loss':[],'Ending Learning Rate':[]}
+        for csv_file in csv_files:
+            df = pd.read_csv(os.path.join(level_dir, csv_file))
+            subset['Epochs Trained'].append(len(df['loss']))
+            subset['Best Train Loss'].append(np.min(df['loss']))
+            subset['Best Validation Loss'].append(np.min(df['val_loss']))
+            subset['Ending Learning Rate'].append(df['lr'].iloc[-1])
+            if 'mse' in df.columns:
+                # The mse panel only exists when some training log has it.
+                data.setdefault('Best MSE', [])
+                data.setdefault('Best MSE Std', [])
+                subset.setdefault('Best MSE', []).append(np.min(df['mse']))
+        for item, values in subset.items():
+            data[item].append(np.mean(values))
+            data[item + ' Std'].append(np.std(values))
+            if 'MSE' in item:
+                logger.debug("[plot_tools.py : make_one_var_exp_bar_plots] Level '%s': mean mse %s from values %s." % (level, np.mean(values), values))
+    # Bar positions depend only on the experiment index, not on the panel.
+    x = [s + 2*s*i + j*b for j in range(len(xlabel))]
+    m = 0
+    for item in data:
+        if 'Std' in item:
+            continue
+        logger.debug("[plot_tools.py : make_one_var_exp_bar_plots] %s: %s" % (item, data[item]))
+        axes[m].bar(x, data[item], yerr = data[item + ' Std'], width = (b-.5)/n, label = label)
+        axes[m].set_title(item, fontsize = 18)
+        if 'Loss' in item:
+            axes[m].set_yscale('log')
+        axes[m].set_xticks(x, xlabel, fontsize = 15)
+        m += 1