Skip to content

Commit

Permalink
Merge pull request #30 from AutoDQM/autoencoder_training
Browse files Browse the repository at this point in the history
Autoencoder Training Features
  • Loading branch information
sammy-may authored Aug 22, 2022
2 parents 046bc9a + 5d70e34 commit dea6a7d
Show file tree
Hide file tree
Showing 7 changed files with 1,079 additions and 136 deletions.
240 changes: 162 additions & 78 deletions autodqm_ml/algorithms/autoencoder.py

Large diffs are not rendered by default.

520 changes: 520 additions & 0 deletions autodqm_ml/algorithms/mirrorAE.py

Large diffs are not rendered by default.

44 changes: 33 additions & 11 deletions autodqm_ml/algorithms/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
from autodqm_ml.data_formats.histogram import Histogram
from autodqm_ml.plotting.plot_tools import plot1D, plotMSESummary
from autodqm_ml.constants import kGOOD, kANOMALOUS

DEFAULT_OPT = {
"n_components" : 2
Expand All @@ -27,8 +28,11 @@ class PCA(MLAlgorithm):
def __init__(self, **kwargs):
super(PCA, self).__init__(**kwargs)


if not hasattr(self, "n_components"):
self.n_components = DEFAULT_OPT["n_components"]

self.hist_shape = None

def load_model(self, model_file):
"""
Expand Down Expand Up @@ -95,17 +99,35 @@ def get_histogram(self, histogram, split = "all"):
:return: a 1d histogram (flattened if originally a 2d histogram)
:rtype: awkward.Array
"""

if split == "train":
runs = self.df[self.df.train_label == 0]
elif split == "test":
runs = self.df[self.df.train_label == 1]
elif split == "all":
runs = self.df

h = runs[histogram]
if 'CSC' in histogram:
label_field = 'CSC_label'
elif 'emtf' in histogram:
label_field = 'EMTF_label'
else:
label_field = None

if label_field and len(numpy.unique(self.df[label_field])) > 1: #Don't Include Anomalous Runs in Training
if split == "train":
cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
elif split == "test":
cut = [self.df.train_label[i] == 0 and self.df[label_field][i] == kGOOD for i in range(len(self.df))]
elif split == "all":
cut = self.df.run_number >= 0
else:
cut = self.df[label_field] == kGOOD
else:
if split == "train":
cut = self.df.train_label == 0
elif split == "test":
cut = self.df.train_label == 1
else:
cut = self.df.run_number >= 0 # dummy all True cut

df = self.df[cut]
h = df[histogram]

n_dim = len(awkward.to_numpy(h[0]).shape)
self.hist_shape = awkward.to_numpy(h).shape

if n_dim == 2:
h = awkward.flatten(h, axis = 2)
Expand Down Expand Up @@ -165,6 +187,6 @@ def predict(self):
(original_hist - reconstructed_hist) ** 2,
axis = -1
)

self.add_prediction(histogram, sse, reconstructed_hist)
self.add_prediction(histogram, sse, numpy.array(reconstructed_hist).reshape(self.hist_shape))

2 changes: 1 addition & 1 deletion autodqm_ml/evaluation/roc_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def bootstrap_indices(x):
return numpy.random.randint(0, len(x), len(x))


def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 100, interp = 10000):
def calc_roc_and_unc(y, pred, sample_weight = None, n_bootstrap = 1000, interp = 10000):
"""
Calculates tpr and fpr arrays (with uncertainty for tpr) and auc and uncertainty
Keyword arguments:
Expand Down
276 changes: 265 additions & 11 deletions autodqm_ml/plotting/plot_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@
import numpy as np
from pathlib import Path
import awkward
import os

from yahist import Hist1D
import pandas as pd

from yahist import Hist1D, Hist2D

from matplotlib import colors

from datetime import datetime

import logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -43,22 +50,28 @@ def make_sse_plot(name, recos, save_name, **kwargs):
plt.clf()


def make_original_vs_reconstructed_plot(name, original, recos, run, save_name, **kwargs):
def make_original_vs_reconstructed_plot(name, original, recos, run, save_name, hist_layout, **kwargs):
n_dim = len(np.array(original).shape)

if n_dim == 1:
make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name, **kwargs)

elif n_dim == 2:
original_flat = awkward.flatten(original, axis = -1)
recos_flat = {}
for algorithm, reco in recos.items():
recos_flat[algorithm] = {
"reco" : awkward.flatten(reco["reco"], axis = -1),
"score" : reco["score"]
}

make_original_vs_reconstructed_plot1d(name, original_flat, recos_flat, run, save_name, **kwargs)
if hist_layout == 'flatten':
original_flat = awkward.flatten(original, axis = -1)
recos_flat = {}
for algorithm, reco in recos.items():
recos_flat[algorithm] = {
"reco" : awkward.flatten(reco["reco"], axis = -1),
"score" : reco["score"]
}
make_original_vs_reconstructed_plot1d(name, original_flat, recos_flat, run, save_name, **kwargs)
elif hist_layout == '2d':
make_original_vs_reconstructed_plot2d(name, original, recos, run, save_name, **kwargs)
else:
message = "[plot_tools.py : make_original_vs_reconstructed_plot] Please specify a valid histogram layout option: flatten (default), 2d"
logger.exception(message)
raise RuntimeError()

else:
message = "[plot_tools.py : make_original_vs_reconstructed_plot] Plotting not implemented for histograms with dimension %d." % (n_dim)
Expand All @@ -76,6 +89,7 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
h_orig = Hist1D(original, bins = bins, label = "original")
h_orig._counts = original
h_reco = []

for reco, info in recos.items():
h = Hist1D(info["reco"], bins = bins, label = "%s [sse : %.2E]" % (reco, info["score"]))
h._counts = info["reco"]
Expand Down Expand Up @@ -109,7 +123,68 @@ def make_original_vs_reconstructed_plot1d(name, original, recos, run, save_name,
plt.savefig(save_name.replace(".pdf", ".png"))
plt.clf()

def make_original_vs_reconstructed_plot2d(name, original, recos, run, save_name, **kwargs):
    """
    Draw a 2d histogram next to each algorithm's reconstruction of it.

    The top row shows the original followed by one panel per entry of ``recos``.
    The bottom row shows the absolute difference |reco - original| underneath
    each reconstruction (the slot under the original is removed). The figure is
    written to ``save_name`` and a second time with ".pdf" replaced by ".png".

    :param name: histogram name, used to build the x-axis label
    :type name: str
    :param original: contents of the original histogram (presumably a 2d array -- confirm against caller)
    :param recos: dictionary mapping an algorithm name to a dictionary with keys "reco" (reconstructed histogram) and "score" (number shown in the panel title)
    :type recos: dict
    :param run: accepted for signature parity with make_original_vs_reconstructed_plot1d, not used here
    :param save_name: path of the output file
    :type save_name: str
    :param kwargs: "log_y" (default False) switches both colour scales to logarithmic
    """
    x_label = name + " (a.u.)"
    y_label = "Fraction of events"
    extent = (0, 1, 0, 1)
    color_map = plt.cm.Purples
    log_y = kwargs.get("log_y", False)

    h_reco = []
    labels = []
    ratios = []
    base_vmax = awkward.max(original)
    base_vmin = awkward.min(original)
    # Start from +/- infinity so the colour range of the difference panels is
    # driven by the data only (fixed sentinels would clamp it).
    ratio_vmax = -np.inf
    ratio_vmin = np.inf
    for reco, info in recos.items():
        h_reco.append(info["reco"])
        ratio = np.abs(info["reco"] - original)
        ratios.append(ratio)
        base_vmax = np.max((base_vmax, awkward.max(info["reco"])))
        base_vmin = np.min((base_vmin, awkward.min(info["reco"])))
        ratio_vmax = np.max((ratio_vmax, awkward.max(ratio)))
        ratio_vmin = np.min((ratio_vmin, awkward.min(ratio)))
        labels.append("%s [sse : %.2E]" % (reco, info["score"]))

    fig, axes = plt.subplots(
        2,
        len(h_reco) + 1,
        figsize=(5 + len(h_reco)*5, 6),
        gridspec_kw=dict(height_ratios=[3, 1]),
        sharey=True,
        sharex=True
    )

    # One shared normalisation per row so that panels are directly comparable.
    if log_y:
        base_norm = colors.LogNorm(base_vmin, base_vmax)
        ratio_norm = colors.LogNorm(ratio_vmin, ratio_vmax)
    else:
        base_norm = colors.Normalize(base_vmin, base_vmax)
        ratio_norm = colors.Normalize(ratio_vmin, ratio_vmax)

    axes[0][0].imshow(original, norm=base_norm, cmap=color_map, extent=extent, aspect='auto')
    axes[0][0].set_title("Original")
    axes[0][0].grid()

    last = len(h_reco) - 1
    for idx, h in enumerate(h_reco):
        top = axes[0][idx+1]
        bottom = axes[1][idx+1]
        top_img = top.imshow(h, norm=base_norm, cmap=color_map, extent=extent, aspect='auto')
        bottom_img = bottom.imshow(ratios[idx], norm=ratio_norm, cmap=color_map, extent=extent, aspect='auto')
        if idx == last:
            # Rows share a normalisation, so a single colour bar next to the
            # right-most panel of each row is enough.
            cax = top.inset_axes([1.1, 0, 0.1, 1])
            plt.colorbar(top_img, cax=cax, ax=top)
            cax = bottom.inset_axes([1.1, 0, 0.1, 1])
            plt.colorbar(bottom_img, cax=cax, ax=bottom)
        top.set_title(labels[idx])
        top.grid()
        bottom.grid()

    # There is no difference panel for the original itself.
    axes[1][0].remove()
    axes[0][0].set_ylabel(y_label)
    axes[1][1].set_ylabel("ML Reco - Original")
    axes[0][0].set_xlabel(x_label)

    logger.debug("[plot_tools.py : make_original_vs_reconstructed_plot2d] Writing plot to file '%s'. ", save_name)
    fig.savefig(save_name, bbox_inches='tight')
    fig.savefig(save_name.replace(".pdf", ".png"), bbox_inches='tight')
    # plt.subplots opens a new figure on every call; close it instead of only
    # clearing it so repeated calls do not accumulate open figures.
    plt.close(fig)

def plot1D(original_hist, reconstructed_hist, run, hist_path, algo, threshold):
"""
plots given original and recontructed histogram. Will plot the MSE plot if the SSE is over the threshold.
Expand Down Expand Up @@ -255,3 +330,182 @@ def plot_roc_curve(h_name, results, save_name, **kwargs):
plt.savefig(save_name.replace(".pdf", ".png"))
plt.clf()

def plot_rescaled_score_hist(data, hist, savename):
    """
    Plot min-max scaled anomaly scores, one log-log histogram per algorithm.

    Each entry of ``data['score']`` is shifted by its minimum and divided by its
    range before being histogrammed. If ``data`` has a 'bad' key, the matching
    entry is scaled with the same minimum and range and drawn on a twin y axis.
    The figure is saved to ``savename`` and again with '.png' replaced by '.pdf'.

    :param data: dictionary with keys 'score' (one array of scores per algorithm), 'algo' (one y-axis label per algorithm) and optionally 'bad' (one array of scores per algorithm)
    :type data: dict
    :param hist: text used as the figure title
    :type hist: str
    :param savename: path of the output file
    :type savename: str
    """
    n_algos = len(data['score'])
    # squeeze = False always returns a 2d array of axes, so the single-panel
    # case needs no special handling.
    fig, axes = plt.subplots(n_algos, 1, figsize = (12, 4*n_algos), squeeze = False)
    axes = axes[:, 0]
    bins = np.logspace(np.log10(1e-4), np.log10(1.0), 100)

    for i in range(n_algos):
        score = data['score'][i]
        score_min = awkward.min(score)
        score_range = awkward.max(score) - score_min
        if score_range == 0:
            # All scores are identical: dividing by the range would give 0/0.
            # Use a unit range so every scaled score is exactly 0.
            score_range = 1
        score = (score - score_min)/score_range

        ax = axes[i]
        ax.hist(score, bins = bins, color = 'tab:blue', alpha = 0.8, label = 'All Runs')
        ax.set_ylabel(data['algo'][i])
        ax.set_yscale('log')
        ax.set_xscale('log')

        if 'bad' in data:
            badax = ax.twinx()
            # Same shift and scale as the full score set, so both histograms
            # share one x axis.
            bad = (data['bad'][i] - score_min)/score_range
            badax.hist(bad, bins = bins, range = (0, 1), color = 'tab:orange', alpha = 0.8, label ='Bad Runs')
            ax.spines['left'].set_color('tab:blue')
            badax.xaxis.label.set_color('tab:orange')
            ax.xaxis.label.set_color('tab:blue')
            if i == 0:
                badax.set_ylabel('Anomalous Runs')
            badax.spines['right'].set_color('tab:orange')
            badax.set_xscale('log')

    fig.suptitle(hist)
    axes[0].legend()
    axes[0].set_title('Min-Max Scaled Anomaly Scores')
    fig.savefig(savename, bbox_inches = 'tight')
    fig.savefig(savename.replace('.png', '.pdf'), bbox_inches = 'tight')
    # Release the figure; otherwise one figure stays open per call.
    plt.close(fig)

def make_training_plots(history, hist, save_file):
    """
    Plot every column of a training history against the epoch index.

    One panel is drawn per column of ``history``, each with a logarithmic y
    axis, and the figure is saved to ``save_file``.

    :param history: table of per-epoch statistics providing ``columns``, ``items()`` and a 'loss' column (presumably a pandas DataFrame -- confirm against caller)
    :param hist: text used as the figure title
    :type hist: str
    :param save_file: path of the output file
    :type save_file: str
    """
    n_stats = len(history.columns)
    logger.debug("[plot_tools.py : make_training_plots] Plotting %d training statistics.", n_stats)

    epochs = range(len(history['loss']))
    # squeeze = False keeps the axes in an array even when there is a single
    # column; otherwise plt.subplots returns a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(1, n_stats, figsize = (n_stats*9, 9), squeeze = False)
    fig.suptitle(hist, fontsize = 22)
    for ax, (stat, y) in zip(axes[0], history.items()):
        ax.plot(epochs, y)
        ax.set_xlabel('Epoch', fontsize = 15)
        ax.set_title(stat, fontsize = 18)
        ax.set_yscale('log')
    fig.savefig(save_file, bbox_inches = 'tight')
    # Release the figure; otherwise one figure stays open per call.
    plt.close(fig)
def multi_exp_plots(paths, xlabel, x, title, legend = None, logx = False, logy = False):
    """
    Draw the four summary panels of make_one_var_exp_plots for one or several experiments.

    With a list of paths, every experiment is overlaid on the same four panels
    and the figure is saved next to the first experiment directory as
    '<sanitised title>_plots.png'. With a single path, the figure is saved as
    'plots.png' inside that directory.

    :param paths: one experiment directory, or a list of them
    :type paths: str or list of str
    :param xlabel: x-axis label of every panel
    :type xlabel: str
    :param x: x positions of the points, forwarded to make_one_var_exp_plots
    :param title: figure title; also the base of the file name when ``paths`` is a list
    :type title: str
    :param legend: label(s) of the experiments, one per path; when omitted each label is derived from its path
    :type legend: str or list of str
    :param logx: use a logarithmic x axis
    :type logx: bool
    :param logy: accepted but currently not used. NOTE(review): confirm whether it should switch the panels to a logarithmic y axis.
    :type logy: bool
    """
    fig, axes = plt.subplots(1, 4, figsize = (36, 9))
    fig.suptitle(title, fontsize = 22)

    if isinstance(paths, list):
        if not legend:
            # One (empty) label per experiment, not per x value.
            legend = [None]*len(paths)
        for idx, path in enumerate(paths):
            make_one_var_exp_plots(path, xlabel, x, axes, legend[idx], logx)

        # Build a file name from the title, without characters that are awkward in paths.
        filename = title.replace(' ', '_').translate(str.maketrans('', '', '()[]{}/.,:;?!@#$^&*'))
        # Save next to (not inside) the first experiment directory.
        parent = os.path.dirname(paths[0].rstrip('/'))
        savepath = os.path.join(parent, filename + '_plots.png')
    else:
        if not legend:
            label = None
        elif isinstance(legend, str):
            label = legend
        else:
            label = legend[0]
        make_one_var_exp_plots(paths, xlabel, x, axes, label, logx)
        # os.path.join also works when the directory has no trailing '/'.
        savepath = os.path.join(paths, 'plots.png')

    logger.info("[plot_tools.py : multi_exp_plots] Writing plot to file '%s'.", savepath)
    fig.savefig(savepath, bbox_inches = 'tight')

def make_one_var_exp_plots(path, xlabel, x, axes, label = None, logx = False):
    """
    Summarise the training logs of one experiment and draw them on four panels.

    ``path`` is expected to contain one sub-directory per scanned value
    ("level"), each holding one or more '.csv' training logs with columns
    'loss', 'val_loss' and 'lr'. For every level the mean and standard
    deviation over its logs are computed for: number of epochs, best train
    loss, best validation loss and final learning rate. Each quantity is drawn
    with error bars on its own entry of ``axes``.

    :param path: experiment directory
    :type path: str
    :param xlabel: x-axis label of every panel
    :type xlabel: str
    :param x: x positions of the points, one per level
    :param axes: at least four axes objects, indexable by position
    :param label: legend label; when empty, the last component of ``path`` is used
    :type label: str
    :param logx: use a logarithmic x axis
    :type logx: bool
    """
    stats = ('Epochs Trained', 'Best Train Loss', 'Best Validation Loss', 'Ending Learning Rate')
    data = {}
    for stat in stats:
        data[stat] = []
        data[stat + ' Std'] = []

    if not label:
        # Name of the experiment directory itself, with or without a trailing '/'.
        label = os.path.basename(path.rstrip('/'))

    # Only sub-directories are levels; plots and other stray files are skipped.
    # NOTE(review): os.listdir returns entries in arbitrary order, so callers
    # must make sure `x` lines up with the order seen here.
    levels = [
        entry for entry in os.listdir(path)
        if '.png' not in entry and os.path.isdir(os.path.join(path, entry))
    ]

    for level in levels:
        level_dir = os.path.join(path, level)
        subset = {stat: [] for stat in stats}
        for fname in os.listdir(level_dir):
            if '.csv' not in fname:
                continue
            df = pd.read_csv(os.path.join(level_dir, fname))
            subset['Epochs Trained'].append(len(df['loss']))
            subset['Best Train Loss'].append(np.min(df['loss']))
            subset['Best Validation Loss'].append(np.min(df['val_loss']))
            subset['Ending Learning Rate'].append(df['lr'].iloc[-1])
        for stat, values in subset.items():
            data[stat].append(np.mean(values))
            data[stat + ' Std'].append(np.std(values))

    for idx, stat in enumerate(stats):
        ax = axes[idx]
        ax.errorbar(x, data[stat], data[stat + ' Std'], label = label)
        ax.set_title(stat, fontsize = 18)
        if logx:
            ax.set_xscale('log')
        if stat == 'Ending Learning Rate':
            # Only the learning-rate panel gets a logarithmic y axis.
            ax.set_yscale('log')
        ax.set_xlabel(xlabel, fontsize = 15)


def multi_exp_bar_plots(paths, xlabel, title, legend = None):
    """
    Draw the summary bar charts of make_one_var_exp_bar_plots for one or several experiments.

    With a list of paths, the bars of every experiment are drawn on the same
    five panels and the figure is saved next to the first experiment directory
    as '<sanitised title>_plots.png'. With a single path, the figure is saved
    as 'plots.png' inside that directory.

    :param paths: one experiment directory, or a list of them
    :type paths: str or list of str
    :param xlabel: tick labels, one per bar group
    :type xlabel: list of str
    :param title: figure title; also the base of the file name when ``paths`` is a list
    :type title: str
    :param legend: label(s) of the experiments, one per path; when omitted each label is derived from its path
    :type legend: str or list of str
    """
    # b and s are layout constants forwarded to make_one_var_exp_bar_plots,
    # which uses them to compute bar positions and widths.
    b = 9
    s = b/(len(xlabel) + .5)
    fig, axes = plt.subplots(1, 5, figsize = (45, 9))
    fig.suptitle(title, fontsize = 22)

    if isinstance(paths, list):
        if not legend:
            legend = [None]*len(paths)
        for idx, path in enumerate(paths):
            make_one_var_exp_bar_plots(path, xlabel, axes, idx, b, s, len(paths), legend[idx])

        # Build a file name from the title, without characters that are awkward in paths.
        filename = title.replace(' ', '_').translate(str.maketrans('', '', '()[]{}/.,:;?!@#$^&*'))
        # Save next to (not inside) the first experiment directory.
        parent = os.path.dirname(paths[0].rstrip('/'))
        savepath = os.path.join(parent, filename + '_plots.png')
    else:
        # A plain string is the label itself; indexing it would keep only its
        # first character.
        if not legend:
            label = None
        elif isinstance(legend, str):
            label = legend
        else:
            label = legend[0]
        make_one_var_exp_bar_plots(paths, xlabel, axes, 0, b, s, 1, label)
        # os.path.join also works when the directory has no trailing '/'.
        savepath = os.path.join(paths, 'plots.png')

    logger.info("[plot_tools.py : multi_exp_bar_plots] Writing plot to file '%s'.", savepath)
    fig.savefig(savepath, bbox_inches = 'tight')

def make_one_var_exp_bar_plots(path, xlabel, axes, i, b, s, n, label = None):
    """
    Summarise the training logs of one experiment and draw them as bar charts.

    ``path`` is expected to contain one sub-directory per scanned value
    ("level"), each holding one or more '.csv' training logs with columns
    'loss', 'val_loss' and 'lr', plus optionally 'mse'. For every level the mean
    and standard deviation over its logs are computed for: number of epochs,
    best train loss, best validation loss, final learning rate and, when an
    'mse' column is present, best MSE. Each quantity is drawn on its own entry
    of ``axes``.

    :param path: experiment directory
    :type path: str
    :param xlabel: tick labels, one per bar
    :type xlabel: list of str
    :param axes: axes objects, indexable by position (four panels, five when an 'mse' column is present)
    :param i: index of this experiment among those sharing the axes; shifts its bars by 2*s*i
    :type i: int
    :param b: distance between consecutive bar groups
    :param s: offset of the first bar of the first experiment
    :param n: number of experiments sharing the axes; every bar is (b - 0.5)/n wide
    :type n: int
    :param label: legend label; when empty, the last component of ``path`` is used
    :type label: str
    """
    log = logging.getLogger(__name__)

    base_stats = ('Epochs Trained', 'Best Train Loss', 'Best Validation Loss', 'Ending Learning Rate')
    data = {}
    for stat in base_stats:
        data[stat] = []
        data[stat + ' Std'] = []

    if not label:
        # Name of the experiment directory itself, with or without a trailing '/'.
        label = os.path.basename(path.rstrip('/'))

    # Only sub-directories are levels; plots, assessment output and other
    # stray files are skipped.
    # NOTE(review): os.listdir returns entries in arbitrary order, so callers
    # must make sure `xlabel` lines up with the order seen here.
    levels = [
        entry for entry in os.listdir(path)
        if '.png' not in entry
        and 'assess' not in entry
        and os.path.isdir(os.path.join(path, entry))
    ]

    for level in levels:
        level_dir = os.path.join(path, level)
        subset = {stat: [] for stat in base_stats}
        for fname in os.listdir(level_dir):
            if '.csv' not in fname:
                continue
            df = pd.read_csv(os.path.join(level_dir, fname))
            subset['Epochs Trained'].append(len(df['loss']))
            subset['Best Train Loss'].append(np.min(df['loss']))
            subset['Best Validation Loss'].append(np.min(df['val_loss']))
            subset['Ending Learning Rate'].append(df['lr'].iloc[-1])
            if 'mse' in df.columns:
                # The MSE panel only exists when at least one log has the column.
                data.setdefault('Best MSE', [])
                data.setdefault('Best MSE Std', [])
                subset.setdefault('Best MSE', []).append(np.min(df['mse']))
        for stat, values in subset.items():
            mean = np.mean(values)
            data[stat].append(mean)
            data[stat + ' Std'].append(np.std(values))
            if 'MSE' in stat:
                log.debug("[plot_tools.py : make_one_var_exp_bar_plots] Level '%s': mean best MSE %s from %s", level, mean, values)

    # Bar positions are the same for every panel.
    x = [s + 2*s*i + j*b for j in range(len(xlabel))]
    panels = [stat for stat in data if 'Std' not in stat]
    for m, stat in enumerate(panels):
        log.debug("[plot_tools.py : make_one_var_exp_bar_plots] %s: %s", stat, data[stat])
        ax = axes[m]
        ax.bar(x, data[stat], yerr = data[stat + ' Std'], width = (b-.5)/n, label = label)
        ax.set_title(stat, fontsize = 18)
        if 'Loss' in stat:
            ax.set_yscale('log')
        ax.set_xticks(x, xlabel, fontsize = 15)
Loading

0 comments on commit dea6a7d

Please sign in to comment.