Working on #18 : make autoencoders configurable through json, add option to train either single ae for all histograms or one ae per histogram (default)
sam-may committed Feb 3, 2022
1 parent 6f27045 commit 29124cb
Showing 4 changed files with 174 additions and 56 deletions.
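For context, a rough sketch of how the options introduced in this commit might be used (the config file name and keys below are hypothetical, and any other arguments expected by the MLAlgorithm base class are omitted):

import json
from autodqm_ml.algorithms.autoencoder import AutoEncoder

# hypothetical config file; any hyperparameters it does not set fall back to DEFAULT_OPT
with open("autoencoder_config.json", "r") as f_in:
    config = json.load(f_in)   # e.g. {"n_hidden_layers" : 3, "n_components" : 5}

# "individual" (default): one autoencoder per histogram
# "simultaneous": a single autoencoder trained on all histograms at once
autoencoder = AutoEncoder(config = config, autoencoder_mode = "simultaneous")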
6 changes: 4 additions & 2 deletions autodqm_ml/algorithms/anomaly_detection_algorithm.py
@@ -72,6 +72,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s

if histograms:
self.histograms = histograms
self.histogram_name_map = {} # we replace "/" and spaces in input histogram names to play nicely with other packages, this map lets you go back and forth between them

logger.debug("[AnomalyDetectionAlgorithm : load_data] Loading training data from file '%s'" % (self.input_file))

@@ -81,6 +82,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
# Set helpful metadata
for histogram, histogram_info in self.histograms.items():
self.histograms[histogram]["name"] = histogram.replace("/", "").replace(" ","")
self.histogram_name_map[self.histograms[histogram]["name"]] = histogram

a = awkward.to_numpy(df[histogram][0])
self.histograms[histogram]["shape"] = a.shape
@@ -134,9 +136,9 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
self.n_train = awkward.sum(df.train_label == 0)
self.n_test = awkward.sum(df.train_label == 1)
self.df = df
self.n_histograms = len(list(self.histograms.keys()))


logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (len(list(self.histograms.keys())), self.n_train, self.n_test))
logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (self.n_histograms, self.n_train, self.n_test))

self.data_is_loaded = True

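As an aside, the histogram_name_map added above simply inverts the name sanitization, so that model outputs (which are keyed by the sanitized name) can be matched back to the original histogram; a small sketch with a hypothetical histogram path:

histogram = "CSC/Occupancy/hORecHits"                     # hypothetical original name
sanitized = histogram.replace("/", "").replace(" ", "")   # "CSCOccupancyhORecHits"
histogram_name_map = { sanitized : histogram }
assert histogram_name_map[sanitized] == histogram         # round trip back to the original name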
214 changes: 161 additions & 53 deletions autodqm_ml/algorithms/autoencoder.py
@@ -3,6 +3,7 @@
import numpy
import json
import awkward
import copy

import logging
logger = logging.getLogger(__name__)
@@ -14,26 +15,46 @@
from autodqm_ml import utils

DEFAULT_OPT = {
"batch_size" : 16,
"val_batch_size" : 1024,
"n_epochs" : 1000,
"early_stopping" : True,
"early_stopping_rounds" : 3,
"n_hidden_layers" : 2,
"n_nodes" : 25,
"n_nodes" : 10,
"n_components" : 3,
"kernel_1d" : 3,
"kernel_2d" : 3,
"strides_1d" : 1,
"strides_2d" : 1,
"dropout" : 0.0,
"batch_norm" : False,
"n_filters" : 8
}

class AutoEncoder(MLAlgorithm):
"""
Autoencoder base class.
:param config: dictionary with hyperparameters for autoencoder training. Any hyperparameters not specified will be taken from the default values in `DEFAULT_OPT`
:type config: dict
:param mode: string to specify whether you want to train an autoencoder for each histogram ("individual") or a single autoencoder on all histograms ("simultaneous")
:type mode: str
"""
def __init__(self, **kwargs):
super(AutoEncoder, self).__init__(**kwargs)

self.config = utils.update_dict(
original = DEFAULT_OPT,
new = self.__dict__
new = kwargs.get('config', {})
)

self.mode = kwargs.get('autoencoder_mode', 'individual')
if not self.mode in ["individual", "simultaneous"]:
logger.exception("AutoEncoder : __init__] mode '%s' is not a recognized option for AutoEncoder. Currently available modes are 'individual' (default) and 'simultaneous'." % (self.mode))
raise ValueError()
self.models = {}
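
For illustration, the intended effect of the config merge above, assuming utils.update_dict overlays user-supplied keys onto the defaults (that behavior is an assumption, not shown in this diff):

from autodqm_ml import utils
from autodqm_ml.algorithms.autoencoder import DEFAULT_OPT

user_config = { "n_hidden_layers" : 3, "n_components" : 5 }   # e.g. parsed from a json file
merged = utils.update_dict(original = DEFAULT_OPT, new = user_config)
# merged["n_hidden_layers"] == 3 and merged["n_components"] == 5,
# while every other hyperparameter keeps its DEFAULT_OPT value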


def load_model(self, model_file):
"""
@@ -47,66 +68,129 @@ def save_model(self, model, model_file):
"""
"""
logger.debug("[AutoEncoder : save_model] Saving trained autoencoder to file '%s'." % (model_file))
model.save(model_file)


def train(self, n_epochs = 1000, batch_size = 128):
def train(self):
"""
"""
model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
if os.path.exists(model_file):
logger.warning("[AutoEncoder : train] A trained AutoEncoder alread exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
self.model = self.load_model(model_file)
return

inputs, outputs = self.make_inputs(split = "train")
inputs_val, outputs_val = self.make_inputs(split = "test")

self.model = AutoEncoder_DNN(self.histograms, **self.config).model()

self.model.compile(
optimizer = keras.optimizers.Adam(),
loss = keras.losses.MeanSquaredError()
)
if self.mode == "simultaneous":
self.models = { None : None }
logger.debug("[AutoEncoder : train] Mode selected as 'simultaneous', meaning a single autoencoder will be trained simultaneously on all histograms. Use 'individual' if you wish to train one autoencoder for each histogram.")
elif self.mode == "individual":
self.models = { k : None for k,v in self.histograms.items() } #copy.deepcopy(self.histograms)
logger.debug("[AutoEncoder : train] Mode selected as 'individual', meaning one autoencoder will be trained for each histogram. Use 'simultaneous' if you wish to train a single autoencoder for all histograms.")

for histogram, histogram_info in self.models.items():
if histogram is None:
model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
else:
model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag)

if os.path.exists(model_file):
logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
self.models[histogram] = self.load_model(model_file)
return

self.model.fit(
inputs,
outputs,
validation_data = (inputs_val, outputs_val),
callbacks = [keras.callbacks.EarlyStopping(patience = 3)],
epochs = n_epochs,
batch_size = batch_size
)
self.save_model(self.model, model_file)


def predict(self, batch_size = 1024):
inputs, outputs = self.make_inputs(split = "all")
pred = self.model.predict(inputs, batch_size = batch_size)

idx = 0
for histogram, histogram_info in self.histograms.items():
original_hist = self.df[histogram]
if len(self.histograms.items()) >= 2:
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred[idx]), axis = -1)
else:
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred), axis = -1)
inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram)
inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram)

sse = awkward.sum(
(original_hist - reconstructed_hist) ** 2,
axis = -1
if histogram is None:
hist_name = str(list(self.models.keys()))
else:
hist_name = histogram
logger.debug("[AutoEncoder : train] Training autoencoder with %d dimensions in latent space for histogram(s) '%s' with %d training examples." % (self.config["n_components"], hist_name, len(list(inputs.values())[0])))

if self.mode == "simultaneous":
histograms = self.histograms
elif self.mode == "individual":
histograms = { histogram : self.histograms[histogram] }

model = AutoEncoder_DNN(histograms, **self.config).model()

model.compile(
optimizer = keras.optimizers.Adam(),
loss = keras.losses.MeanSquaredError()
)

# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if histogram_info["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1)

self.add_prediction(histogram, sse, reconstructed_hist)
idx += 1
callbacks = []
if self.config["early_stopping"]:
callbacks.append(keras.callbacks.EarlyStopping(patience = self.config["early_stopping_rounds"]))

model.fit(
inputs,
outputs,
validation_data = (inputs_val, outputs_val),
callbacks = callbacks,
epochs = self.config["n_epochs"],
batch_size = self.config["batch_size"]
)

self.save_model(model, model_file)
self.models[histogram] = model


def predict(self, batch_size = 1024):
for histogram, model in self.models.items():
inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram)
predictions = model.predict(inputs, batch_size = batch_size)

def make_inputs(self, split = None):
if self.mode == "simultaneous" and self.n_histograms >= 2:
predictions = { name : pred for name, pred in zip(model.output_names, predictions) }
else:
predictions = { model.output_names[0] : predictions }

for name, pred in predictions.items():
hist_name = self.histogram_name_map[name.replace("output_", "")] # shape [n_runs, histogram dimensions, 1]
original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions]

reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions]
awkward.from_numpy(pred),
axis = -1
)

sse = awkward.sum( # perform sum along inner-most axis, i.e. first histogram dimension
(original_hist - reconstructed_hist) ** 2,
axis = -1
)

# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if self.histograms[hist_name]["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1) # second histogram dimension

self.add_prediction(hist_name, sse, reconstructed_hist)

"""
idx = 0
for histogram, histogram_info in self.histograms.items():
original_hist = self.df[histogram]
if self.n_histograms >= 2:
#reconstructed_hist = awkward.flatten(
# awkward.from_numpy(pred["output_" + histogram_info["name"]]),
# axis = -1
#)
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred[idx]), axis = -1)
else:
#reconstructed_hist = awkward.flatten(awkward.from_numpy(pred), axis = -1)
sse = awkward.sum(
(original_hist - reconstructed_hist) ** 2,
axis = -1
)
# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if histogram_info["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1)
self.add_prediction(histogram, sse, reconstructed_hist)
idx += 1
"""


def make_inputs(self, split = None, histogram_name = None):
"""
"""
@@ -123,21 +207,28 @@ def make_inputs(self, split = None):
df = self.df[cut]

for histogram, info in self.histograms.items():
if histogram_name is not None: # self.mode == "individual", i.e. separate autoencoder for each histogram
if not histogram == histogram_name: # only grab the relevant histogram for this autoencoder
continue

data = tf.convert_to_tensor(df[histogram])
inputs["input_" + info["name"]] = data
outputs["output_" + info["name"]] = data



return inputs, outputs


class AutoEncoder_DNN(keras.models.Model):
#class AutoEncoder_DNN(keras.models.Model):
class AutoEncoder_DNN():
"""
Model defined through the Keras Model Subclassing API: https://www.tensorflow.org/guide/keras/custom_layers_and_models
An AutoEncoder instance owns a single AutoEncoder_DNN, which is the actual implementation of the DNN.
"""
def __init__(self, histograms, **kwargs):
super(AutoEncoder_DNN, self).__init__()
#super(AutoEncoder_DNN, self).__init__()

self.n_histograms = len(histograms.keys())

@@ -179,7 +270,11 @@ def __init__(self, histograms, **kwargs):


def model(self):
model = keras.models.Model(inputs = self.inputs, outputs = self.outputs)
model = keras.models.Model(
inputs = self.inputs,
outputs = self.outputs,
name = "autoencoder"
)
model.summary()
return model

@@ -209,7 +304,12 @@ def build_encoder(self, histogram, info):
activation = "relu",
name = name
)(layer)

if self.batch_norm:
layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
if self.dropout > 0:
layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)


encoder = keras.layers.Flatten()(layer)
return input, encoder

@@ -229,10 +329,14 @@ def build_decoder(self, histogram, info, input):
activation = "relu"
n_filters = 1
name = "output_%s" % (info["name"])
batch_norm = False
dropout = 0
else:
activation = "relu"
n_filters = self.n_filters
name = "decoder_%d_%s" % (i, info["name"])
batch_norm = self.batch_norm
dropout = self.dropout

if info["n_dim"] == 1:
layer = keras.layers.Conv1DTranspose(
@@ -252,6 +356,10 @@
activation = activation,
name = name
)(layer)
if batch_norm:
layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer)
if dropout > 0:
layer = keras.layers.Dropout(dropout, name = name + "_dropout")(layer)

output = layer
return output
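For reference, the functional-API pattern AutoEncoder_DNN now relies on (named inputs and outputs wrapped in keras.models.Model rather than subclassing it); a toy single-histogram version with a made-up architecture:

from tensorflow import keras

# toy stand-in, not the convolutional encoder/decoder built above
input_hist = keras.layers.Input(shape = (50,), name = "input_some_histogram")
hidden = keras.layers.Dense(10, activation = "relu")(input_hist)
output_hist = keras.layers.Dense(50, activation = "relu", name = "output_some_histogram")(hidden)

model = keras.models.Model(inputs = [input_hist], outputs = [output_hist], name = "autoencoder")
model.compile(optimizer = keras.optimizers.Adam(), loss = keras.losses.MeanSquaredError())
model.summary()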
3 changes: 2 additions & 1 deletion autodqm_ml/algorithms/pca.py
@@ -64,6 +64,8 @@ def save_model(self, pca, model_file):
:param model_file: folder name to place trained PCA pickles
:type model_file: str
"""
logger.debug("[PCA : save_model] Saving trained PCA to file '%s'." % (model_file))

os.system("mkdir -p %s" % self.output_dir)
pcaParams = {
'name' : model_file.split("/")[-1].replace(".json", ""),
@@ -141,7 +143,6 @@ def train(self):
pca.fit(input)
self.model[histogram] = pca

logger.debug("[PCA : train] Saving trained PCA to file '%s'." % (model_file))
self.save_model(pca, model_file)


7 changes: 7 additions & 0 deletions scripts/train.py
@@ -58,6 +58,13 @@
required = False,
default = None
)
parser.add_argument(
"--autoencoder_mode",
help = "specify whether you want to train an autoencoder for each histogram ('individual') or a single autoencoder on all histograms ('simultaneous')",
type = str,
required = False,
default = None
)
parser.add_argument(
"--debug",
help = "run logger in DEBUG mode (INFO is default)",
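Presumably the parsed value is then forwarded to the AutoEncoder constructor elsewhere in train.py; that wiring is not part of this diff, so the sketch below is an assumption:

# hypothetical wiring, not shown in this commit
args = parser.parse_args()
autoencoder = AutoEncoder(
    config = config,                                           # dict loaded from a json config file (hypothetical)
    autoencoder_mode = args.autoencoder_mode or "individual"   # 'individual' (default) or 'simultaneous'
)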
