Merge pull request #27 from AutoDQM/autoencoder_dev_3Feb2022
Developments for autoencoders and assess script
chadfreer authored Feb 23, 2022
2 parents 850ba6a + 8f03834 commit 65968ac
Showing 11 changed files with 1,087 additions and 91 deletions.
7 changes: 1 addition & 6 deletions README.md
@@ -42,15 +42,10 @@ and then rerunning the command to create the `conda` env. The resulting `conda e

**3. Install autodqm-ml**

**Users** can install with:
```
python setup.py install
```
**Developers** are suggested to install with:
Install with:
```
pip install -e .
```
to avoid rerunning the whole installation every time there is a change.

Once your setup is installed, you can activate your python environment with
```
17 changes: 15 additions & 2 deletions autodqm_ml/algorithms/anomaly_detection_algorithm.py
@@ -2,6 +2,7 @@
import pandas
import numpy
import awkward
import json

from autodqm_ml import utils
from autodqm_ml.data_formats.histogram import Histogram
@@ -72,6 +73,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s

if histograms:
self.histograms = histograms
self.histogram_name_map = {} # we replace "/" and spaces in input histogram names to play nicely with other packages; this map lets you go back and forth between the sanitized and original names

logger.debug("[AnomalyDetectionAlgorithm : load_data] Loading training data from file '%s'" % (self.input_file))

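For illustration, the new `histogram_name_map` round-trips sanitized and original names like this (the histogram path below is invented for the example):

```
# Hypothetical example of the round trip; the histogram path is made up.
original = "CSC/Occupancy/hORecHits"
sanitized = original.replace("/", "").replace(" ", "")   # "CSCOccupancyhORecHits"
histogram_name_map = {sanitized : original}
assert histogram_name_map[sanitized] == original         # back to the original name
```
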
@@ -81,6 +83,7 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
# Set helpful metadata
for histogram, histogram_info in self.histograms.items():
self.histograms[histogram]["name"] = histogram.replace("/", "").replace(" ","")
self.histogram_name_map[self.histograms[histogram]["name"]] = histogram

a = awkward.to_numpy(df[histogram][0])
self.histograms[histogram]["shape"] = a.shape
@@ -134,9 +137,9 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
self.n_train = awkward.sum(df.train_label == 0)
self.n_test = awkward.sum(df.train_label == 1)
self.df = df
self.n_histograms = len(list(self.histograms.keys()))


logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (len(list(self.histograms.keys())), self.n_train, self.n_test))
logger.debug("[AnomalyDetectionAlgorithm : load_data] Loaded data for %d histograms with %d events in training set and %d events in testing set." % (self.n_histograms, self.n_train, self.n_test))

self.data_is_loaded = True

@@ -160,3 +163,13 @@ def save(self):
self.output_file = "%s/%s.parquet" % (self.output_dir, self.input_file.split("/")[-1].replace(".parquet", ""))
logger.info("[AnomalyDetectionAlgorithm : save] Saving output with additional fields to file '%s'." % (self.output_file))
awkward.to_parquet(self.df, self.output_file)

self.config_file = "%s/%s_%s.json" % (self.output_dir, self.name, self.tag)
config = {}
for k,v in vars(self).items():
if utils.is_json_serializable(v):
config[k] = v

logger.info("[AnomalyDetectionAlgorithm : save] Saving AnomalyDetectionAlgorithm config to file '%s'." % (self.config_file))
with open(self.config_file, "w") as f_out:
json.dump(config, f_out, sort_keys = True, indent = 4)
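The `save` method above keeps only attributes that survive JSON encoding. A minimal sketch of what `utils.is_json_serializable` could look like (the real helper lives in `autodqm_ml.utils` and may be implemented differently):

```
import json

def is_json_serializable(value):
    """Return True if `value` can be written with json.dump."""
    try:
        json.dumps(value)
        return True
    except (TypeError, ValueError):
        return False
```
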
203 changes: 144 additions & 59 deletions autodqm_ml/algorithms/autoencoder.py
@@ -3,6 +3,7 @@
import numpy
import json
import awkward
import copy

import logging
logger = logging.getLogger(__name__)
@@ -14,26 +15,54 @@
from autodqm_ml import utils

DEFAULT_OPT = {
"batch_size" : 128,
"val_batch_size" : 1024,
"learning_rate" : 0.001,
"n_epochs" : 1000,
"early_stopping" : True,
"early_stopping_rounds" : 3,
"n_hidden_layers" : 2,
"n_nodes" : 25,
"n_nodes" : 50,
"n_components" : 3,
"kernel_1d" : 3,
"kernel_2d" : 3,
"n_filters" : 8
"strides_1d" : 1,
"strides_2d" : 1,
"dropout" : 0.0,
"batch_norm" : False,
"n_filters" : 12
}

class AutoEncoder(MLAlgorithm):
"""
Autoencoder base class.
:param config: dictionary with hyperparameters for autoencoder training. Any hyperparameters not specified will be taken from the default values in `DEFAULT_OPT`
:type config: dict
:param mode: string to specify whether you want to train an autoencoder for each histogram ("individual") or a single autoencoder on all histograms ("simultaneous")
:type mode: str
"""
def __init__(self, **kwargs):
super(AutoEncoder, self).__init__(**kwargs)

self.config = utils.update_dict(
original = DEFAULT_OPT,
new = self.__dict__
new = kwargs.get('config', {})
)

self.mode = kwargs.get('autoencoder_mode', 'individual')
if self.mode is None:
self.mode = "individual"

if self.mode not in ["individual", "simultaneous"]:
logger.exception("[AutoEncoder : __init__] mode '%s' is not a recognized option for AutoEncoder. Currently available modes are 'individual' (default) and 'simultaneous'." % (self.mode))
raise ValueError()
self.models = {}

logger.debug("[AutoEncoder : __init__] Constructing AutoEncoder with the following training options and hyperparameters:")
for param, value in self.config.items():
logger.debug("\t %s : %s" % (param, str(value)))


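As a usage sketch, only the keys passed in `config` override `DEFAULT_OPT`; `config` and `autoencoder_mode` are read directly in this `__init__`, while any other constructor details (tags, output directories) are assumed to be handled by the `MLAlgorithm` base class:

```
from autodqm_ml.algorithms.autoencoder import AutoEncoder

# Hedged sketch: unspecified hyperparameters fall back to DEFAULT_OPT.
ae = AutoEncoder(
    autoencoder_mode = "individual",   # one autoencoder per histogram
    config = {
        "n_epochs" : 200,              # instead of the default 1000
        "learning_rate" : 5e-4,        # instead of the default 0.001
    },
)
```
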
def load_model(self, model_file):
"""
@@ -47,66 +76,103 @@ def save_model(self, model, model_file):
"""
"""
logger.debug("[AutoEncoder : save_model] Saving trained autoencoder to file '%s'." % (model_file))
model.save(model_file)


def train(self, n_epochs = 1000, batch_size = 128):
def train(self):
"""
"""
model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
if os.path.exists(model_file):
logger.warning("[AutoEncoder : train] A trained AutoEncoder alread exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
self.model = self.load_model(model_file)
return

inputs, outputs = self.make_inputs(split = "train")
inputs_val, outputs_val = self.make_inputs(split = "test")

self.model = AutoEncoder_DNN(self.histograms, **self.config).model()

self.model.compile(
optimizer = keras.optimizers.Adam(),
loss = keras.losses.MeanSquaredError()
)
if self.mode == "simultaneous":
self.models = { None : None }
logger.debug("[AutoEncoder : train] Mode selected as 'simultaneous', meaning a single autoencoder will be trained simultaneously on all histograms. Use 'individual' if you wish to train one autoencoder for each histogram.")
elif self.mode == "individual":
self.models = { k : None for k in self.histograms }
logger.debug("[AutoEncoder : train] Mode selected as 'individual', meaning one autoencoder will be trained for each histogram. Use 'simultaneous' if you wish to train a single autoencoder for all histograms.")

for histogram, histogram_info in self.models.items():
if histogram is None:
model_file = "%s/autoencoder_%s.h5" % (self.output_dir, self.tag)
else:
model_file = "%s/autoencoder_%s_%s.h5" % (self.output_dir, histogram, self.tag)

if os.path.exists(model_file):
logger.warning("[AutoEncoder : train] A trained AutoEncoder already exists with tag '%s' at file '%s'. We will load the saved model from the file rather than retraining. If you wish to retrain please provide a new tag or delete the old outputs." % (self.tag, model_file))
self.models[histogram] = self.load_model(model_file)
continue # skip to the next histogram rather than abandoning the rest of the training loop

self.model.fit(
inputs,
outputs,
validation_data = (inputs_val, outputs_val),
callbacks = [keras.callbacks.EarlyStopping(patience = 3)],
epochs = n_epochs,
batch_size = batch_size
)
self.save_model(self.model, model_file)


def predict(self, batch_size = 1024):
inputs, outputs = self.make_inputs(split = "all")
pred = self.model.predict(inputs, batch_size = batch_size)

idx = 0
for histogram, histogram_info in self.histograms.items():
original_hist = self.df[histogram]
if len(self.histograms.items()) >= 2:
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred[idx]), axis = -1)
else:
reconstructed_hist = awkward.flatten(awkward.from_numpy(pred), axis = -1)
inputs, outputs = self.make_inputs(split = "train", histogram_name = histogram)
inputs_val, outputs_val = self.make_inputs(split = "test", histogram_name = histogram)

sse = awkward.sum(
(original_hist - reconstructed_hist) ** 2,
axis = -1
if histogram is None:
hist_name = str(list(self.models.keys()))
else:
hist_name = histogram
logger.debug("[AutoEncoder : train] Training autoencoder with %d dimensions in latent space for histogram(s) '%s' with %d training examples." % (self.config["n_components"], hist_name, len(list(inputs.values())[0])))

if self.mode == "simultaneous":
histograms = self.histograms
elif self.mode == "individual":
histograms = { histogram : self.histograms[histogram] }

model = AutoEncoder_DNN(histograms, **self.config).model()

model.compile(
optimizer = keras.optimizers.Adam(learning_rate = self.config["learning_rate"]),
loss = keras.losses.MeanSquaredError()
)

# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if histogram_info["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1)

self.add_prediction(histogram, sse, reconstructed_hist)
idx += 1
callbacks = []
if self.config["early_stopping"]:
callbacks.append(keras.callbacks.EarlyStopping(patience = self.config["early_stopping_rounds"]))

model.fit(
inputs,
outputs,
validation_data = (inputs_val, outputs_val),
callbacks = callbacks,
epochs = self.config["n_epochs"],
batch_size = self.config["batch_size"]
)

self.save_model(model, model_file)
self.models[histogram] = model


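One detail worth noting about the early-stopping setup in `train`: `keras.callbacks.EarlyStopping` monitors `val_loss` by default, so the `validation_data` passed to `fit` is what actually halts training. The callback built above, written out with its Keras defaults made explicit:

```
# Equivalent explicit form of the early-stopping callback used in train().
cb = keras.callbacks.EarlyStopping(
    monitor = "val_loss",          # Keras default: track the validation loss
    patience = 3,                  # DEFAULT_OPT["early_stopping_rounds"]
    restore_best_weights = False,  # Keras default: keep the final-epoch weights
)
```
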
def predict(self, batch_size = 1024):
for histogram, model in self.models.items():
inputs, outputs = self.make_inputs(split = "all", histogram_name = histogram)
predictions = model.predict(inputs, batch_size = batch_size)

def make_inputs(self, split = None):
if self.mode == "simultaneous" and self.n_histograms >= 2:
predictions = { name : pred for name, pred in zip(model.output_names, predictions) }
else:
predictions = { model.output_names[0] : predictions }

for name, pred in predictions.items():
hist_name = self.histogram_name_map[name.replace("output_", "")] # shape [n_runs, histogram dimensions, 1]
original_hist = self.df[hist_name] # shape [n_runs, histogram dimensions]

reconstructed_hist = awkward.flatten( # change shape from [n_runs, histogram dimensions, 1] -> [n_runs, histogram dimensions]
awkward.from_numpy(pred),
axis = -1
)

sse = awkward.sum( # perform sum along inner-most axis, i.e. first histogram dimension
(original_hist - reconstructed_hist) ** 2,
axis = -1
)

# For 2d histograms, we need to sum over one more axis to get a single SSE score for each run
if self.histograms[hist_name]["n_dim"] == 2:
sse = awkward.sum(sse, axis = -1) # second histogram dimension

self.add_prediction(hist_name, sse, reconstructed_hist)


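The SSE reduction in `predict` collapses the histogram axes one at a time. A minimal numpy sketch of the same arithmetic (shapes invented for illustration; `awkward.sum` behaves the same way on these regular arrays):

```
import numpy as np

n_runs, n_bins = 5, 10
orig_1d = np.random.rand(n_runs, n_bins)             # [n_runs, n_bins]
reco_1d = np.random.rand(n_runs, n_bins)
sse_1d = np.sum((orig_1d - reco_1d) ** 2, axis = -1) # one score per run -> shape (5,)

orig_2d = np.random.rand(n_runs, n_bins, n_bins)     # [n_runs, n_x, n_y]
reco_2d = np.random.rand(n_runs, n_bins, n_bins)
sse_2d = np.sum((orig_2d - reco_2d) ** 2, axis = -1) # sum over y -> (5, 10)
sse_2d = np.sum(sse_2d, axis = -1)                   # sum over x -> one score per run
```
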
def make_inputs(self, split = None, histogram_name = None):
"""
"""
@@ -118,27 +184,29 @@ def make_inputs(self, split = None):
elif split == "test":
cut = self.df.train_label == 1
else:
cut = self.df.train_label >= 0
cut = self.df.run_number >= 0 # dummy all True cut

df = self.df[cut]

for histogram, info in self.histograms.items():
if histogram_name is not None: # self.mode == "individual", i.e. separate autoencoder for each histogram
if not histogram == histogram_name: # only grab the relevant histogram for this autoencoder
continue

data = tf.convert_to_tensor(df[histogram])
inputs["input_" + info["name"]] = data
outputs["output_" + info["name"]] = data



return inputs, outputs


class AutoEncoder_DNN(keras.models.Model):
class AutoEncoder_DNN():
"""
Model defined through the Keras Model Subclassing API: https://www.tensorflow.org/guide/keras/custom_layers_and_models
An AutoEncoder instance owns a single AutoEncoder_DNN, which is the actual implementation of the DNN.
An AutoEncoder instance owns one or more AutoEncoder_DNN objects, which hold the actual implementation of the DNN.
"""
def __init__(self, histograms, **kwargs):
super(AutoEncoder_DNN, self).__init__()

self.n_histograms = len(histograms.keys())

self.__dict__.update(kwargs)
@@ -179,7 +247,11 @@ def __init__(self, histograms, **kwargs):


def model(self):
model = keras.models.Model(inputs = self.inputs, outputs = self.outputs)
model = keras.models.Model(
inputs = self.inputs,
outputs = self.outputs,
name = "autoencoder"
)
model.summary()
return model

@@ -209,7 +281,12 @@ def build_encoder(self, histogram, info):
activation = "relu",
name = name
)(layer)

if self.batch_norm:
layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer) # apply to the running tensor
if self.dropout > 0:
layer = keras.layers.Dropout(self.dropout, name = name + "_dropout")(layer)


encoder = keras.layers.Flatten()(layer)
return input, encoder

@@ -229,10 +306,14 @@ def build_decoder(self, histogram, info, input):
activation = "relu"
n_filters = 1
name = "output_%s" % (info["name"])
batch_norm = False
dropout = 0
else:
activation = "relu"
n_filters = self.n_filters
name = "decoder_%d_%s" % (i, info["name"])
batch_norm = self.batch_norm
dropout = self.dropout

if info["n_dim"] == 1:
layer = keras.layers.Conv1DTranspose(
Expand All @@ -252,6 +333,10 @@ def build_decoder(self, histogram, info, input):
activation = activation,
name = name
)(layer)
if batch_norm:
layer = keras.layers.BatchNormalization(name = name + "_batch_norm")(layer) # apply to the running tensor
if dropout > 0:
layer = keras.layers.Dropout(dropout, name = name + "_dropout")(layer)

output = layer
return output
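To make the encoder/decoder pattern above concrete, here is a self-contained toy version of the same Conv/ConvTranspose layout for a single 1d histogram (layer counts, shapes, and the latent size are illustrative, not the repo's exact architecture):

```
from tensorflow import keras

n_bins = 50                                                   # toy 1d histogram length
inp = keras.Input(shape = (n_bins, 1), name = "input_toy")

# encoder: stacked 1d convolutions, then flatten into a small latent space
x = keras.layers.Conv1D(12, 3, activation = "relu")(inp)      # -> (48, 12)
x = keras.layers.Conv1D(12, 3, activation = "relu")(x)        # -> (46, 12)
latent = keras.layers.Dense(3, activation = "relu")(keras.layers.Flatten()(x))

# decoder: project back up and mirror the encoder with transposed convolutions
x = keras.layers.Dense(46 * 12, activation = "relu")(latent)
x = keras.layers.Reshape((46, 12))(x)
x = keras.layers.Conv1DTranspose(12, 3, activation = "relu")(x)              # -> (48, 12)
out = keras.layers.Conv1DTranspose(1, 3, activation = "relu",
                                   name = "output_toy")(x)                   # -> (50, 1)

model = keras.Model(inputs = inp, outputs = out, name = "toy_autoencoder")
model.compile(optimizer = keras.optimizers.Adam(), loss = keras.losses.MeanSquaredError())
model.summary()
```
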
3 changes: 2 additions & 1 deletion autodqm_ml/algorithms/pca.py
@@ -64,6 +64,8 @@ def save_model(self, pca, model_file):
:param model_file: folder name to place trained PCA pickles
:type model_file: str
"""
logger.debug("[PCA : save_model] Saving trained PCA to file '%s'." % (model_file))

os.system("mkdir -p %s" % self.output_dir)
pcaParams = {
'name' : model_file.split("/")[-1].replace(".json", ""),
@@ -141,7 +143,6 @@ def train(self):
pca.fit(input)
self.model[histogram] = pca

logger.debug("[PCA : train] Saving trained PCA to file '%s'." % (model_file))
self.save_model(pca, model_file)

