Skip to content

Commit

Permalink
Working on #18: add sse histograms by good/bad runs, if applicable
Browse files Browse the repository at this point in the history
  • Loading branch information
sam-may committed Nov 23, 2021
1 parent 159de15 commit 26f808b
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 8 deletions.
2 changes: 2 additions & 0 deletions autodqm_ml/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Run-quality labels used throughout autodqm_ml (e.g. assess.py splits runs
# into "anomalous" vs "good" sets by comparing a run's label to these values).
kANOMALOUS = 1
kGOOD = 0
35 changes: 27 additions & 8 deletions scripts/assess.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from autodqm_ml.utils import setup_logger
from autodqm_ml.utils import expand_path
from autodqm_ml.plotting.plot_tools import make_original_vs_reconstructed_plot, make_sse_plot
from autodqm_ml.constants import kANOMALOUS, kGOOD

def parse_arguments():
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -123,14 +124,32 @@ def main(args):

# Histogram of sse for algorithms
for h, info in histograms.items():
for set, id in zip(["train", "test"], [0, 1]):
runs_set = runs[runs.train_label == id]
recos = {}
for algorithm, algorithm_info in info["algorithms"].items():
recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
h_name = h.replace("/", "").replace(" ", "")
save_name = args.output_dir + "/" + h_name + "_sse_%s.pdf" % set
make_sse_plot(h_name, recos, save_name)
splits = {
"train_label" : [("train", 0), ("test", 1)],
"label" : [("anomalous", kANOMALOUS), ("good", kGOOD)]
}
for split, split_info in splits.items():
for name, id in split_info:
runs_set = runs[runs[split] == id]
if len(runs_set) == 0:
logger.warning("[assess.py] For histogram '%s', no runs belong to the set '%s', skipping making a histogram of SSE for this." % (h, name))
continue
recos = {}
for algorithm, algorithm_info in info["algorithms"].items():
recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
h_name = h.replace("/", "").replace(" ", "")
save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (split, name)
make_sse_plot(h_name, recos, save_name)


#for set, id in zip(["train", "test"], [0, 1]):
# runs_set = runs[runs.train_label == id]
# recos = {}
# for algorithm, algorithm_info in info["algorithms"].items():
# recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
# h_name = h.replace("/", "").replace(" ", "")
# save_name = args.output_dir + "/" + h_name + "_sse_%s.pdf" % set
# make_sse_plot(h_name, recos, save_name)

# Plots of original/reconstructed histograms
if args.runs is None:
Expand Down
133 changes: 133 additions & 0 deletions scripts/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import os
import json
import argparse

from autodqm_ml.utils import setup_logger
from autodqm_ml.algorithms.statistical_tester import StatisticalTester
from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
from autodqm_ml.algorithms.pca import PCA
from autodqm_ml.algorithms.autoencoder import AutoEncoder
from autodqm_ml.utils import expand_path

# Command-line driver: train an anomaly-detection algorithm (PCA, AutoEncoder,
# or StatisticalTester) on a set of DQM histograms, run prediction, and save
# the model plus a dataframe with scores zipped in.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--output_dir",
    help = "output directory to place files in",
    type = str,
    required = False,
    default = "output"
)
parser.add_argument(
    "--tag",
    help = "tag to identify output files",
    type = str,
    required = False,
    default = "test"
)
parser.add_argument(
    "--algorithm",
    help = "name of algorithm ('PCA' or 'Autoencoder' or 'StatisticalTester') to train with default options OR path to json file specifying particular options for training a given algorithm.",
    type = str,
    required = True
)
parser.add_argument(
    "--input_file",
    help = "input file (i.e. output from fetch_data.py) to use for training the ML algorithm",
    type = str,
    required = False,
    default = None
)
parser.add_argument(
    "--histograms",
    help = "csv list of histograms on which to train the ML algorithm. If multiple are supplied, for PCAs one PCA will be trained for each histogram, while for autoencoders, a single AutoEncoder taking each of the histograms as inputs will be trained.",
    type = str,
    required = False,
    default = None
)
parser.add_argument(
    "--reference",
    help = "reference run number to use for comparisons with StatisticalTester",
    type = int,
    required = False,
    default = None
)
parser.add_argument(
    "--n_components",
    help = "dimension of latent space (number of principle components for PCA)",
    type = int,
    required = False,
    default = None
)
parser.add_argument(
    "--debug",
    help = "run logger in DEBUG mode (INFO is default)",
    required = False,
    action = "store_true"
)

args = parser.parse_args()
# Create the output directory portably (was `os.system("mkdir -p ...")`).
os.makedirs(args.output_dir, exist_ok = True)

logger_mode = "DEBUG" if args.debug else "INFO"
# Fixed: log file was named "fetch_data_log_...", a copy-paste from fetch_data.py.
log_file = "%s/train_log_%s.txt" % (args.output_dir, args.tag)
logger = setup_logger(logger_mode, log_file)

# --algorithm may be either a bare algorithm name or a path to a json config
# file with detailed training options.
if "json" in args.algorithm:
    if not os.path.exists(args.algorithm):
        # Path not found as given: try expanding it relative to the package.
        algorithm_config_file = expand_path(args.algorithm)
    else:
        # Fixed: previously assigned the undefined name `algo`, which raised a
        # NameError whenever the json path existed as given.
        algorithm_config_file = args.algorithm
    with open(algorithm_config_file, "r") as f_in:
        config = json.load(f_in)

else:
    # No json config: build the config straight from the CLI arguments.
    config = vars(args)
    config["name"] = args.algorithm.lower()

if not config["name"] in ["autoencoder", "pca", "statistical_tester"]:
    # Fixed: message now lists all supported algorithms (it omitted
    # 'statistical_tester') and is passed to the raised exception.
    message = "[train.py] Requested algorithm '%s' is not in the supported list of algorithms ['autoencoder', 'pca', 'statistical_tester']." % (config["name"])
    logger.exception(message)
    raise RuntimeError(message)

# Instantiate the requested algorithm with the assembled config.
if config["name"] == "pca":
    algorithm = PCA(**config)
elif config["name"] == "autoencoder":
    algorithm = AutoEncoder(**config)
elif config["name"] == "statistical_tester":
    algorithm = StatisticalTester(**config)

# The input file and histogram list may come from the CLI or from the json
# config; the CLI takes precedence. Missing from both is a fatal error.
if args.input_file is None and "input_file" not in config.keys():
    message = "[train.py] An input file for training the ML algorithm was not supplied through CLI nor found in the json config file for the algorithm."
    logger.exception(message)
    raise RuntimeError(message)

if args.histograms is None and "histograms" not in config.keys():
    message = "[train.py] A list of histograms to train on was not supplied through CLI nor found in the json config file for the algorithm."
    logger.exception(message)
    raise RuntimeError(message)

if args.input_file is not None:
    training_file = args.input_file
else:
    training_file = config["input_file"]

if args.histograms is not None:
    # CLI histograms are a csv list; each is normalized by default.
    histograms = {x : { "normalize" : True} for x in args.histograms.split(",")}
else:
    histograms = config["histograms"]

# Load data
algorithm.load_data(
    file = training_file,
    histograms = histograms
)

# Train (only ML algorithms train; StatisticalTester has no training step)
if isinstance(algorithm, MLAlgorithm):
    algorithm.train()

# Predict
algorithm.predict()

# Save model and new df with score zipped in
algorithm.save()

0 comments on commit 26f808b

Please sign in to comment.