Skip to content

Commit

Permalink
Working on #18: add sse histograms by good/bad runs, if applicable
Browse files Browse the repository at this point in the history
  • Loading branch information
sam-may committed Nov 23, 2021
1 parent 159de15 commit 26f808b
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 8 deletions.
2 changes: 2 additions & 0 deletions autodqm_ml/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Run-quality labels used throughout autodqm_ml (e.g. assess.py splits runs
# into "anomalous" vs "good" sets by comparing a run's label to these values).
kANOMALOUS = 1
kGOOD = 0
35 changes: 27 additions & 8 deletions scripts/assess.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from autodqm_ml.utils import setup_logger
from autodqm_ml.utils import expand_path
from autodqm_ml.plotting.plot_tools import make_original_vs_reconstructed_plot, make_sse_plot
from autodqm_ml.constants import kANOMALOUS, kGOOD

def parse_arguments():
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -123,14 +124,32 @@ def main(args):

# Histogram of sse for algorithms
for h, info in histograms.items():
for set, id in zip(["train", "test"], [0, 1]):
runs_set = runs[runs.train_label == id]
recos = {}
for algorithm, algorithm_info in info["algorithms"].items():
recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
h_name = h.replace("/", "").replace(" ", "")
save_name = args.output_dir + "/" + h_name + "_sse_%s.pdf" % set
make_sse_plot(h_name, recos, save_name)
splits = {
"train_label" : [("train", 0), ("test", 1)],
"label" : [("anomalous", kANOMALOUS), ("good", kGOOD)]
}
for split, split_info in splits.items():
for name, id in split_info:
runs_set = runs[runs[split] == id]
if len(runs_set) == 0:
logger.warning("[assess.py] For histogram '%s', no runs belong to the set '%s', skipping making a histogram of SSE for this." % (h, name))
continue
recos = {}
for algorithm, algorithm_info in info["algorithms"].items():
recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
h_name = h.replace("/", "").replace(" ", "")
save_name = args.output_dir + "/" + h_name + "_sse_%s_%s.pdf" % (split, name)
make_sse_plot(h_name, recos, save_name)


#for set, id in zip(["train", "test"], [0, 1]):
# runs_set = runs[runs.train_label == id]
# recos = {}
# for algorithm, algorithm_info in info["algorithms"].items():
# recos[algorithm] = { "score" : runs_set[algorithm_info["score"]] }
# h_name = h.replace("/", "").replace(" ", "")
# save_name = args.output_dir + "/" + h_name + "_sse_%s.pdf" % set
# make_sse_plot(h_name, recos, save_name)

# Plots of original/reconstructed histograms
if args.runs is None:
Expand Down
133 changes: 133 additions & 0 deletions scripts/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import os
import json
import argparse

from autodqm_ml.utils import setup_logger
from autodqm_ml.algorithms.statistical_tester import StatisticalTester
from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
from autodqm_ml.algorithms.pca import PCA
from autodqm_ml.algorithms.autoencoder import AutoEncoder
from autodqm_ml.utils import expand_path

# Command-line driver: train an anomaly-detection algorithm (PCA, AutoEncoder,
# or StatisticalTester) on a set of DQM histograms, run prediction, and save
# the model plus a dataframe with scores zipped in.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--output_dir",
    help = "output directory to place files in",
    type = str,
    required = False,
    default = "output"
)
parser.add_argument(
    "--tag",
    help = "tag to identify output files",
    type = str,
    required = False,
    default = "test"
)
parser.add_argument(
    "--algorithm",
    help = "name of algorithm ('PCA' or 'Autoencoder' or 'StatisticalTester') to train with default options OR path to json file specifying particular options for training a given algorithm.",
    type = str,
    required = True
)
parser.add_argument(
    "--input_file",
    help = "input file (i.e. output from fetch_data.py) to use for training the ML algorithm",
    type = str,
    required = False,
    default = None
)
parser.add_argument(
    "--histograms",
    help = "csv list of histograms on which to train the ML algorithm. If multiple are supplied, for PCAs one PCA will be trained for each histogram, while for autoencoders, a single AutoEncoder taking each of the histograms as inputs will be trained.",
    type = str,
    required = False,
    default = None
)
parser.add_argument(
    "--reference",
    help = "reference run number to use for comparisons with StatisticalTester",
    type = int,
    required = False,
    default = None
)
parser.add_argument(
    "--n_components",
    help = "dimension of latent space (number of principle components for PCA)",
    type = int,
    required = False,
    default = None
)
parser.add_argument(
    "--debug",
    help = "run logger in DEBUG mode (INFO is default)",
    required = False,
    action = "store_true"
)

args = parser.parse_args()
# Create the output directory portably (was `os.system("mkdir -p ...")`).
os.makedirs(args.output_dir, exist_ok = True)

logger_mode = "DEBUG" if args.debug else "INFO"
# Fixed: log file was named "fetch_data_log_...", a copy-paste from fetch_data.py.
log_file = "%s/train_log_%s.txt" % (args.output_dir, args.tag)
logger = setup_logger(logger_mode, log_file)

# --algorithm may be either a bare algorithm name or a path to a json config
# file with detailed training options.
if "json" in args.algorithm:
    if not os.path.exists(args.algorithm):
        # Path not found as given: try expanding it relative to the package.
        algorithm_config_file = expand_path(args.algorithm)
    else:
        # Fixed: previously assigned the undefined name `algo`, which raised a
        # NameError whenever the json path existed as given.
        algorithm_config_file = args.algorithm
    with open(algorithm_config_file, "r") as f_in:
        config = json.load(f_in)

else:
    # No json config: build the config straight from the CLI arguments.
    config = vars(args)
    config["name"] = args.algorithm.lower()

if not config["name"] in ["autoencoder", "pca", "statistical_tester"]:
    # Fixed: message now lists all supported algorithms (it omitted
    # 'statistical_tester') and is passed to the raised exception.
    message = "[train.py] Requested algorithm '%s' is not in the supported list of algorithms ['autoencoder', 'pca', 'statistical_tester']." % (config["name"])
    logger.exception(message)
    raise RuntimeError(message)

# Instantiate the requested algorithm with the assembled config.
if config["name"] == "pca":
    algorithm = PCA(**config)
elif config["name"] == "autoencoder":
    algorithm = AutoEncoder(**config)
elif config["name"] == "statistical_tester":
    algorithm = StatisticalTester(**config)

# The input file and histogram list may come from the CLI or from the json
# config; the CLI takes precedence. Missing from both is a fatal error.
if args.input_file is None and "input_file" not in config.keys():
    message = "[train.py] An input file for training the ML algorithm was not supplied through CLI nor found in the json config file for the algorithm."
    logger.exception(message)
    raise RuntimeError(message)

if args.histograms is None and "histograms" not in config.keys():
    message = "[train.py] A list of histograms to train on was not supplied through CLI nor found in the json config file for the algorithm."
    logger.exception(message)
    raise RuntimeError(message)

if args.input_file is not None:
    training_file = args.input_file
else:
    training_file = config["input_file"]

if args.histograms is not None:
    # CLI histograms are a csv list; each is normalized by default.
    histograms = {x : { "normalize" : True} for x in args.histograms.split(",")}
else:
    histograms = config["histograms"]

# Load data
algorithm.load_data(
    file = training_file,
    histograms = histograms
)

# Train (only ML algorithms train; StatisticalTester has no training step)
if isinstance(algorithm, MLAlgorithm):
    algorithm.train()

# Predict
algorithm.predict()

# Save model and new df with score zipped in
algorithm.save()

0 comments on commit 26f808b

Please sign in to comment.