Merge pull request #29 from AutoDQM/additional_training_options

A couple of additional options for train.py
AutoDQM · Jul 19, 2022 · 046bc9a · 046bc9a
2 parents f9495f0 + ff2b588
commit 046bc9a
Show file tree

Hide file tree

Showing 2 changed files with 204 additions and 160 deletions.
diff --git a/autodqm_ml/algorithms/anomaly_detection_algorithm.py b/autodqm_ml/algorithms/anomaly_detection_algorithm.py
@@ -79,7 +79,8 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
 
         # Load dataframe
         df = awkward.from_parquet(self.input_file)
-
+
+
         # Set helpful metadata
         for histogram, histogram_info in self.histograms.items():
             self.histograms[histogram]["name"] = histogram.replace("/", "").replace(" ","")
@@ -93,7 +94,25 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
                 self.histograms[histogram]["n_bins"] *= x 
 
         if not "train_label" in df.fields: # don't overwrite if a train/test split was already determined
-            if train_frac > 0:
+            if self.train_highest_only: # if desired, prioritize high-stat runs in the train set
+                histogram = next(iter(self.histograms)) # the first requested histogram defines the per-run statistics
+
+                # Sum up all entries of that histogram in each run, rank the runs by the total, then partition by training fraction
+                logger.debug("[AnomalyDetectionAlgorithm : load_data] Assigning training/test set labels based on run stats, using histogram '%s'. For random train/test splitting, set train_highest_only to False (default) " % (histogram))
+                n_entries_per_run = awkward.sum(df[histogram], axis = -1)
+                if self.histograms[histogram]["n_dim"] == 2:
+                    n_entries_per_run = awkward.sum(n_entries_per_run, axis = -1)
+                # argsort returns the sorting order, not each run's rank; applying it twice gives the rank (0 = fewest entries)
+                stat_rank = awkward.argsort(awkward.argsort(n_entries_per_run))
+                if train_frac > 0:
+                    partition = int((1 - train_frac)*len(df)) # number of lowest-stat runs assigned to the test set
+                else:
+                    logger.debug("[AnomalyDetectionAlgorithm : load_data] No Training fraction given, using 50/50 train/test split.")
+                    partition = int(0.5*len(df)) # previously len(sorted_stats), a name that is never defined
+
+                df["train_label"] = awkward.where(stat_rank < partition, 1, 0) # 0 = train (highest stats), 1 = test
+
+            elif train_frac > 0:
                 df["train_label"] = numpy.random.choice(2, size = len(df), p = [train_frac, 1 - train_frac]) # 0 = train, 1 = test, -1 = don't use in training or testing
                 df["train_label"] = awkward.where(
                         df.label == kANOMALOUS,
@@ -117,10 +136,10 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
                 if awkward.all((n_entries <= 1.000001) & (n_entries >= 0.999999)): # was already normalized in a previous train.py run which would have removed low stat bins as well, so continue
                     continue
                 else:
-                    cut = cut & (n_entries >= 10000) # FIXME: hard-coded to 10k for now
+                    cut = cut & (n_entries >= self.low_stat_threshold) # minimum number of entries per histogram; read from self.low_stat_threshold instead of the former hard-coded 10k
             n_runs_pre = len(df)
             n_runs_post = awkward.sum(cut)
-            logger.debug("[anomaly_detection_algorithm : load_data] Removing %d/%d runs in which one or more of the requested histograms had less than 10000 entries." % (n_runs_pre - n_runs_post, n_runs_pre))
+            logger.debug("[anomaly_detection_algorithm : load_data] Removing %d/%d runs in which one or more of the requested histograms had less than %d entries." % (n_runs_pre - n_runs_post, n_runs_pre, self.low_stat_threshold))
             df = df[cut]
 
         for histogram, histogram_info in self.histograms.items():
@@ -133,7 +152,6 @@ def load_data(self, file = None, histograms = {}, train_frac = 0.5, remove_low_s
 
                     logger.debug("[anomaly_detection_algorithm : load_data] Scaling all entries in histogram '%s' by the sum of total entries." % histogram)
                     df[histogram] = df[histogram] * (1. / sum) 
-
         self.n_train = awkward.sum(df.train_label == 0)
         self.n_test = awkward.sum(df.train_label == 1)
         self.df = df

diff --git a/scripts/train.py b/scripts/train.py
@@ -1,155 +1,181 @@
-import os
-import json
-import argparse
-
-from autodqm_ml.utils import setup_logger
-from autodqm_ml.algorithms.statistical_tester import StatisticalTester
-from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
-from autodqm_ml.algorithms.pca import PCA
-from autodqm_ml.algorithms.autoencoder import AutoEncoder
-from autodqm_ml.utils import expand_path
-
-parser = argparse.ArgumentParser()
-
-# Required arguments
-parser.add_argument(
-    "--algorithm",
-    help = "name of algorithm ('PCA' or 'Autoencoder' or 'StatisticalTester') to train with default options OR path to json filed specifying particular options for training a given algorithm.",
-    type = str,
-    required = True
-)
-
-# Optional arguments
-parser.add_argument(
-    "--output_dir",
-    help = "output directory to place files in",
-    type = str,
-    required = False,
-    default = None
-)
-parser.add_argument(
-    "--tag",
-    help = "tag to identify output files",
-    type = str,
-    required = False,
-    default = None
-)
-parser.add_argument(
-    "--input_file",
-    help = "input file (i.e. output from fetch_data.py) to use for training the ML algorithm",
-    type = str,
-    required = False,
-    default = None
-)
-parser.add_argument(
-    "--histograms",
-    help = "csv list of histograms on which to train the ML algorithm. If multiple are supplied, for PCAs one PCA will be trained for each histogram, while for autoencoders, a single AutoEncoder taking each of the histograms as inputs will be trained.",
-    type = str,
-    required = False,
-    default = None
-)
-parser.add_argument(
-    "--reference",
-    help = "reference run number to use for comparisons with StatisticalTester",
-    type = int,
-    required = False,
-    default = None
-)
-parser.add_argument(
-    "--n_components",
-    help = "dimension of latent space (number of principle components for PCA)",
-    type = int,
-    required = False,
-    default = None
-)
-parser.add_argument(
-    "--autoencoder_mode",
-    help = "specify whether you want to train an autoencoder for each histogram ('individual') or a single autoencoder on all histograms ('simultaneous')",
-    type = str,
-    required = False,
-    default = None
-)
-parser.add_argument(
-    "--debug",
-    help = "run logger in DEBUG mode (INFO is default)",
-    required = False,
-    action = "store_true"
-)
-
-args = parser.parse_args()
-os.system("mkdir -p %s/" % args.output_dir)
-
-logger_mode = "DEBUG" if args.debug else "INFO"
-log_file = "%s/fetch_data_log_%s.txt" % (args.output_dir, args.tag)
-logger = setup_logger(logger_mode, log_file)
-
-if "json" in args.algorithm:
-    if not os.path.exists(args.algorithm):
-        algorithm_config_file = expand_path(args.algorithm)
-    else:
-        algorithm_config_file = args.algorithm
-
-    with open(algorithm_config_file, "r") as f_in:
-        config = json.load(f_in)
-
-    # Add command line arguments to config
-    for k,v in vars(args).items():
-        if v is not None:
-            config[k] = v # note: if you specify an argument both through command line argument and json, we give precedence to the version from command line arguments 
-
-else:
-    config = vars(args)
-    config["name"] = args.algorithm.lower() 
-
-if not config["name"] in ["autoencoder", "pca", "statistical_tester"]:
-    message = "[train.py] Requested algorithm '%s' is not in the supported list of algorithms ['autoencoder', 'pca']." % (config["name"])
-    logger.exception(message)
-    raise RuntimeError()
-
-if config["name"] == "pca":
-    algorithm = PCA(**config)
-elif config["name"] == "autoencoder":
-    algorithm = AutoEncoder(**config)
-elif config["name"] == "statistical_tester":
-    algorithm = StatisticalTester(**config)
-
-if args.input_file is None and "input_file" not in config.keys():
-    message = "[train.py] An input file for training the ML algorithm was not supplied through CLI nor found in the json config file for the algorithm."
-    logger.exception(message)
-    raise RuntimeError()
-
-if args.histograms is None and "histograms" not in config.keys():
-    message = "[train.py] A list of histograms to train on was not supplied through CLI nor found in the json config file for the algorithm."
-    logger.exception(message)
-    raise RuntimeError()
-
-if args.input_file is not None: # 
-    training_file = args.input_file
-else:
-    training_file = config["input_file"]
-
-if args.histograms is not None:
-    histograms = {x : { "normalize" : True} for x in args.histograms.split(",")}
-elif isinstance(config["histograms"], str):
-    histograms = {x : { "normalize" : True} for x in config["histograms"].split(",")}
-elif isinstance(config["histograms"], dict):
-    histograms = config["histograms"]
-else:
-    logger.exception("[train.py] The `histograms` argument should either be a csv list of histogram names (str) or a dictionary (if provided through a json config).")
-    raise RuntimeError()
-
-# Load data
-algorithm.load_data(
-    file = training_file,
-    histograms = histograms
-)
-
-# Train
-if isinstance(algorithm, MLAlgorithm):
-    algorithm.train()
-
-# Predict
-algorithm.predict()
-
-# Save model and new df with score zipped in
-algorithm.save() 
+import os
+import json
+import argparse
+
+from autodqm_ml.utils import setup_logger
+from autodqm_ml.algorithms.statistical_tester import StatisticalTester
+from autodqm_ml.algorithms.ml_algorithm import MLAlgorithm
+from autodqm_ml.algorithms.pca import PCA
+from autodqm_ml.algorithms.autoencoder import AutoEncoder
+from autodqm_ml.utils import expand_path
+
+# Defaults applied only after the json config is merged, so a json value is not overridden by an argparse default
+CLI_DEFAULTS = { "low_stat_threshold" : 10000, "train_highest_only" : False }
+# Supported algorithms, keyed by the `name` given on the command line or in the json config
+ALGORITHMS = { "pca" : PCA, "autoencoder" : AutoEncoder, "statistical_tester" : StatisticalTester }
+
+def str_to_bool(value):
+    """Parse a command line string such as 'True' or 'false' into a bool (`type = bool` would turn any non-empty string, even 'False', into True)."""
+    normalized = value.strip().lower()
+    if normalized in ("true", "t", "yes", "y", "1"):
+        return True
+    if normalized in ("false", "f", "no", "n", "0"):
+        return False
+    raise argparse.ArgumentTypeError("expected a boolean value such as 'True' or 'False', got '%s'" % value)
+
+parser = argparse.ArgumentParser()
+
+# Required arguments
+parser.add_argument(
+    "--algorithm",
+    help = "name of algorithm ('PCA' or 'Autoencoder' or 'statistical_tester') to train with default options OR path to json filed specifying particular options for training a given algorithm.",
+    type = str,
+    required = True
+)
+
+# Optional arguments
+parser.add_argument(
+    "--output_dir",
+    help = "output directory to place files in",
+    type = str,
+    required = False,
+    default = None
+)
+parser.add_argument(
+    "--tag",
+    help = "tag to identify output files",
+    type = str,
+    required = False,
+    default = None
+)
+parser.add_argument(
+    "--input_file",
+    help = "input file (i.e. output from fetch_data.py) to use for training the ML algorithm",
+    type = str,
+    required = False,
+    default = None
+)
+parser.add_argument(
+    "--low_stat_threshold",
+    help = "Minimum number of entries required per histogram for training. If a histogram has less than the set minimum, the histogram will not be included in training. Default: 10000.",
+    type = int,
+    required = False,
+    default = None # resolved from CLI_DEFAULTS once the json config has been merged
+)
+parser.add_argument(
+    "--train_highest_only",
+    help = "If True, only trains on the runs with the highest stats, or the highest number of entries. The test set becomes the remaining runs.",
+    type = str_to_bool,
+    nargs = "?", const = True, # a bare `--train_highest_only` means True; an explicit True/False value is still accepted
+    required = False,
+    default = None # resolved from CLI_DEFAULTS once the json config has been merged
+)
+parser.add_argument(
+    "--histograms",
+    help = "csv list of histograms on which to train the ML algorithm. If multiple are supplied, for PCAs one PCA will be trained for each histogram, while for autoencoders, a single AutoEncoder taking each of the histograms as inputs will be trained.",
+    type = str,
+    required = False,
+    default = None
+)
+# TODO: add a `--train_size` option (proportion of data used for model training rather than testing, default 0.5) once it can be combined safely with the options above.
+parser.add_argument(
+    "--reference",
+    help = "reference run number to use for comparisons with StatisticalTester",
+    type = int,
+    required = False,
+    default = None
+)
+parser.add_argument(
+    "--n_components",
+    help = "dimension of latent space (number of principle components for PCA)",
+    type = int,
+    required = False,
+    default = None
+)
+parser.add_argument(
+    "--autoencoder_mode",
+    help = "specify whether you want to train an autoencoder for each histogram ('individual') or a single autoencoder on all histograms ('simultaneous')",
+    type = str,
+    required = False,
+    default = None
+)
+parser.add_argument(
+    "--debug",
+    help = "run logger in DEBUG mode (INFO is default)",
+    required = False,
+    action = "store_true"
+)
+
+args = parser.parse_args()
+
+os.makedirs("%s/" % args.output_dir, exist_ok = True) # same directory as the former `mkdir -p`, without passing user input through a shell
+
+logger_mode = "DEBUG" if args.debug else "INFO"
+log_file = "%s/fetch_data_log_%s.txt" % (args.output_dir, args.tag)
+logger = setup_logger(logger_mode, log_file)
+
+if "json" in args.algorithm:
+    if not os.path.exists(args.algorithm):
+        algorithm_config_file = expand_path(args.algorithm)
+    else:
+        algorithm_config_file = args.algorithm
+
+    with open(algorithm_config_file, "r") as f_in:
+        config = json.load(f_in)
+
+    # Add command line arguments to config
+    for k,v in vars(args).items():
+        if v is not None:
+            config[k] = v # note: if you specify an argument both through command line argument and json, we give precedence to the version from command line arguments
+else:
+    config = vars(args)
+    config["name"] = args.algorithm.lower()
+
+# Fall back to the default for any option given neither on the command line nor in the json config
+for option, default in CLI_DEFAULTS.items():
+    if config.get(option) is None:
+        config[option] = default
+
+if not config["name"] in ALGORITHMS:
+    message = "[train.py] Requested algorithm '%s' is not in the supported list of algorithms %s." % (config["name"], sorted(ALGORITHMS))
+    logger.error(message)
+    raise RuntimeError(message)
+
+algorithm = ALGORITHMS[config["name"]](**config)
+
+if config.get("input_file") is None: # config holds the command line value when one was given, so a single check covers both sources
+    message = "[train.py] An input file for training the ML algorithm was not supplied through CLI nor found in the json config file for the algorithm."
+    logger.error(message)
+    raise RuntimeError(message)
+
+if config.get("histograms") is None:
+    message = "[train.py] A list of histograms to train on was not supplied through CLI nor found in the json config file for the algorithm."
+    logger.error(message)
+    raise RuntimeError(message)
+
+training_file = config["input_file"] # command line value if given, json value otherwise
+
+if isinstance(config["histograms"], str):
+    histograms = {x : { "normalize" : True} for x in config["histograms"].split(",")}
+elif isinstance(config["histograms"], dict):
+    histograms = config["histograms"]
+else:
+    message = "[train.py] The `histograms` argument should either be a csv list of histogram names (str) or a dictionary (if provided through a json config)."
+    logger.error(message)
+    raise RuntimeError(message)
+
+# Load data
+algorithm.load_data(
+    file = training_file,
+    histograms = histograms
+)
+
+# Train
+if isinstance(algorithm, MLAlgorithm):
+    algorithm.train()
+
+# Predict
+algorithm.predict()
+
+# Save model and new df with score zipped in
+algorithm.save()