From 3a3a9f27444ea77c17034fa8740c520bc7d870fb Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 14:05:08 -0500
Subject: [PATCH 01/51] Print out total shot counts in preprocess.py

---
 plasma/preprocessor/preprocess.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index e93e7309..2c6384e4 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -257,6 +257,11 @@ def guarantee_preprocessed(conf, verbose=False):
     shot_list_train, shot_list_validate, shot_list_test = apply_bleed_in(
         conf, shot_list_train, shot_list_validate, shot_list_test)
     if verbose:
+        g.print_unique('total: {} shots, {} disruptive'.format(
+            len(shot_list_validate)+len(shot_list_train)+len(shot_list_test),
+            shot_list_validate.num_disruptive()
+            + shot_list_train.num_disruptive()
+            + shot_list_test.num_disruptive()))
         g.print_unique('validate: {} shots, {} disruptive'.format(
             len(shot_list_validate), shot_list_validate.num_disruptive()))
         g.print_unique('training: {} shots, {} disruptive'.format(

From 09c4d6771c7883c195b3f1b26fb6155e88b4469e Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 14:25:12 -0500
Subject: [PATCH 02/51] Always print # train shots before # validate shots in
 diagnostics

Canonical order: train, validate, test
---
 plasma/models/runner.py           | 4 ++--
 plasma/models/shallow_runner.py   | 6 +++---
 plasma/models/torch_runner.py     | 5 ++---
 plasma/preprocessor/preprocess.py | 4 ++--
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/plasma/models/runner.py b/plasma/models/runner.py
index 5b3f1d5a..29c42c3c 100644
--- a/plasma/models/runner.py
+++ b/plasma/models/runner.py
@@ -29,10 +29,10 @@ def train(conf, shot_list_train, shot_list_validate, loader,
     validation_losses = []
     validation_roc = []
     training_losses = []
-    print('validate: {} shots, {} disruptive'.format(
-        len(shot_list_validate), shot_list_validate.num_disruptive()))
     print('training: {} shots, {} disruptive'.format(
         len(shot_list_train), shot_list_train.num_disruptive()))
+    print('validate: {} shots, {} disruptive'.format(
+        len(shot_list_validate), shot_list_validate.num_disruptive()))
 
     if backend == 'tf' or backend == 'tensorflow':
         first_time = "tensorflow" not in sys.modules
diff --git a/plasma/models/shallow_runner.py b/plasma/models/shallow_runner.py
index 5d459a43..cdfe2c32 100644
--- a/plasma/models/shallow_runner.py
+++ b/plasma/models/shallow_runner.py
@@ -324,12 +324,12 @@ def build_callbacks(conf):
 def train(conf, shot_list_train, shot_list_validate, loader,
           shot_list_test=None):
     np.random.seed(1)
-    print('validate: {} shots, {} disruptive'.format(
-        len(shot_list_validate),
-        shot_list_validate.num_disruptive()))
     print('training: {} shots, {} disruptive'.format(
         len(shot_list_train),
         shot_list_train.num_disruptive()))
+    print('validate: {} shots, {} disruptive'.format(
+        len(shot_list_validate),
+        shot_list_validate.num_disruptive()))
 
     num_samples = conf['model']['shallow_model']['num_samples']
     feature_extractor = FeatureExtractor(loader)
diff --git a/plasma/models/torch_runner.py b/plasma/models/torch_runner.py
index 5c85e134..a4fde559 100644
--- a/plasma/models/torch_runner.py
+++ b/plasma/models/torch_runner.py
@@ -414,11 +414,10 @@ def train(conf, shot_list_train, shot_list_validate, loader):
     data_gen = partial(
         loader.training_batch_generator_full_shot_partial_reset,
         shot_list=shot_list_train)()
-    print('validate: {} shots, {} disruptive'.format(
-        len(shot_list_validate), shot_list_validate.num_disruptive()))
     print('training: {} shots, {} disruptive'.format(
         len(shot_list_train), shot_list_train.num_disruptive()))
-
+    print('validate: {} shots, {} disruptive'.format(
+        len(shot_list_validate), shot_list_validate.num_disruptive()))
     loader.set_inference_mode(False)
 
     train_model = build_torch_model(conf)
diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index 2c6384e4..deaccee3 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -262,10 +262,10 @@ def guarantee_preprocessed(conf, verbose=False):
             shot_list_validate.num_disruptive()
             + shot_list_train.num_disruptive()
             + shot_list_test.num_disruptive()))
-        g.print_unique('validate: {} shots, {} disruptive'.format(
-            len(shot_list_validate), shot_list_validate.num_disruptive()))
         g.print_unique('training: {} shots, {} disruptive'.format(
             len(shot_list_train), shot_list_train.num_disruptive()))
+        g.print_unique('validate: {} shots, {} disruptive'.format(
+            len(shot_list_validate), shot_list_validate.num_disruptive()))
         g.print_unique('testing: {} shots, {} disruptive'.format(
             len(shot_list_test), shot_list_test.num_disruptive()))
         g.print_unique("...done")

From df7228ea17915ffb5d7c34f7449b421ab3768861 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 14:40:33 -0500
Subject: [PATCH 03/51] Encapsulate printing of shot set sizes in new fn, in
 new file diagnostics.py

---
 plasma/models/runner.py           |  6 ++----
 plasma/models/shallow_runner.py   |  2 ++
 plasma/models/torch_runner.py     |  6 ++----
 plasma/preprocessor/preprocess.py | 16 ++++------------
 plasma/utils/diagnostics.py       | 28 ++++++++++++++++++++++++++++
 5 files changed, 38 insertions(+), 20 deletions(-)
 create mode 100644 plasma/utils/diagnostics.py

diff --git a/plasma/models/runner.py b/plasma/models/runner.py
index 29c42c3c..c2008817 100644
--- a/plasma/models/runner.py
+++ b/plasma/models/runner.py
@@ -1,6 +1,7 @@
 from plasma.utils.state_reset import reset_states
 from plasma.utils.evaluation import get_loss_from_list
 from plasma.utils.performance import PerformanceAnalyzer
+from plasma.utils.diagnostics import print_shot_list_sizes
 from plasma.models.loader import Loader, ProcessGenerator
 from plasma.conf import conf
 import pathos.multiprocessing as mp
@@ -29,10 +30,7 @@ def train(conf, shot_list_train, shot_list_validate, loader,
     validation_losses = []
     validation_roc = []
     training_losses = []
-    print('training: {} shots, {} disruptive'.format(
-        len(shot_list_train), shot_list_train.num_disruptive()))
-    print('validate: {} shots, {} disruptive'.format(
-        len(shot_list_validate), shot_list_validate.num_disruptive()))
+    print_shot_list_sizes(shot_list_train, shot_list_validate)
 
     if backend == 'tf' or backend == 'tensorflow':
         first_time = "tensorflow" not in sys.modules
diff --git a/plasma/models/shallow_runner.py b/plasma/models/shallow_runner.py
index cdfe2c32..0ea4962b 100644
--- a/plasma/models/shallow_runner.py
+++ b/plasma/models/shallow_runner.py
@@ -10,6 +10,7 @@
 # from plasma.utils.state_reset import reset_states
 from plasma.utils.evaluation import get_loss_from_list
 from plasma.utils.performance import PerformanceAnalyzer
+from plasma.utils.diagnostics import print_shot_list_sizes
 # from plasma.models.loader import Loader, ProcessGenerator
 # from plasma.conf import conf
 from sklearn.neural_network import MLPClassifier
@@ -324,6 +325,7 @@ def build_callbacks(conf):
 def train(conf, shot_list_train, shot_list_validate, loader,
           shot_list_test=None):
     np.random.seed(1)
+    print_shot_list_sizes(shot_list_train, shot_list_validate)
     print('training: {} shots, {} disruptive'.format(
         len(shot_list_train),
         shot_list_train.num_disruptive()))
diff --git a/plasma/models/torch_runner.py b/plasma/models/torch_runner.py
index a4fde559..195b5275 100644
--- a/plasma/models/torch_runner.py
+++ b/plasma/models/torch_runner.py
@@ -5,6 +5,7 @@
 from torch.autograd import Variable
 import torch.nn as nn
 import torch
+from plasma.utils.diagnostics import print_shot_list_sizes
 from plasma.utils.downloading import makedirs_process_safe
 from plasma.utils.performance import PerformanceAnalyzer
 from plasma.utils.evaluation import get_loss_from_list
@@ -414,10 +415,7 @@ def train(conf, shot_list_train, shot_list_validate, loader):
     data_gen = partial(
         loader.training_batch_generator_full_shot_partial_reset,
         shot_list=shot_list_train)()
-    print('training: {} shots, {} disruptive'.format(
-        len(shot_list_train), shot_list_train.num_disruptive()))
-    print('validate: {} shots, {} disruptive'.format(
-        len(shot_list_validate), shot_list_validate.num_disruptive()))
+    print_shot_list_sizes(shot_list_train, shot_list_validate)
     loader.set_inference_mode(False)
 
     train_model = build_torch_model(conf)
diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index deaccee3..0e649c2d 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -1,6 +1,6 @@
 '''
 #########################################################
-This file containts classes to handle data processing
+This file contains classes to handle data processing
 
 Author: Julian Kates-Harbeck, jkatesharbeck@g.harvard.edu
 
@@ -19,6 +19,7 @@
 import pathos.multiprocessing as mp
 
 from plasma.utils.processing import append_to_filename
+from plasma.utils.diagnostics import print_shot_list_sizes
 from plasma.primitives.shots import ShotList
 from plasma.utils.downloading import mkdirdepth
 
@@ -257,16 +258,7 @@ def guarantee_preprocessed(conf, verbose=False):
     shot_list_train, shot_list_validate, shot_list_test = apply_bleed_in(
         conf, shot_list_train, shot_list_validate, shot_list_test)
     if verbose:
-        g.print_unique('total: {} shots, {} disruptive'.format(
-            len(shot_list_validate)+len(shot_list_train)+len(shot_list_test),
-            shot_list_validate.num_disruptive()
-            + shot_list_train.num_disruptive()
-            + shot_list_test.num_disruptive()))
-        g.print_unique('training: {} shots, {} disruptive'.format(
-            len(shot_list_train), shot_list_train.num_disruptive()))
-        g.print_unique('validate: {} shots, {} disruptive'.format(
-            len(shot_list_validate), shot_list_validate.num_disruptive()))
-        g.print_unique('testing: {} shots, {} disruptive'.format(
-            len(shot_list_test), shot_list_test.num_disruptive()))
+        print_shot_list_sizes(shot_list_train, shot_list_validate,
+                              shot_list_test)
         g.print_unique("...done")
     return shot_list_train, shot_list_validate, shot_list_test
diff --git a/plasma/utils/diagnostics.py b/plasma/utils/diagnostics.py
new file mode 100644
index 00000000..6f632887
--- /dev/null
+++ b/plasma/utils/diagnostics.py
@@ -0,0 +1,28 @@
+'''
+#########################################################
+This file contains fns for printing diagnostic messages
+#########################################################
+'''
+
+from __future__ import print_function
+import plasma.global_vars as g
+
+
+def print_shot_list_sizes(shot_list_train, shot_list_validate,
+                          shot_list_test=None):
+    nshots = len(shot_list_train) + len(shot_list_validate)
+    nshots_disrupt = (shot_list_train.num_disruptive()
+                      + shot_list_validate.num_disruptive())
+    if shot_list_test is not None:
+        nshots += len(shot_list_test)
+        nshots_disrupt += shot_list_test.num_disruptive()
+    g.print_unique('total: {} shots, {} disruptive'.format(nshots,
+                                                           nshots_disrupt)
+    g.print_unique('training: {} shots, {} disruptive'.format(
+        len(shot_list_train), shot_list_train.num_disruptive()))
+    g.print_unique('validate: {} shots, {} disruptive'.format(
+        len(shot_list_validate), shot_list_validate.num_disruptive()))
+    if shot_list_test is not None:
+        g.print_unique('testing: {} shots, {} disruptive'.format(
+            len(shot_list_test), shot_list_test.num_disruptive()))
+    return

From cbc31b42aac55e1d2fce646925176a309e1ccb5f Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 14:42:02 -0500
Subject: [PATCH 04/51] Add missing bracket

---
 plasma/utils/diagnostics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plasma/utils/diagnostics.py b/plasma/utils/diagnostics.py
index 6f632887..d38d6e78 100644
--- a/plasma/utils/diagnostics.py
+++ b/plasma/utils/diagnostics.py
@@ -17,7 +17,7 @@ def print_shot_list_sizes(shot_list_train, shot_list_validate,
         nshots += len(shot_list_test)
         nshots_disrupt += shot_list_test.num_disruptive()
     g.print_unique('total: {} shots, {} disruptive'.format(nshots,
-                                                           nshots_disrupt)
+                                                           nshots_disrupt))
     g.print_unique('training: {} shots, {} disruptive'.format(
         len(shot_list_train), shot_list_train.num_disruptive()))
     g.print_unique('validate: {} shots, {} disruptive'.format(

From 366de2d8084e3bd5386c417790e5f4c2a143f98c Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 14:11:17 -0600
Subject: [PATCH 05/51] Comment out unused fn

---
 plasma/utils/processing.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/plasma/utils/processing.py b/plasma/utils/processing.py
index d6995bcb..12938c1b 100644
--- a/plasma/utils/processing.py
+++ b/plasma/utils/processing.py
@@ -90,15 +90,15 @@ def train_test_split_robust(x, frac, do_shuffle=False):
     return train, test
 
 
-def train_test_split_all(x, frac, do_shuffle=True):
-    groups = []
-    length = len(x[0])
-    mask = np.array(range(length)) < frac*length
-    if do_shuffle:
-        np.random.shuffle(mask)
-    for item in x:
-        groups.append((item[mask], item[~mask]))
-    return groups
+# def train_test_split_all(x, frac, do_shuffle=True):
+#     groups = []
+#     length = len(x[0])
+#     mask = np.array(range(length)) < frac*length
+#     if do_shuffle:
+#         np.random.shuffle(mask)
+#     for item in x:
+#         groups.append((item[mask], item[~mask]))
+#     return groups
 
 
 def concatenate_sublists(superlist):

From e527c38d215610ce714135d8bb0d050ce8c13fbb Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 15:21:11 -0500
Subject: [PATCH 06/51] Add details about how many omitted shots were
 disruptive

to preprocess.py
---
 plasma/preprocessor/preprocess.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index 0e649c2d..871a27fc 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -101,12 +101,16 @@ def preprocess_from_files(self, shot_files, use_shots):
 
         pool.close()
         pool.join()
-        print('Finished Preprocessing {} files in {} seconds'.format(
+        print('Finished preprocessing {} files in {} seconds'.format(
             len(shot_list_picked), time.time() - start_time))
+        print('Using {}/{} disruptive shots'.format(
+            used_shots.num_disruptive(), len(used_shots)))
         print('Omitted {} shots of {} total.'.format(
             len(shot_list_picked) - len(used_shots), len(shot_list_picked)))
-        print('{}/{} disruptive shots'.format(used_shots.num_disruptive(),
-                                              len(used_shots)))
+        print('Omitted {} disruptive shots of {} total disruptive.'.format(
+            shot_list_picked.num_disruptive() - used_shots.num_disruptive,
+            shot_list_picked.num_disruptive()))
+
         if len(used_shots) == 0:
             print("WARNING: All shots were omitted, please ensure raw data "
                   " is complete and available at {}.".format(

From 286dc8793feda0bd58ddd9c0cf7d2fc37ae7ed08 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 15:32:28 -0500
Subject: [PATCH 07/51] Typo

---
 plasma/preprocessor/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index 871a27fc..eaf99604 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -108,7 +108,7 @@ def preprocess_from_files(self, shot_files, use_shots):
         print('Omitted {} shots of {} total.'.format(
             len(shot_list_picked) - len(used_shots), len(shot_list_picked)))
         print('Omitted {} disruptive shots of {} total disruptive.'.format(
-            shot_list_picked.num_disruptive() - used_shots.num_disruptive,
+            shot_list_picked.num_disruptive() - used_shots.num_disruptive(),
             shot_list_picked.num_disruptive()))
 
         if len(used_shots) == 0:

From 300966252320fd792fc7616bc5962bdf69270139 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 15:52:43 -0600
Subject: [PATCH 08/51] Remove stray diagnostic print

---
 plasma/primitives/shots.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index cafb76ed..a9e95df9 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -121,7 +121,6 @@ def split_train_test(self, conf):
 
         shot_numbers_train = [shot.number for shot in shot_list_train]
         shot_numbers_test = [shot.number for shot in shot_list_test]
-        print(len(shot_numbers_train), len(shot_numbers_test))
         # make sure we only use pre-filtered valid shots
         shots_train = self.filter_by_number(shot_numbers_train)
         shots_test = self.filter_by_number(shot_numbers_test)

From 0e4715b78e5e73bb0de220c1e7818a0648d1ed47 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 21 Nov 2019 17:10:36 -0500
Subject: [PATCH 09/51] Reformat diagnostics

---
 plasma/preprocessor/preprocess.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index eaf99604..18cf3114 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -101,15 +101,17 @@ def preprocess_from_files(self, shot_files, use_shots):
 
         pool.close()
         pool.join()
-        print('Finished preprocessing {} files in {} seconds'.format(
+        print('\nFinished preprocessing {} files in {} seconds'.format(
             len(shot_list_picked), time.time() - start_time))
-        print('Using {}/{} disruptive shots'.format(
-            used_shots.num_disruptive(), len(used_shots)))
-        print('Omitted {} shots of {} total.'.format(
+        print('Using {} shots ({} disruptive shots)'.format(
+            len(used_shots), used_shots.num_disruptive()))
+        print('Omitted {} shots of {} total shots'.format(
             len(shot_list_picked) - len(used_shots), len(shot_list_picked)))
-        print('Omitted {} disruptive shots of {} total disruptive.'.format(
-            shot_list_picked.num_disruptive() - used_shots.num_disruptive(),
-            shot_list_picked.num_disruptive()))
+        print(
+            'Omitted {} disruptive shots of {} total disruptive shots'.format(
+                shot_list_picked.num_disruptive()
+                - used_shots.num_disruptive(),
+                shot_list_picked.num_disruptive()))
 
         if len(used_shots) == 0:
             print("WARNING: All shots were omitted, please ensure raw data "

From a99c01acf94d71a75d4249ba20a8d2cdbee94688 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Fri, 22 Nov 2019 10:25:44 -0600
Subject: [PATCH 10/51] Reduce number of lines

---
 plasma/primitives/data.py  | 16 ++++-------
 plasma/primitives/shots.py | 57 +++++++++++++-------------------------
 2 files changed, 25 insertions(+), 48 deletions(-)

diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py
index 63d30abc..5d84fb0b 100644
--- a/plasma/primitives/data.py
+++ b/plasma/primitives/data.py
@@ -5,7 +5,6 @@
 import re
 
 from scipy.interpolate import UnivariateSpline
-
 from plasma.utils.processing import get_individual_shot_file
 from plasma.utils.downloading import get_missing_value_array
 from plasma.utils.hashing import myhash
@@ -123,17 +122,14 @@ def load_data(self, prepath, shot, dtype='float32'):
             if self.is_ip:
                 print('shot {} has no current'.format(shot.number))
             else:
-                print(
-                    'Signal {}, shot {} contains no data'.format(
-                        self.description, shot.number))
+                print('Signal {}, shot {} contains no data'.format(
+                    self.description, shot.number))
             return None, None, False
 
         # make sure data doesn't contain nan
         if np.any(np.isnan(t)) or np.any(np.isnan(sig)):
-            print(
-                'Signal {}, shot {} contains NAN'.format(
-                    self.description,
-                    shot.number))
+            print('Signal {}, shot {} contains NAN'.format(
+                self.description, shot.number))
             return None, None, False
 
         return t, sig, True
@@ -278,8 +274,8 @@ def load_data(self, prepath, shot, dtype='float32'):
         return t, sig_interp, True
 
     def fetch_data(self, machine, shot_num, c):
-        time, data, mapping, success = self.fetch_data_basic(machine, shot_num,
-                                                             c)
+        time, data, mapping, success = self.fetch_data_basic(
+            machine, shot_num, c)
         path = self.get_path(machine)
         mapping_path = self.get_mapping_path(machine)
 
diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index a9e95df9..3079b8b2 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -13,7 +13,6 @@
 import os.path
 import sys
 import random as rnd
-
 import numpy as np
 
 from plasma.utils.processing import train_test_split, cut_and_resample_signal
@@ -36,11 +35,9 @@ def __repr__(self):
         return self.__str__()
 
     def get_single_shot_numbers_and_disruption_times(self, full_path):
-        data = np.loadtxt(
-            full_path, ndmin=1, dtype={
-                'names': (
-                    'num', 'disrupt_times'), 'formats': (
-                    'i4', 'f4')})
+        data = np.loadtxt(full_path, ndmin=1,
+                          dtype={'names': ('num', 'disrupt_times'),
+                                 'formats': ('i4', 'f4')})
         shots = np.array(list(zip(*data))[0])
         disrupt_times = np.array(list(zip(*data))[1])
         return shots, disrupt_times
@@ -77,21 +74,18 @@ def __init__(self, shots=None):
             assert(all([isinstance(shot, Shot) for shot in shots]))
             self.shots = [shot for shot in shots]
 
-    def load_from_shot_list_files_object(
-            self, shot_list_files_object, signals):
+    def load_from_shot_list_files_object(self, shot_list_files_object,
+                                         signals):
         machine = shot_list_files_object.machine
         shot_numbers, disruption_times = (
             shot_list_files_object.get_shot_numbers_and_disruption_times())
         for number, t in list(zip(shot_numbers, disruption_times)):
-            self.append(
-                Shot(number=number, t_disrupt=t, machine=machine,
-                     signals=[s for s in signals if
-                              s.is_defined_on_machine(machine)]
-                     )
-                )
-
-    def load_from_shot_list_files_objects(
-            self, shot_list_files_objects, signals):
+            self.append(Shot(number=number, t_disrupt=t, machine=machine,
+                             signals=[s for s in signals if
+                                      s.is_defined_on_machine(machine)]))
+
+    def load_from_shot_list_files_objects(self, shot_list_files_objects,
+                                          signals):
         for obj in shot_list_files_objects:
             self.load_from_shot_list_files_object(obj, signals)
 
@@ -276,16 +270,9 @@ class Shot(object):
     property.
     '''
 
-    def __init__(
-            self,
-            number=None,
-            machine=None,
-            signals=None,
-            signals_dict=None,
-            ttd=None,
-            valid=None,
-            is_disruptive=None,
-            t_disrupt=None):
+    def __init__(self, number=None, machine=None, signals=None,
+                 signals_dict=None, ttd=None, valid=None, is_disruptive=None,
+                 t_disrupt=None):
         '''
         Shot objects contain following attributes:
 
@@ -415,8 +402,7 @@ def get_signals_and_times_from_file(self, conf):
                 if self.is_disruptive and self.t_disrupt > np.max(t):
                     t_max_total = (
                         np.max(t) + signal.get_data_avail_tolerance(
-                            self.machine)
-                    )
+                            self.machine))
                     if (self.t_disrupt > t_max_total):
                         print('Shot {}: disruption event '.format(self.number),
                               'is not contained in valid time region of ',
@@ -425,8 +411,8 @@ def get_signals_and_times_from_file(self, conf):
                                   self.t_disrupt - np.max(t)))
                         valid = False
                     else:
-                        t_max = np.max(
-                            t) + signal.get_data_avail_tolerance(self.machine)
+                        t_max = np.max(t) + signal.get_data_avail_tolerance(
+                            self.machine)
                 else:
                     t_max = min(t_max, np.max(t))
 
@@ -449,13 +435,8 @@ def get_signals_and_times_from_file(self, conf):
 
         return time_arrays, signal_arrays, t_min, t_max, valid
 
-    def cut_and_resample_signals(
-            self,
-            time_arrays,
-            signal_arrays,
-            t_min,
-            t_max,
-            conf):
+    def cut_and_resample_signals(self, time_arrays, signal_arrays, t_min,
+                                 t_max, conf):
         dt = conf['data']['dt']
         signals_dict = dict()
 

From 31fafa34e0e4aeafc5ad04e40c7d5e61ecc65892 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Fri, 22 Nov 2019 11:35:43 -0600
Subject: [PATCH 11/51] Drop Rick Zamora's ALCF notes into docs/

---
 docs/ALCF.md       | 366 +++++++++++++++++++++++++++++++++++++++++++++
 examples/conf.yaml |   6 +-
 2 files changed, 369 insertions(+), 3 deletions(-)
 create mode 100644 docs/ALCF.md

diff --git a/docs/ALCF.md b/docs/ALCF.md
new file mode 100644
index 00000000..63877423
--- /dev/null
+++ b/docs/ALCF.md
@@ -0,0 +1,366 @@
+# ALCF Theta `plasma-python` FRNN Notes
+
+**Author: Rick Zamora (rzamora@anl.gov)**
+
+This document is intended to act as a tutorial for running the [plasma-python](https://github.com/PPPLDeepLearning/plasma-python) implementation of the Fusion recurrent neural network (FRNN) on the ALCF Theta supercomputer (Cray XC40; Intel KNL processors).  The steps followed in these notes are based on the Princeton [Tiger-GPU tutorial](https://github.com/PPPLDeepLearning/plasma-python/blob/master/docs/PrincetonUTutorial.md#location-of-the-data-on-tigress), hosted within the main GitHub repository for the project.
+
+## Environment Setup
+
+
+Choose a *root* directory for FRNN-related installations on Theta:
+
+```
+export FRNN_ROOT=<desired-root-directory>
+cd $FRNN_ROOT
+```
+
+*Personal Note: Using FRNN_ROOT=/home/zamora/ESP*
+
+Create a simple directory structure allowing experimental *builds* of the `plasma-python` python code/library:
+
+```
+mkdir build
+mkdir build/miniconda-3.6-4.5.4
+cd build/miniconda-3.6-4.5.4
+```
+
+### Custom Miniconda Environment Setup
+
+Copy miniconda installation script to working directory (and install):
+
+```
+cp /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/install_miniconda-3.6-4.5.4.sh .
+./install_miniconda-3.6-4.5.4.sh
+```
+
+The `install_miniconda-3.6-4.5.4.sh` script will install `miniconda-4.5.4` (using `Python-3.6`), as well as `Tensorflow-1.12.0` and `Keras 2.2.4`.
+
+
+Update your environment variables to use miniconda:
+
+```
+export PATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/bin:$PATH
+export PYTHONPATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/lib/python3.6/site-packages/:$PYTHONPATH
+```
+
+Note that the previous lines (as well as the definition of `FRNN_ROOT`) can be appended to your `$HOME/.bashrc` file if you want to use this environment on Theta by default.
+
+
+## Installing `plasma-python`
+
+Here, we assume the installation is within the custom miniconda environment installed in the previous steps. We also assume the following commands have already been executed:
+
+```
+export FRNN_ROOT=<desired-root-directory>
+export PATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/bin:$PATH
+export PYTHONPATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/lib/python3.6/site-packages/:$PYTHONPATH
+```
+
+*Personal Note: Using `export FRNN_ROOT=/lus/theta-fs0/projects/fusiondl_aesp/zamora/FRNN_project`*
+
+If the environment is set up correctly, installation of `plasma-python` is straightforward:
+
+```
+cd ${FRNN_ROOT}/build/miniconda-3.6-4.5.4
+git clone https://github.com/PPPLDeepLearning/plasma-python.git
+cd plasma-python
+python setup.py build
+python setup.py install
+```
+
+## Data Access
+
+Sample data and metadata is available in `/lus/theta-fs0/projects/FRNN/tigress/alexeys/signal_data` and `/lus/theta-fs0/projects/FRNN/tigress/alexeys/shot_lists`, respectively.  It is recommended that users create their own symbolic links to these directories. I recommend that you do this within a directory called `/lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/`. For example:
+
+```
+ln -s /lus/theta-fs0/projects/fusiondl_aesp/FRNN/tigress/alexeys/shot_lists  /lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/shot_lists
+ln -s /lus/theta-fs0/projects/fusiondl_aesp/FRNN/tigress/alexeys/signal_data  /lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/signal_data
+```
+
+For the examples included in `plasma-python`, there is a configuration file that specifies the root directory of the raw data. Change the `fs_path: '/tigress'` line in `examples/conf.yaml` to reflect the following:
+
+```
+fs_path: '/lus/theta-fs0/projects/fusiondl_aesp'
+```
+
+It's also a good idea to change `num_gpus: 4` to `num_gpus: 1`. I am also using the `jet_data_0D` dataset:
+
+```
+paths:
+    data: jet_data_0D
+```
+
+
+### Data Preprocessing
+
+#### The SLOW Way (On Theta)
+
+Theta is KNL-based, and is **not** the best resource for processing many text files in Python. However, the preprocessing step *can* be run on Theta using the following steps (although the job may need to be repeated many times to get through the whole dataset, given the 60-minute limit of the debug queues):
+
+```
+cd ${FRNN_ROOT}/build/miniconda-3.6-4.5.4/plasma-python/examples
+cp /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/submit_guarantee_preprocessed.sh .
+```
+
+Modify the paths defined in `submit_guarantee_preprocessed.sh` to match your environment.
+
+Note that the preprocessing module will use Pathos multiprocessing (not MPI/mpi4py).  Therefore, the script will see every compute core (all 256 per node) as an available resource.  Since the LUSTRE file system is unlikely to perform well with 256 processes (on the same node) opening/closing/creating files at once, it might improve performance if you make a slight change to line 85 of the `~/plasma-python/plasma/preprocessor/preprocess.py` file (e.g., by opening it with `vi`):
+
+```
+line 85: use_cores = min( <desired-maximum-process-count>, max(1,mp.cpu_count()-2) )
+```
+
+After optionally re-building and installing `plasma-python` with this change, submit the preprocessing job:
+
+```
+qsub submit_guarantee_preprocessed.sh
+```
+
+#### The FAST Way (On Cooley)
+
+You will find it much less painful to preprocess the data on Cooley, because the Haswell processors are much better suited for this... Log onto the ALCF Cooley Machine:
+
+```
+ssh <alcf-username>@cooley.alcf.anl.gov
+```
+
+Copy my `cooley_preprocess` example directory to whatever directory you choose to work in:
+
+```
+cp -r /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/cooley_preprocess .
+cd cooley_preprocess
+```
+
+This directory has a Singularity image with everything you need to run your code on Cooley. Assuming you have created symbolic links to the `shot_lists` and `signal_data` directories in `/lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/`, you can just submit the included `COBALT` script (to specify the data you want to process, just modify the included `conf.yaml` file):
+
+```
+qsub submit.sh
+```
+
+For me, this finishes in less than 10 minutes, and creates 5523 `.npz` files in the `/lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/processed_shots/` directory.  The output file of the COBALT submission ends with the following message:
+
+```
+5522/5523Finished Preprocessing 5523 files in 406.94421911239624 seconds
+Omitted 5523 shots of 5523 total.
+0/0 disruptive shots
+WARNING: All shots were omitted, please ensure raw data is complete and available at /lus/theta-fs0/projects/fusiondl_aesp/zamora/signal_data/.
+4327 1196
+```
+
+
+# Notes on Revisiting Pre-Processes
+
+## Preprocessing Information
+
+To understand what might be going wrong with the preprocessing step, let's investigate what the code is actually doing.
+
+**Step 1** Call `guarantee_preprocessed( conf )`, which is defined in `plasma/preprocessor/preprocess.py`. This function first initializes a `Preprocessor()` object (whose class definition is in the same file), and then checks if the preprocessing was already done (by looking for a file). The preprocessor object is called `pp`.
+
+**Step 2** Assuming preprocessing is needed, we call `pp.clean_shot_lists()`, which loops through each file in the `shot_lists` directory and calls `self.clean_shot_list()` (not plural) for each text-file item. I do not believe this function is doing anything when I run it, because all the shot list files have been "cleaned." The cleaning of a shot-list file just means the data is corrected to have two columns, and the file is renamed (to have "clear" in the name).
+
+**Step 3** We call `pp.preprocess_all()`, which parses some of the config file, and ultimately calls `self.preprocess_from_files(shot_files_all,use_shots)` (where I believe `shot_files_all` is the output directory, and `use_shots` is the number of shots to use).
+
+**Step 4** The `preprocess_from_files()` function is used to do the actual preprocessing. It does this by creating a multiprocessing pool, and mapping the processes to the `self.preprocess_single_file` function (note that the code for `ShotList` class is in `plasma/primitives/shots.py`, and the preprocessing code is still in `plasma/preprocessor/preprocess.py`).
+
+**Important:** It looks like the code uses the path definitions in `data/shot_lists/signals.py` to define the location/path of signal data. I believe that some of the signal data is missing, which is causing every "shot" to be labeled as incomplete (and consequently thrown out).
+
+### Possible Issues
+
+From the preprocessing output, it is clear that the *Signal Radiated Power Core* data was not downloaded correctly. According to the `data/shot_lists/signals.py` file, the data *should* be in `/lus/theta-fs0/projects/fusiondl_aesp/<alcf-user-name>/signal_data/jet/ppf/bolo/kb5h/channel14`. However, the only subdirectory of `~/jet/ppf/` is `~/jet/ppf/efit`
+
+Another possible issue is that the `data/shot_lists/signals.py` file specifies the **name** of the directory containing the *Radiated Power* data incorrectly (*I THINK*). Instead of the following line:
+
+`pradtot = Signal("Radiated Power",['jpf/db/b5r-ptot>out'],[jet])`
+
+We might need this:
+
+`pradtot = Signal("Radiated Power",['jpf/db/b5r-ptot\>out'],[jet])`
+
+The issue has to do with the `>` character in the directory name (without the proper `\` escape character, python may be looking in the wrong path). **NOTE: I need to confirm that there is actually an issue with the way the code is actually using the string.**
+
+
+## Singularity/Docker Notes
+
+Recall that the data preprocessing step was PAINFULLY slow on Theta, and so I decided to use Cooley. To simplify the process of using Cooley, I created a Docker image with the necessary environment. **Personal Note:** I performed this work on my local machine (Mac) in `/Users/rzamora/container-recipes`.
+
+
+In order to use a Docker image within a Singularity container (required on ALCF machines), it is useful to build the image on your local machine and push it to "Docker Hub":
+
+
+**Step 1:** Install Docker if you don't have it. [Docker-Mac](https://www.docker.com/docker-mac) works well for Mac.
+
+**Step 2:** Build a Docker image using the recipe discussed below.
+
+```
+export IMAGENAME="test_image"
+export RECIPENAME="Docker.centos7-cuda-tf1.12.0"
+docker build -t $IMAGENAME -f $RECIPENAME .
+```
+
+You can check that the image is functional by starting an interactive shell session, and checking that the necessary python modules are available. For example (using `-it` for an interactive session):
+
+```
+docker run --rm -it -v $PWD:/tmp -w /tmp $IMAGENAME:latest bash
+# python -c "import keras; import plasma; print(plasma.__file__)"
+```
+
+Note that the `plasma-python` source code will be located in `/root/plasma-python/` for the recipe described below.
+
+**Step 3:** Push the image to [Docker Hub](https://hub.docker.com/).
+
+Using your docker-hub username:
+
+```
+docker login --username=<username>
+```
+
+Then, "tag" the image using the `IMAGE ID` value displayed with `docker image ls`:
+
+```
+docker tag <IMAGE-ID> <username>/<image-name>:<label>
+```
+
+Here, `<label>` is something like "latest".  To finally push the image to [Docker Hub](https://hub.docker.com/):
+
+```
+docker push <username>/<image-name>
+```
+
+### Docker Recipe
+
+The actual content of the docker recipe is mostly borrowed from an example on [GitHub](https://github.com/scieule/golden-heart/blob/master/Dockerfile):
+
+```
+FROM nvidia/cuda:9.1-cudnn7-devel-centos7
+
+# Setup environment:
+SHELL ["/bin/bash", "-c"]
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+ENV LC_ALL en_US.UTF-8
+ENV CUDA_DEVICE_ORDER PCI_BUS_ID
+ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/local/cuda/extras/CUPTI/lib64
+
+RUN yum update -y
+
+RUN yum groupinstall -y "Development tools"
+
+RUN yum install -y  wget \
+                    unzip \
+                    screen tmux \
+                    ruby \
+                    vim \
+                    bc \
+                    man \
+                    ncurses-devel \
+                    zlib-devel \
+                    curl-devel \
+                    openssl-devel \
+                    which
+
+RUN yum install -y qt5*devel gtk2-devel
+
+RUN yum install -y  blas-devel \
+                    lapack-devel \
+                    atlas-devel \
+                    gcc-gfortran \
+                    tbb-devel \
+                    eigen3-devel \
+                    jasper-devel \
+                    libpng-devel \
+                    libtiff-devel \
+                    openexr-devel \
+                    libwebp-devel \
+                    libv4l-devel \
+                    libdc1394-devel \
+                    libv4l-devel \
+                    gstreamer-plugins-base-devel
+
+# C/C++ CMake Python
+RUN yum install -y  centos-release-scl && \
+    yum install -y  devtoolset-7-gcc* \
+                    devtoolset-7-valgrind \
+                    devtoolset-7-gdb \
+                    devtoolset-7-elfutils \
+                    clang \
+                    llvm-toolset-7 \
+                    llvm-toolset-7-cmake \
+                    rh-python36-python-devel \
+                    rh-python36-python-pip \
+                    rh-git29-git \
+                    devtoolset-7-make
+
+RUN echo "source scl_source enable devtoolset-7" >> /etc/bashrc
+RUN echo "source scl_source enable llvm-toolset-7" >> /etc/bashrc
+RUN echo "source scl_source enable rh-python36" >> /etc/bashrc
+RUN echo "source scl_source enable rh-git29" >> /etc/bashrc
+
+# Python libs & jupyter
+
+RUN source /etc/bashrc; pip3 install --upgrade pip
+RUN source /etc/bashrc; pip3 install numpy scipy matplotlib pandas \
+                                    tensorflow-gpu keras h5py tables \
+                                    scikit-image scikit-learn Pillow opencv-python \
+                                    jsonschema jinja2 tornado pyzmq ipython jupyter notebook
+
+# Install MPICH
+RUN  cd /root && wget -q http://www.mpich.org/static/downloads/3.2.1/mpich-3.2.1.tar.gz \
+  && tar xf mpich-3.2.1.tar.gz \
+  && rm mpich-3.2.1.tar.gz \
+  && cd mpich-3.2.1 \
+  && source /etc/bashrc; ./configure --prefix=/usr/local/mpich/install --disable-wrapper-rpath \
+  && make -j 4 install \
+  && cd .. \
+  && rm -rf mpich-3.2.1
+
+ENV PATH ${PATH}:/usr/local/mpich/install/bin
+ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/local/mpich/install/lib
+RUN env | sort
+
+# Install plasma-python (https://github.com/PPPLDeepLearning/plasma-python)
+# For 'pip'-based install: pip --no-cache-dir --disable-pip-version-check install -i https://testpypi.python.org/pypi plasma
+RUN cd /root && git clone https://github.com/PPPLDeepLearning/plasma-python \
+  && cd plasma-python \
+  && source /etc/bashrc; python setup.py install \
+  && cd ..
+
+# nccl2
+RUN cd /root && git clone https://github.com/NVIDIA/nccl.git \
+  && cd nccl \
+  && make -j src.build \
+  && make pkg.redhat.build \
+  && rpm -i build/pkg/rpm/x86_64/libnccl*
+
+# pip-install mpi4py
+RUN source /etc/bashrc; pip3 install mpi4py
+
+RUN yum install -y libffi libffi-devel
+
+RUN source /etc/bashrc; pip3 install tensorflow
+
+# Workaround to build horovod without needing cuda libraries available:
+# temporarily add stub drivers to ld.so.cache
+RUN ldconfig /usr/local/cuda/lib64/stubs \
+  && source /etc/bashrc; HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_NCCL_HOME=/nccl/build/ pip3 --no-cache-dir install horovod \
+  && ldconfig
+
+ENV NCCL_P2P_DISABLE 1
+```
+
+### Converting Docker to Singularity
+
+Needed to build a singularity image for Cooley... Used vagrant:
+
+```
+cd ~/vm-singularity/
+vagrant up
+vagrant ssh
+sudo singularity build centos7-cuda-tf1.12.0-plasma.simg docker://rjzamora/centos7-cuda-tf1.12.0.dimg:latest
+```
+
+
+
+
+
+
+
diff --git a/examples/conf.yaml b/examples/conf.yaml
index dfda1145..b2cba607 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -10,18 +10,18 @@ paths:
     signal_prepath: '/signal_data/' # /signal_data/jet/
     shot_list_dir: '/shot_lists/'
     tensorboard_save_path: '/Graph/'
-    data: d3d_data_0D # 'd3d_to_jet_data' # 'd3d_to_jet_data' #  'jet_to_d3d_data' # jet_data
+    data: d3d_data_0D
     # if specific_signals: [] left empty, it will use all valid signals defined on a machine. Only use if need a custom set
     specific_signals: [] # ['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile']
     executable: "mpi_learn.py"
     shallow_executable: "learn.py"
 
 data:
-    bleed_in: 0 # how many shots from the test sit to use in training?
+    bleed_in: 0 # how many shots from the test set to use in training?
     bleed_in_repeat_fac: 1 # how many times to repeat shots in training and validation?
     bleed_in_remove_from_test: True
     bleed_in_equalize_sets: False
-    # TODO(KGF): make next parameter use 'none' instead of None
+    # TODO(KGF): make next parameter use 'none' instead of None for consistency
     signal_to_augment: None # 'plasma current' # or None
     augmentation_mode: 'none'
     augment_during_training: False

From 8ecf571b2aea68752b1fd930746d319202109acd Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Fri, 22 Nov 2019 11:46:30 -0600
Subject: [PATCH 12/51] Convert YAML to 2 space indent from 4 spaces

---
 examples/conf.yaml | 259 +++++++++++++++++++++++----------------------
 1 file changed, 134 insertions(+), 125 deletions(-)

diff --git a/examples/conf.yaml b/examples/conf.yaml
index b2cba607..82832d49 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -7,135 +7,144 @@ target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
 num_gpus: 4  # per node
 
 paths:
-    signal_prepath: '/signal_data/' # /signal_data/jet/
-    shot_list_dir: '/shot_lists/'
-    tensorboard_save_path: '/Graph/'
-    data: d3d_data_0D
-    # if specific_signals: [] left empty, it will use all valid signals defined on a machine. Only use if need a custom set
-    specific_signals: [] # ['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile']
-    executable: "mpi_learn.py"
-    shallow_executable: "learn.py"
+  signal_prepath: '/signal_data/' # /signal_data/jet/
+  shot_list_dir: '/shot_lists/'
+  tensorboard_save_path: '/Graph/'
+  data: d3d_data_0D
+  # if specific_signals: [] left empty, it will use all valid signals defined on a machine. Only use if need a custom set
+  specific_signals: [] # ['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile']
+  executable: "mpi_learn.py"
+  shallow_executable: "learn.py"
 
 data:
-    bleed_in: 0 # how many shots from the test set to use in training?
-    bleed_in_repeat_fac: 1 # how many times to repeat shots in training and validation?
-    bleed_in_remove_from_test: True
-    bleed_in_equalize_sets: False
-    # TODO(KGF): make next parameter use 'none' instead of None for consistency
-    signal_to_augment: None # 'plasma current' # or None
-    augmentation_mode: 'none'
-    augment_during_training: False
-    cut_shot_ends: True
-    T_min_warn: 30
-    recompute: False
-    recompute_normalization: False
-    # specifies which of the signals in the signals_dirs order contains the plasma current info
-    current_index: 0
-    plotting: False
-    # how many shots to use
-    use_shots: 200000 # 1000 # 200000
-    positive_example_penalty: 1.0 # by what factor to upweight positive examples?
-    # normalization timescale
-    dt: 0.001
-    # maximum TTD considered
-    T_max: 1000.0
-    # The shortest works best so far: less overfitting. log TTd prediction also works well. 0.5 better than 0.2
-    T_warning: 1.024 # 1.024 # 1.024 # 0.512 # 0.25 # 1.0 # 1.0 # warning time in seconds
-    current_thresh: 750000
-    current_end_thresh: 10000
-    # the characteristic decay length of the decaying moving average window
-    window_decay: 2
-    # the width of the actual window
-    window_size: 10
-    # TODO(KGF): optimize the normalizer parameters
-    normalizer: 'var'
-    norm_stat_range: 100.0
-    equalize_classes: False
-    #  shallow_sample_prob: 0.01 # the fraction of samples with which to train the shallow model
-    floatx: 'float32'
+  bleed_in: 0 # how many shots from the test set to use in training?
+  bleed_in_repeat_fac: 1 # how many times to repeat shots in training and validation?
+  bleed_in_remove_from_test: True
+  bleed_in_equalize_sets: False
+  # TODO(KGF): make next parameter use 'none' instead of None for consistency
+  signal_to_augment: None # 'plasma current'
+  augmentation_mode: 'none'
+  augment_during_training: False
+  cut_shot_ends: True
+  T_min_warn: 30
+  recompute: False
+  recompute_normalization: False
+  # specifies which of the signals in the signals_dirs order contains the plasma current info
+  current_index: 0
+  plotting: False
+  # maximum number of shots to use
+  use_shots: 200000 # 1000
+  positive_example_penalty: 1.0 # by what factor to upweight positive examples?
+  # normalization timescale
+  dt: 0.001
+  # maximum TTD considered
+  T_max: 1000.0
+  # warning time in seconds
+  # The shortest works best so far: less overfitting. log TTd prediction also works well. 0.5 better than 0.2
+  T_warning: 1.024 # 0.512 # 0.25 # 1.0
+  current_thresh: 750000
+  current_end_thresh: 10000
+  # the characteristic decay length of the decaying moving average window
+  window_decay: 2
+  # the width of the actual window
+  window_size: 10
+  # TODO(KGF): optimize the normalizer parameters
+  normalizer: 'var'
+  norm_stat_range: 100.0
+  equalize_classes: False
+  # the fraction of samples with which to train the shallow model
+  #  shallow_sample_prob: 0.01
+  floatx: 'float32'
 
 model:
-    loss_scale_factor: 1.0
-    use_batch_norm: false
-    torch: False
-    shallow: False
-    shallow_model:
-        num_samples: 1000000 # 1000000 # the number of samples to use for training
-        type: "xgboost" # "xgboost" #"random_forest"
-        n_estimators: 100 # for random forest
-        max_depth: 3 # for random forest and xgboost (def = 3)
-        C: 1.0 # for svm
-        kernel: "rbf" # rbf, sigmoid, linear, poly, for svm
-        learning_rate: 0.1 # used in xgboost
-        scale_pos_weight: 10.0 # used in xgboost
-        final_hidden_layer_size: 10 # final layers has this many neurons, every layer before twice as many
-        num_hidden_layers: 3
-        learning_rate_mlp: 0.0001
-        mlp_regularization: 0.0001
-        skip_train: False # should a finished model be loaded if available
-    # length of LSTM memory
-    pred_length: 200
-    pred_batch_size: 128
-    # TODO(KGF): optimize length of LSTM memory
-    length: 128
-    skip: 1
-    # hidden layer size
-    # TODO(KGF): optimize size of RNN layers
-    rnn_size: 200
-    # size 100 slight overfitting, size 20 no overfitting. 200 is not better than 100. Prediction much better with size 100, size 20 cannot capture the data.
-    rnn_type: 'LSTM'
-    # TODO(KGF): optimize number of RNN layers
-    rnn_layers: 2
-    num_conv_filters: 128
-    size_conv_filters: 3
-    num_conv_layers: 3
-    pool_size: 2
-    dense_size: 128
-    extra_dense_input: False
-    # have not found a difference yet
-    optimizer: 'adam'
-    clipnorm: 10.0
-    regularization: 0.001
-    dense_regularization: 0.001
-    # lr=1e-4 is too high, 5e-7 is too low. 5e-5 seems best at 256 batch size, full dataset
-    # and ~10 epochs, and lr decay of 0.90
-    # lr=1e-4 also works well if we decay a lot (i.e ~0.7 or more)
-    lr: 0.00002 # 0.00001 # 0.0005 # for adam plots 0.0000001 # 0.00005 # 0.00005 # 0.00005
-    lr_decay: 0.97 # 0.98 # 0.9
-    stateful: True
-    return_sequences: True
-    dropout_prob: 0.1
-    # only relevant if we want to do MPI training. The number of steps with a single replica
-    warmup_steps: 0
-    ignore_timesteps: 100 # how many initial timesteps to ignore during evaluation (to let the internal state settle)
-    backend: 'tensorflow'
+  loss_scale_factor: 1.0
+  use_batch_norm: false
+  torch: False
+  shallow: False
+  shallow_model:
+    type: "xgboost" # "random_forest"
+    # the number of samples to use for training
+    num_samples: 1000000 # 1000000
+    n_estimators: 100 # used in random forest
+    max_depth: 3 # used in random forest and xgboost (def = 3)
+    C: 1.0 # used in svm
+    kernel: "rbf" # rbf, sigmoid, linear, poly, for svm
+    learning_rate: 0.1 # used in xgboost
+    scale_pos_weight: 10.0 # used in xgboost
+    # final layer has this many neurons, every layer before has twice as many
+    final_hidden_layer_size: 10
+    num_hidden_layers: 3
+    learning_rate_mlp: 0.0001
+    mlp_regularization: 0.0001
+    # should a finished model be loaded if available?
+    skip_train: False
+  # length of LSTM memory
+  pred_length: 200
+  pred_batch_size: 128
+  # TODO(KGF): optimize length of LSTM memory
+  length: 128
+  skip: 1
+  # hidden layer size
+  # TODO(KGF): optimize size of RNN layers
+  # size 100 slight overfitting, size 20 no overfitting. 200 is not better than 100.
+  # Prediction is much better with size 100, size 20 cannot capture the data.
+  rnn_size: 200
+  rnn_type: 'LSTM'
+  # TODO(KGF): optimize number of RNN layers
+  rnn_layers: 2
+  num_conv_filters: 128
+  size_conv_filters: 3
+  num_conv_layers: 3
+  pool_size: 2
+  dense_size: 128
+  extra_dense_input: False
+  # have not found a difference yet
+  optimizer: 'adam'
+  clipnorm: 10.0
+  regularization: 0.001
+  dense_regularization: 0.001
+  # lr=1e-4 is too high, 5e-7 is too low. 5e-5 seems best at 256 batch size, full dataset
+  # and ~10 epochs, and lr decay of 0.90
+  # lr=1e-4 also works well if we decay a lot (i.e ~0.7 or more)
+  lr: 0.00002 # 0.00001 # 0.0005 # for adam plots 0.0000001
+  lr_decay: 0.97 # 0.98 # 0.9
+  stateful: True
+  return_sequences: True
+  dropout_prob: 0.1
+  # only relevant if we want to do MPI training. The number of steps with a single replica
+  warmup_steps: 0
+  # how many initial timesteps to ignore during evaluation (to let the internal state settle)
+  ignore_timesteps: 100
+  backend: 'tensorflow'
 training:
-    as_array_of_shots: True
-    shuffle_training: True
-    train_frac: 0.75
-    validation_frac: 0.33
-    batch_size: 128 # 256
-    # THE MAX_PATCH_LENGTH WAS THE CULPRIT FOR NO TRAINING! Lower than 1000 performs very poorly
-    max_patch_length: 100000
-    # How many shots are we loading at once?
-    num_shots_at_once: 200
-    num_epochs: 1000   # large number = maximum number of epochs. Early stopping will occur if loss does not decrease
-    use_mock_data: False
-    data_parallel: False
-    hyperparam_tuning: False
-    batch_generator_warmup_steps: 0
-    use_process_generator: False
-    num_batches_minimum: 20 # minimum number of batches per epoch
-    ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
+  as_array_of_shots: True
+  shuffle_training: True
+  train_frac: 0.75
+  validation_frac: 0.33
+  batch_size: 128 # 256
+  # THE MAX_PATCH_LENGTH WAS THE CULPRIT FOR NO TRAINING! Lower than 1000 performs very poorly
+  max_patch_length: 100000
+  # How many shots are we loading at once?
+  num_shots_at_once: 200
+  # large number = maximum number of epochs.
+  # Early stopping will occur if loss does not decrease, after some patience # of epochs
+  num_epochs: 1000
+  use_mock_data: False
+  data_parallel: False
+  hyperparam_tuning: False
+  batch_generator_warmup_steps: 0
+  use_process_generator: False
+  num_batches_minimum: 20 # minimum number of batches per epoch
+  ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
 callbacks:
-    list: ['earlystop']
-    metrics: ['val_loss','val_roc','train_loss']
-    mode: 'max'
-    monitor: 'val_roc'
-    patience: 5
-    write_grads: False
-    monitor_test: True
-    monitor_times: [30,70,200,500,1000]
+  list: ['earlystop']
+  metrics: ['val_loss','val_roc','train_loss']
+  mode: 'max'
+  monitor: 'val_roc'
+  patience: 5
+  write_grads: False
+  monitor_test: True
+  monitor_times: [30,70,200,500,1000]
 env:
-    name: 'frnn'
-    type: 'anaconda'
+  name: 'frnn'
+  type: 'anaconda'

From 9585728645719eb1c66e9babe64ab5ef1462f05a Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Fri, 22 Nov 2019 11:48:37 -0600
Subject: [PATCH 13/51] Clarify comment

---
 plasma/primitives/shots.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index 3079b8b2..57130cb4 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -97,10 +97,12 @@ def split_train_test(self, conf):
         shuffle_training = conf['training']['shuffle_training']
         use_shots = conf['data']['use_shots']
         all_signals = conf['paths']['all_signals']
-        # split randomly
+        # split "maximum number of shots to use" into:
+        # test vs. (train U validate)
         use_shots_train = int(round(train_frac*use_shots))
         use_shots_test = int(round((1-train_frac)*use_shots))
         if len(shot_files_test) == 0:
+            # split randomly
             shot_list_train, shot_list_test = train_test_split(
                 self.shots, train_frac, shuffle_training)
         # train and test list given

From e2738ea0bc7d7e2fdd77d116961fb6aa72fde8f7 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Fri, 22 Nov 2019 23:10:50 -0500
Subject: [PATCH 14/51] Add comment warning about workaround for JET 0D CW ->
 ILW preprocessing

---
 data/signals.py                    | 23 +++++++++++------------
 examples/guarantee_preprocessed.py |  2 +-
 plasma/primitives/shots.py         |  6 ++++--
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/data/signals.py b/data/signals.py
index 4c1c9b8e..69636c00 100644
--- a/data/signals.py
+++ b/data/signals.py
@@ -85,12 +85,9 @@ def get_units(str):
     if found:
         if rank > 1:
             xdata = c.get('dim_of(_s,1)').data()
-            # xunits = get_units('dim_of(_s,1)')
             ydata = c.get('dim_of(_s)').data()
-            # yunits = get_units('dim_of(_s)')
         else:
             xdata = c.get('dim_of(_s)').data()
-            # xunits = get_units('dim_of(_s)')
 
     # MDSplus seems to return 2-D arrays transposed.  Change them back.
     if np.ndim(data) == 2:
@@ -114,16 +111,13 @@ def fetch_jet_data(signal_path, shot_num, c):
         data = c.get('_sig=jet("{}/",{})'.format(signal_path, shot_num)).data()
         if np.ndim(data) == 2:
             data = np.transpose(data)
-            time = c.get(
-                '_sig=dim_of(jet("{}/",{}),1)'.format(
-                    signal_path, shot_num)).data()
-            ydata = c.get(
-                '_sig=dim_of(jet("{}/",{}),0)'.format(
-                    signal_path, shot_num)).data()
+            time = c.get('_sig=dim_of(jet("{}/",{}),1)'.format(
+                signal_path, shot_num)).data()
+            ydata = c.get('_sig=dim_of(jet("{}/",{}),0)'.format(
+                signal_path, shot_num)).data()
         else:
-            time = c.get(
-                '_sig=dim_of(jet("{}/",{}))'.format(
-                    signal_path, shot_num)).data()
+            time = c.get('_sig=dim_of(jet("{}/",{}))'.format(
+                signal_path, shot_num)).data()
         found = True
     except Exception as e:
         g.print_unique(e)
@@ -345,6 +339,11 @@ def fetch_nstx_data(signal_path, shot_num, c):
     # 'tmamp1':tmamp1, 'tmamp2':tmamp2, 'tmfreq1':tmfreq1, 'tmfreq2':tmfreq2,
     # 'pechin':pechin,
     # 'rho_profile_spatial':rho_profile_spatial, 'etemp':etemp,
+    # -----
+    # TODO(KGF): replace this hacky workaround
+    # IMPORTANT: must comment-out the following line when preprocessing for
+    # training on JET CW and testing on JET ILW (FRNN 0D).
+    # Otherwise 1K+ CW shots are excluded due to missing profile data
     'etemp_profile': etemp_profile, 'edens_profile': edens_profile,
     # 'itemp_profile':itemp_profile, 'zdens_profile':zdens_profile,
     # 'trot_profile':trot_profile, 'pthm_profile':pthm_profile,
diff --git a/examples/guarantee_preprocessed.py b/examples/guarantee_preprocessed.py
index 310ed300..3fa75e02 100644
--- a/examples/guarantee_preprocessed.py
+++ b/examples/guarantee_preprocessed.py
@@ -11,4 +11,4 @@
 #####################################################
 np.random.seed(0)
 random.seed(0)
-guarantee_preprocessed(conf)
+guarantee_preprocessed(conf, verbose=True)
diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index 57130cb4..00de09b8 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -102,10 +102,12 @@ def split_train_test(self, conf):
         use_shots_train = int(round(train_frac*use_shots))
         use_shots_test = int(round((1-train_frac)*use_shots))
         if len(shot_files_test) == 0:
-            # split randomly
+            # split randomly, e.g. sample both sets from same distribution
+            # such as D3D test and train
             shot_list_train, shot_list_test = train_test_split(
                 self.shots, train_frac, shuffle_training)
-        # train and test list given
+        # train and test list given, e.g. they are sampled from separate
+        # distributions such as train=CW and test=ILW for JET
         else:
             shot_list_train = ShotList()
             shot_list_train.load_from_shot_list_files_objects(

From 6f7894d11d36e9e6f062dd907cb755857c6979a5 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Mon, 25 Nov 2019 17:46:14 -0500
Subject: [PATCH 15/51] Attempt to standardize diagnostics from
 guarantee_preprocessed.py

Now, every criterion for excluding a shot from the input raw shot lists
prints the string "[omit]" in its diagnostic message when it is satisfied.

This should make searching the piped output easier.
---
 plasma/primitives/data.py  | 28 +++++++++++++++-------------
 plasma/primitives/shots.py | 13 ++++++-------
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py
index 5d84fb0b..7b8e2ab9 100644
--- a/plasma/primitives/data.py
+++ b/plasma/primitives/data.py
@@ -65,24 +65,24 @@ def is_saved(self, prepath, shot):
     def load_data_from_txt_safe(self, prepath, shot, dtype='float32'):
         file_path = self.get_file_path(prepath, shot.machine, shot.number)
         if not self.is_saved(prepath, shot):
-            print('Signal {}, shot {} was never downloaded'.format(
+            print('Signal {}, shot {} was never downloaded [omit]'.format(
                 self.description, shot.number))
             return None, False
 
         if os.path.getsize(file_path) == 0:
             print('Signal {}, shot {} '.format(self.description, shot.number),
-                  'was downloaded incorrectly (empty file). Removing.')
+                  'was downloaded incorrectly (empty file) [omit]')
             os.remove(file_path)
             return None, False
         try:
             data = np.loadtxt(file_path, dtype=dtype)
             if np.all(data == get_missing_value_array()):
-                print('Signal {}, shot {} contains no data'.format(
+                print('Signal {}, shot {} contains no data [omit]'.format(
                     self.description, shot.number))
                 return None, False
         except Exception as e:
             print(e)
-            print('Couldnt load signal {} shot {}. Removing.'.format(
+            print('Cannot load signal {} shot {} [omit]'.format(
                 file_path, shot.number))
             os.remove(file_path)
             return None, False
@@ -103,7 +103,7 @@ def load_data(self, prepath, shot, dtype='float32'):
         if self.is_ip:  # restrict shot to current threshold
             region = np.where(np.abs(sig) >= shot.machine.current_threshold)[0]
             if len(region) == 0:
-                print('shot {} has no current'.format(shot.number))
+                print('Shot {} has no current [omit]'.format(shot.number))
                 return None, None, False
             first_idx = region[0]
             last_idx = region[-1]
@@ -120,15 +120,15 @@ def load_data(self, prepath, shot, dtype='float32'):
         # make sure shot is not garbage data
         if len(t) <= 1 or (np.max(sig) == 0.0 and np.min(sig) == 0.0):
             if self.is_ip:
-                print('shot {} has no current'.format(shot.number))
+                print('Shot {} has no current [omit]'.format(shot.number))
             else:
-                print('Signal {}, shot {} contains no data'.format(
+                print('Signal {}, shot {} contains no data [omit]'.format(
                     self.description, shot.number))
             return None, None, False
 
         # make sure data doesn't contain nan
         if np.any(np.isnan(t)) or np.any(np.isnan(sig)):
-            print('Signal {}, shot {} contains NAN'.format(
+            print('Signal {}, shot {} contains NaN [omit]'.format(
                 self.description, shot.number))
             return None, None, False
 
@@ -243,15 +243,16 @@ def load_data(self, prepath, shot, dtype='float32'):
             print('Signal {}, shot {} '.format(self.description, shot.number),
                   'should be profile but has only one channel. Possibly only ',
                   'one profile fit was run for the duration of the shot and ',
-                  'was transposed during downloading. Need at least 2.')
+                  'was transposed during downloading. Need at least 2 channels'
+                  ' [omit]')
             return None, None, False
         if len(t) <= 1 or (np.max(sig) == 0.0 and np.min(sig) == 0.0):
             print('Signal {}, shot {} '.format(self.description, shot.number),
-                  'contains no data.')
+                  'contains no data [omit]')
             return None, None, False
         if np.any(np.isnan(t)) or np.any(np.isnan(sig)):
             print('Signal {}, shot {} '.format(self.description, shot.number),
-                  'contains NaN value(s).')
+                  'contains NaN value(s) [omit]')
             return None, None, False
 
         timesteps = len(t)
@@ -268,7 +269,8 @@ def load_data(self, prepath, shot, dtype='float32'):
                 print('Signal {}, shot {} '.format(self.description,
                                                    shot.number),
                       'has insufficient points for linear interpolation. ',
-                      'dfitpack.error: (m>k) failed for hidden m: fpcurf0:m=1')
+                      'dfitpack.error: (m>k) failed for hidden m: fpcurf0:m=1 '
+                      '[omit]')
                 return None, None, False
 
         return t, sig_interp, True
@@ -349,7 +351,7 @@ def fetch_data(self, machine, shot_num, c):
         if channel_num is not None and success:
             if np.ndim(data) != 2:
                 print("Channel Signal {} expected 2D array for shot {}".format(
-                    self, self.shot_number))
+                    self, self.shot_number), ' [omit]')
                 success = False
             else:
                 data = data[channel_num, :]  # extract channel of interest
diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index 00de09b8..69f7d083 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -404,13 +404,13 @@ def get_signals_and_times_from_file(self, conf):
                 signal_arrays.append(sig)
                 time_arrays.append(t)
                 if self.is_disruptive and self.t_disrupt > np.max(t):
-                    t_max_total = (
-                        np.max(t) + signal.get_data_avail_tolerance(
-                            self.machine))
+                    t_max_total = (np.max(t)
+                                   + signal.get_data_avail_tolerance(
+                                       self.machine))
                     if (self.t_disrupt > t_max_total):
                         print('Shot {}: disruption event '.format(self.number),
                               'is not contained in valid time region of ',
-                              'signal {} by {}s, omitting.'.format(
+                              'signal {} by {}s [omit]'.format(
                                   self.number, signal,
                                   self.t_disrupt - np.max(t)))
                         valid = False
@@ -424,9 +424,8 @@ def get_signals_and_times_from_file(self, conf):
         dt = conf['data']['dt']
         if (t_max - t_min)/dt <= (2*conf['model']
                                   ['length']+conf['data']['T_min_warn']):
-            print(
-                'Shot {} contains insufficient data, omitting.'.format(
-                    self.number))
+            print('Shot {} contains insufficient data [omit]'.format(
+                self.number))
             valid = False
 
         assert(

From 36920a650f8d9a9055a41c4d1822311ebefb1bdb Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Mon, 25 Nov 2019 19:49:21 -0500
Subject: [PATCH 16/51] Spacing changes

---
 plasma/primitives/data.py  |  2 +-
 plasma/primitives/shots.py | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py
index 7b8e2ab9..0123588e 100644
--- a/plasma/primitives/data.py
+++ b/plasma/primitives/data.py
@@ -126,7 +126,7 @@ def load_data(self, prepath, shot, dtype='float32'):
                     self.description, shot.number))
             return None, None, False
 
-        # make sure data doesn't contain nan
+        # make sure data doesn't contain NaN values
         if np.any(np.isnan(t)) or np.any(np.isnan(sig)):
             print('Signal {}, shot {} contains NaN [omit]'.format(
                 self.description, shot.number))
diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index 69f7d083..07d4156f 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -318,10 +318,8 @@ def __eq__(self, other):
 
     def __hash__(self):
         import hashlib
-        return int(
-            hashlib.md5(
-                self.get_id_str().encode('utf-8')).hexdigest(),
-            16)
+        return int(hashlib.md5(
+            self.get_id_str().encode('utf-8')).hexdigest(), 16)
 
     def __str__(self):
         string = 'number: {}\n'.format(self.number)
@@ -422,8 +420,8 @@ def get_signals_and_times_from_file(self, conf):
 
         # make sure the shot is long enough.
         dt = conf['data']['dt']
-        if (t_max - t_min)/dt <= (2*conf['model']
-                                  ['length']+conf['data']['T_min_warn']):
+        if (t_max - t_min)/dt <= (2*conf['model']['length']
+                                  + conf['data']['T_min_warn']):
             print('Shot {} contains insufficient data [omit]'.format(
                 self.number))
             valid = False

From 01a2376717f60c82641ee22dd9a7835284531548 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Mon, 25 Nov 2019 19:03:54 -0600
Subject: [PATCH 17/51] Change validation_frac default from 0.33 to 1.0/3.0

---
 examples/conf.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/conf.yaml b/examples/conf.yaml
index 82832d49..9b07f8df 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -119,8 +119,9 @@ model:
 training:
   as_array_of_shots: True
   shuffle_training: True
+  # used iff 1) test & 2) (train U validate) are both sampled from the same distribution/source lists of shots:
   train_frac: 0.75
-  validation_frac: 0.33
+  validation_frac: 1.0/3.0
   batch_size: 128 # 256
   # THE MAX_PATCH_LENGTH WAS THE CULPRIT FOR NO TRAINING! Lower than 1000 performs very poorly
   max_patch_length: 100000

From 7586b68e71099d0e49d16ddeda022d157daa44cb Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Tue, 26 Nov 2019 11:43:31 -0500
Subject: [PATCH 18/51] Add comments

---
 plasma/utils/processing.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/plasma/utils/processing.py b/plasma/utils/processing.py
index 12938c1b..4781e3d0 100644
--- a/plasma/utils/processing.py
+++ b/plasma/utils/processing.py
@@ -68,9 +68,14 @@ def append_to_filename(path, to_append):
 
 
 def train_test_split(x, frac, do_shuffle=False):
+    # TODO(KGF): rename these 2x fns; used for generic ShotList.split_direct
     if not isinstance(x, np.ndarray):
+        print("x is not an instance of np.ndarray")
         return train_test_split_robust(x, frac, do_shuffle)
     mask = np.array(range(len(x))) < frac*len(x)
+    # Note, these functions do not directly split the "disruptive" subset of
+    # ShotLists; they are only applied to the overall sets and rely on random
+    # shuffling to produce the correct disruptive split in large N sample limit
     if do_shuffle:
         np.random.shuffle(mask)
     return x[mask], x[~mask]

From 9a5b0f416a9d287743ce0884f3ca402a324231a2 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Tue, 26 Nov 2019 11:16:21 -0600
Subject: [PATCH 19/51] Do not use Python expressions in conf.yaml values

---
 examples/conf.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/conf.yaml b/examples/conf.yaml
index 9b07f8df..b1b242be 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -1,4 +1,6 @@
 # conf.py will parse the yaml and extract parameters based on what is specified
+# note, the YAML parser will NOT evaluate expressions in the value fields.
+# e.g. "validation_frac: 1.0/3.0" will result in str value "1.0/3.0"
 
 # will do stuff in fs_path / [username] / signal_data | shot_lists | processed shots, etc.
 
@@ -121,7 +123,7 @@ training:
   shuffle_training: True
   # used iff 1) test & 2) (train U validate) are both sampled from the same distribution/source lists of shots:
   train_frac: 0.75
-  validation_frac: 1.0/3.0
+  validation_frac: 0.3333333333333333
   batch_size: 128 # 256
   # THE MAX_PATCH_LENGTH WAS THE CULPRIT FOR NO TRAINING! Lower than 1000 performs very poorly
   max_patch_length: 100000

From fda919fcefb39b87e62efcc266a7e515a47e87ce Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Tue, 26 Nov 2019 12:18:01 -0500
Subject: [PATCH 20/51] Remove leftover debug print()

---
 plasma/utils/processing.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/plasma/utils/processing.py b/plasma/utils/processing.py
index 4781e3d0..7a13b2c1 100644
--- a/plasma/utils/processing.py
+++ b/plasma/utils/processing.py
@@ -70,7 +70,6 @@ def append_to_filename(path, to_append):
 def train_test_split(x, frac, do_shuffle=False):
     # TODO(KGF): rename these 2x fns; used for generic ShotList.split_direct
     if not isinstance(x, np.ndarray):
-        print("x is not an instance of np.ndarray")
         return train_test_split_robust(x, frac, do_shuffle)
     mask = np.array(range(len(x))) < frac*len(x)
     # Note, these functions do not directly split the "disruptive" subset of

From 7712e2ee07f010f4db0769365ec9edf262970931 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Tue, 26 Nov 2019 13:33:44 -0600
Subject: [PATCH 21/51] Standardize [omit] diagnostics in Normalizer and
 downloading.py

---
 plasma/preprocessor/normalize.py  | 4 ++--
 plasma/preprocessor/preprocess.py | 4 ++--
 plasma/primitives/shots.py        | 2 +-
 plasma/utils/downloading.py       | 5 ++---
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/plasma/preprocessor/normalize.py b/plasma/preprocessor/normalize.py
index 19606121..ae5a8adc 100644
--- a/plasma/preprocessor/normalize.py
+++ b/plasma/preprocessor/normalize.py
@@ -244,7 +244,7 @@ def extract_stats(self, shot):
                                     (1, num_signals))
             stats.is_disruptive = shot.is_disruptive
         else:
-            print('Warning: shot {} not valid, omitting'.format(shot.number))
+            print('Warning: shot {} not valid [omit]'.format(shot.number))
         stats.valid = shot.valid
         stats.machine = shot.machine
         return stats
@@ -397,7 +397,7 @@ def extract_stats(self, shot):
             stats.maximums = np.array([np.max(sig) for sig in list_of_signals])
             stats.is_disruptive = shot.is_disruptive
         else:
-            print('Warning: shot {} not valid, omitting'.format(shot.number))
+            print('Warning: shot {} not valid [omit]'.format(shot.number))
         stats.valid = shot.valid
         stats.machine = shot.machine
         return stats
diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
index 18cf3114..9672fd6d 100644
--- a/plasma/preprocessor/preprocess.py
+++ b/plasma/preprocessor/preprocess.py
@@ -225,7 +225,7 @@ def apply_bleed_in(conf, shot_list_train, shot_list_validate, shot_list_test):
         #         if conf['data']['bleed_in_remove_from_test']:
         #             shot_list_test.remove(s)
         # else:
-        #     print('No disruptive shots in test set, omitting bleed in')
+        #     print('No disruptive shots in test set, [omit] bleed in')
         # if num_nd > 0:
         #     for i in range(num):
         #         s = shot_list_test.sample_single_class(False)
@@ -234,7 +234,7 @@ def apply_bleed_in(conf, shot_list_train, shot_list_validate, shot_list_test):
         #         if conf['data']['bleed_in_remove_from_test']:
         #             shot_list_test.remove(s)
         # else:
-        #     print('No nondisruptive shots in test set, omitting bleed in')
+        #     print('No nondisruptive shots in test set, [omit] bleed in')
     return shot_list_train, shot_list_validate, shot_list_test
 
 
diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index 07d4156f..0f15fe89 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -260,7 +260,7 @@ def append_if_valid(self, shot):
             self.append(shot)
             return True
         else:
-            # print('Warning: shot {} not valid, omitting'.format(shot.number))
+            # print('Warning: shot {} not valid [omit]'.format(shot.number))
             return False
 
 
diff --git a/plasma/utils/downloading.py b/plasma/utils/downloading.py
index ae940c66..e5cc7655 100644
--- a/plasma/utils/downloading.py
+++ b/plasma/utils/downloading.py
@@ -181,9 +181,8 @@ def download_all_shot_numbers(prepath, save_path, shot_list_files,
     signals = []
     for sig in signals_full:
         if not sig.is_defined_on_machine(machine):
-            print(
-                'Signal {} not defined on machine {}, omitting'.format(
-                    sig, machine))
+            print('Signal {} not defined on machine {} [omit]'.format(
+                sig, machine))
         else:
             signals.append(sig)
     save_prepath = prepath + save_path + '/'

From f53eeeea9c7fb68dda249983569eff9b538ee8c3 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Mon, 2 Dec 2019 12:42:06 -0600
Subject: [PATCH 22/51] Add comment

---
 plasma/primitives/shots.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index 0f15fe89..da6d2bfe 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -173,6 +173,7 @@ def sample_equal_classes(self):
         return self.sample_weighted_given_arr(p)
 
     def get_weights_d_nd(self):
+        # TODO(KGF): only called in above sample_equal_classes()
         num_total = len(self)
         num_d = self.num_disruptive()
         num_nd = num_total - num_d

From d31908fc47008591cade0bc4261f73ac209fa846 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 5 Dec 2019 06:49:18 +0000
Subject: [PATCH 23/51] Suppress the same NumPy deprecation warning in
 tensorboard

as in tensorflow. Occurs on ALCF Theta

numpy                     1.17.2
tensorboard               1.12.2                   pypi_0    pypi
tensorflow                1.12.0                   pypi_0    pypi
tensorflow-base           1.14.0          eigen_py36hf4a566f_0
tensorflow-estimator      1.14.0                     py_0

/home/felker/FRNN_project/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550:
  FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
---
 plasma/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plasma/__init__.py b/plasma/__init__.py
index 3be7f58e..f823703a 100644
--- a/plasma/__init__.py
+++ b/plasma/__init__.py
@@ -15,7 +15,7 @@
 warnings.filterwarnings('ignore',
                         category=FutureWarning,
                         message=r"passing \(type, 1\) or '1type' as a synonym of type is deprecated",  # noqa
-                        module="tensorflow")
+                        module="tensor*")
 
 # Optional: disable the C-based library diagnostic info and warning messages:
 # 2019-11-06 18:27:31.698908: I ...  dynamic library libcublas.so.10

From b0e5889aff13bc60d2926e8240a206938588afee Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 5 Dec 2019 00:02:36 -0800
Subject: [PATCH 24/51] Prefix processed_shots/*.npz filenames with "jet_",
 "d3d_" etc.

Currently, preprocessing dumps all machines/shotlists with the same
signal group hash into the same folder. There were no collisions in file
names because D3D and JET shot numbers do not currently overlap.

Unify the implementations of get_individual_shot_file() in
utils/processing.py (fairly confident that the warning comment about
globals being incompatible with multiprocessing is no longer valid).

Use os.path.join() instead of manual += '/'

Need to test these changes.
---
 plasma/preprocessor/normalize.py |  4 ----
 plasma/primitives/data.py        | 17 ++++++++++-------
 plasma/primitives/shots.py       | 14 ++++++--------
 plasma/utils/processing.py       |  9 +++++++--
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/plasma/preprocessor/normalize.py b/plasma/preprocessor/normalize.py
index ae5a8adc..eede4247 100644
--- a/plasma/preprocessor/normalize.py
+++ b/plasma/preprocessor/normalize.py
@@ -465,10 +465,6 @@ def load_stats(self, verbose=False):
             self.print_summary()
 
 
-def get_individual_shot_file(prepath, shot_num, ext='.txt'):
-    return prepath + str(shot_num) + ext
-
-
 def apply_positivity(shot):
     for (i, sig) in enumerate(shot.signals):
         if hasattr(sig, "is_strictly_positive"):
diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py
index 0123588e..9204f3c8 100644
--- a/plasma/primitives/data.py
+++ b/plasma/primitives/data.py
@@ -50,9 +50,10 @@ def is_ip(self):
         return self.is_ip
 
     def get_file_path(self, prepath, machine, shot_number):
-        dirname = self.get_path(machine)
-        return get_individual_shot_file(prepath + '/' + machine.name + '/'
-                                        + dirname + '/', shot_number)
+        signal_dirname = self.get_path(machine)
+        dirname = os.path.join(prepath, machine.name, signal_dirname)
+        return get_individual_shot_file(dirname, machine.name, shot_number,
+                                        raw_signal=True)
 
     def is_valid(self, prepath, shot, dtype='float32'):
         t, data, exists = self.load_data(prepath, shot, dtype)
@@ -358,12 +359,14 @@ def fetch_data(self, machine, shot_num, c):
         return time, data, mapping, success
 
     def get_file_path(self, prepath, machine, shot_number):
-        dirname = self.get_path(machine)
+        signal_dirname = self.get_path(machine)
         num = self.get_channel_num(machine)
         if num is not None:
-            dirname += "/channel{}".format(num)
-        return get_individual_shot_file(prepath + '/' + machine.name + '/'
-                                        + dirname + '/', shot_number)
+            # TODO(KGF): deduplicate with parent class fn. Only difference:
+            signal_dirname += "/channel{}".format(num)
+        dirname = os.path.join(prepath, machine.name, signal_dirname)
+        return get_individual_shot_file(dirname, machine.name, shot_number,
+                                        raw_signal=True)
 
 
 class Machine(object):
diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index da6d2bfe..6570977b 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -15,7 +15,10 @@
 import random as rnd
 import numpy as np
 
-from plasma.utils.processing import train_test_split, cut_and_resample_signal
+from plasma.utils.processing import (
+    train_test_split, cut_and_resample_signal,
+    get_individual_shot_file
+    )
 from plasma.utils.downloading import makedirs_process_safe
 
 
@@ -475,7 +478,8 @@ def save(self, prepath):
         print('...saved shot {}'.format(self.number))
 
     def get_save_path(self, prepath):
-        return get_individual_shot_file(prepath, self.number, '.npz')
+        return get_individual_shot_file(prepath, self.machine, self.number,
+                                        '.npz')
 
     def restore(self, prepath, light=False):
         assert self.previously_saved(prepath), 'shot was never saved'
@@ -503,9 +507,3 @@ def make_light(self):
     @staticmethod
     def is_disruptive_given_disruption_time(t):
         return t >= 0
-
-# it used to be in utilities, but can't import globals in multiprocessing
-
-
-def get_individual_shot_file(prepath, shot_num, ext='.txt'):
-    return prepath + str(shot_num) + ext
diff --git a/plasma/utils/processing.py b/plasma/utils/processing.py
index 7a13b2c1..9950e5d6 100644
--- a/plasma/utils/processing.py
+++ b/plasma/utils/processing.py
@@ -57,8 +57,13 @@ def cut_and_resample_signal(t, sig, tmin, tmax, dt, precision_str):
     return resample_signal(t, sig, tmin, tmax, dt, precision_str)
 
 
-def get_individual_shot_file(prepath, shot_num, ext='.txt'):
-    return prepath + str(shot_num) + ext
+def get_individual_shot_file(prepath, machine, shot_num, raw_signal=False,
+                             ext='.txt'):
+    """Return filepath of raw input .txt shot signal or processed .npz shot"""
+    if raw_signal:
+        return prepath + str(shot_num) + ext
+    else:
+        return prepath + machine + '_' + str(shot_num) + ext
 
 
 def append_to_filename(path, to_append):

From 677b00bbbc555a17eac0821c9d9a948cf2e2b236 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 5 Dec 2019 03:50:37 -0500
Subject: [PATCH 25/51] Fix bugs in parent commit

---
 plasma/primitives/shots.py | 2 +-
 plasma/utils/processing.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py
index 6570977b..945f9d9c 100644
--- a/plasma/primitives/shots.py
+++ b/plasma/primitives/shots.py
@@ -479,7 +479,7 @@ def save(self, prepath):
 
     def get_save_path(self, prepath):
         return get_individual_shot_file(prepath, self.machine, self.number,
-                                        '.npz')
+                                        ext='.npz')
 
     def restore(self, prepath, light=False):
         assert self.previously_saved(prepath), 'shot was never saved'
diff --git a/plasma/utils/processing.py b/plasma/utils/processing.py
index 9950e5d6..532b5a8b 100644
--- a/plasma/utils/processing.py
+++ b/plasma/utils/processing.py
@@ -10,7 +10,7 @@
 
 from __future__ import print_function
 import itertools
-
+import os
 import numpy as np
 # from scipy.interpolate import UnivariateSpline
 
@@ -61,9 +61,9 @@ def get_individual_shot_file(prepath, machine, shot_num, raw_signal=False,
                              ext='.txt'):
     """Return filepath of raw input .txt shot signal or processed .npz shot"""
     if raw_signal:
-        return prepath + str(shot_num) + ext
+        return os.path.join(prepath, str(shot_num) + ext)
     else:
-        return prepath + machine + '_' + str(shot_num) + ext
+        return os.path.join(prepath, str(machine) + '_' + str(shot_num) + ext)
 
 
 def append_to_filename(path, to_append):

From c1c645f1b005cfe96ecdeeed84b0e8ad4434790f Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Sat, 7 Dec 2019 17:02:49 -0500
Subject: [PATCH 26/51] Add support for CuDNNLSTM; dump model params to ONNX

- Consider wrapping import onnx, etc. in try/except to make this an
  optional dependency that automatically runs if installed
- Specify Opset=10, for now
- Only add dropout parameters to RNN layer if CuDNNLSTM is not used

- ONNX conversion will not fail fatally if op is not supported. Need to
  evaluate if CuDNNLSTM output is usable at all (or with non-GPU
  inference), given the following warning that is emitted:

WARNING:tensorflow:From
/home/kfelker/.conda/envs/frnn/lib/python3.7/site-packages/keras2onnx/subgraph.py:156:
tensor_shape_from_node_def_name (from
tensorflow.python.framework.graph_util_impl) is deprecated and will be
removed in a future version.
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
Cannot infer shape for TFNodes1/cu_dnnlstm_1/CudnnRNN:
TFNodes1/cu_dnnlstm_1/CudnnRNN:3
Tensorflow op [TFNodes1/cu_dnnlstm_1/CudnnRNN: CudnnRNN] is not
supported
Unsupported ops: Counter({'CudnnRNN': 1})
Cannot infer shape for TFNodes/cu_dnnlstm_2/CudnnRNN:
TFNodes/cu_dnnlstm_2/CudnnRNN:3
Tensorflow op [TFNodes/cu_dnnlstm_2/CudnnRNN: CudnnRNN] is not supported
Unsupported ops: Counter({'CudnnRNN': 1})
---
 plasma/models/builder.py | 205 ++++++++++++++++++++++-----------------
 1 file changed, 118 insertions(+), 87 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index e47f6588..77bd6630 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -3,13 +3,14 @@
 # KGF: the first time Keras is ever imported via mpi_learn.py -> mpi_runner.py
 import keras.backend as K
 # KGF: see below synchronization--- output is launched here
-from keras.models import Sequential, Model
+from keras.models import Model  # , Sequential
+# KGF: (was used only in hyper_build_model())
 from keras.layers import Input
 from keras.layers.core import (
     Dense, Activation, Dropout, Lambda,
     Reshape, Flatten, Permute,  # RepeatVector
     )
-from keras.layers import LSTM, SimpleRNN, BatchNormalization
+from keras.layers import LSTM, CuDNNLSTM, SimpleRNN, BatchNormalization
 from keras.layers.convolutional import Convolution1D
 from keras.layers.pooling import MaxPooling1D
 # from keras.utils.data_utils import get_file
@@ -25,6 +26,9 @@
 from copy import deepcopy
 from plasma.utils.downloading import makedirs_process_safe
 from plasma.utils.hashing import general_object_hash
+# TODO(KGF): perhaps relax the requirement of thse dependencies with try/except
+import keras2onnx
+import onnx
 
 # Synchronize 2x stderr msg from TensorFlow initialization via Keras backend
 # "Succesfully opened dynamic library... libcudart" "Using TensorFlow backend."
@@ -128,6 +132,8 @@ def build_model(self, predict, custom_batch_size=None):
 
         if rnn_type == 'LSTM':
             rnn_model = LSTM
+        elif rnn_type == 'CuDNNLSTM':
+            rnn_model = CuDNNLSTM
         elif rnn_type == 'SimpleRNN':
             rnn_model = SimpleRNN
         else:
@@ -257,14 +263,20 @@ def slicer_output_shape(input_shape, indices):
         # pre_rnn_model.summary()
         x_input = Input(batch_shape=batch_input_shape)
         x_in = TimeDistributed(pre_rnn_model)(x_input)
+        model_kwargs = dict(return_sequences=return_sequences,
+                            # batch_input_shape=batch_input_shape,
+                            stateful=stateful,
+                            kernel_regularizer=l2(regularization),
+                            recurrent_regularizer=l2(regularization),
+                            bias_regularizer=l2(regularization),
+                            )
+        if rnn_type != 'CuDNNLSTM':
+            # Dropout is unsupported in CuDNN library
+            model_kwargs['dropout'] = dropout_prob
+            model_kwargs['recurrent_dropout'] = dropout_prob
+            # LSTM in ONNX: "The maximum opset needed by this model is only 9."
         for _ in range(model_conf['rnn_layers']):
-            x_in = rnn_model(
-                rnn_size, return_sequences=return_sequences,
-                # batch_input_shape=batch_input_shape,
-                stateful=stateful, kernel_regularizer=l2(regularization),
-                recurrent_regularizer=l2(regularization),
-                bias_regularizer=l2(regularization), dropout=dropout_prob,
-                recurrent_dropout=dropout_prob)(x_in)
+            x_in = rnn_model(rnn_size, **model_kwargs)(x_in)
             x_in = Dropout(dropout_prob)(x_in)
         if return_sequences:
             # x_out = TimeDistributed(Dense(100,activation='tanh')) (x_in)
@@ -292,16 +304,28 @@ def build_train_test_models(self):
     def save_model_weights(self, model, epoch):
         save_path = self.get_save_path(epoch)
         model.save_weights(save_path, overwrite=True)
+        try:
+            save_path = self.get_save_path(epoch, ext='onnx')
+            onnx_model = keras2onnx.convert_keras(model, model.name,
+                                                  target_opset=10)
+            onnx.save_model(onnx_model, save_path)
+        except Exception as e:
+            print(e)
+            return
 
     def delete_model_weights(self, model, epoch):
         save_path = self.get_save_path(epoch)
         assert(os.path.exists(save_path))
         os.remove(save_path)
 
-    def get_save_path(self, epoch):
+    def get_save_path(self, epoch, ext='h5'):
         unique_id = self.get_unique_id()
-        return (self.conf['paths']['model_save_path']
-                + 'model.{}._epoch_.{}.h5'.format(unique_id, epoch))
+        dir_path = self.conf['paths']['model_save_path']
+        # TODO(KGF): consider storing .onnx files in subdirectory away from .h5
+        # if ext == 'onnx':
+        #     os.path.join(dir_path, 'onnx/')
+        return os.path.join(
+            dir_path, 'model.{}._epoch_.{}.{}'.format(unique_id, epoch, ext))
 
     def ensure_save_directory(self):
         prepath = self.conf['paths']['model_save_path']
@@ -327,28 +351,30 @@ def load_model_weights(self, model, custom_path=None):
             g.write_all("Loading from custom epoch {}\n".format(epoch))
             return epoch
 
-    # TODO(KGF): method only called in non-MPI runner.py. Deduplicate?
-    def get_latest_save_path(self):
-        epochs = self.get_all_saved_files()
-        if len(epochs) == 0:
-            print('no previous checkpoint found')
-            return ''
-        else:
-            max_epoch = max(epochs)
-            print('loading from epoch {}'.format(max_epoch))
-            return self.get_save_path(max_epoch)
+    # TODO(KGF): method was only called in non-MPI runner.py. Remove.
+    # def get_latest_save_path(self):
+    #     epochs = self.get_all_saved_files()
+    #     if len(epochs) == 0:
+    #         print('no previous checkpoint found')
+    #         return ''
+    #     else:
+    #         max_epoch = max(epochs)
+    #         print('loading from epoch {}'.format(max_epoch))
+    #         return self.get_save_path(max_epoch)
 
     def extract_id_and_epoch_from_filename(self, filename):
         regex = re.compile(r'-?\d+')
         numbers = [int(x) for x in regex.findall(filename)]
-        assert(len(numbers) == 3)  # id,epoch number and extension
-        assert(numbers[2] == 5)  # .h5 extension
+        if filename[-3:] == '.h5':
+            assert len(numbers) == 3  # id, epoch number, and .h5 extension
+            assert numbers[2] == 5  # .h5 extension
         return numbers[0], numbers[1]
 
     def get_all_saved_files(self):
         self.ensure_save_directory()
         unique_id = self.get_unique_id()
         path = self.conf['paths']['model_save_path']
+        # TODO(KGF): should probably list only .h5 files, not ONNX, for now
         filenames = [name for name in os.listdir(path)
                      if os.path.isfile(os.path.join(path, name))]
         epochs = []
@@ -358,66 +384,71 @@ def get_all_saved_files(self):
                 epochs.append(epoch)
         return epochs
 
-    # FIXME this is essentially the ModelBuilder.build_model
-        # in the long run we want to replace the space dictionary with the
-        # regular conf file - I am sure there is a way to accomodate
-    def hyper_build_model(self, space, predict, custom_batch_size=None):
-        conf = self.conf
-        model_conf = conf['model']
-        rnn_size = model_conf['rnn_size']
-        rnn_type = model_conf['rnn_type']
-        regularization = model_conf['regularization']
-
-        dropout_prob = model_conf['dropout_prob']
-        length = model_conf['length']
-        pred_length = model_conf['pred_length']
-        # skip = model_conf['skip']
-        stateful = model_conf['stateful']
-        return_sequences = model_conf['return_sequences']
-        # model_conf['output_activation']
-        output_activation = conf['data']['target'].activation
-        num_signals = conf['data']['num_signals']
-
-        batch_size = self.conf['training']['batch_size']
-        if predict:
-            batch_size = self.conf['model']['pred_batch_size']
-            # so we can predict with one time point at a time!
-            if return_sequences:
-                length = pred_length
-            else:
-                length = 1
-
-        if custom_batch_size is not None:
-            batch_size = custom_batch_size
-
-        if rnn_type == 'LSTM':
-            rnn_model = LSTM
-        elif rnn_type == 'SimpleRNN':
-            rnn_model = SimpleRNN
-        else:
-            print('Unkown Model Type, exiting.')
-            exit(1)
-
-        batch_input_shape = (batch_size, length, num_signals)
-        model = Sequential()
-
-        for _ in range(model_conf['rnn_layers']):
-            model.add(
-                rnn_model(
-                    rnn_size,
-                    return_sequences=return_sequences,
-                    batch_input_shape=batch_input_shape,
-                    stateful=stateful,
-                    kernel_regularizer=l2(regularization),
-                    recurrent_regularizer=l2(regularization),
-                    bias_regularizer=l2(regularization),
-                    dropout=dropout_prob,
-                    recurrent_dropout=dropout_prob))
-            model.add(Dropout(space['Dropout']))
-        if return_sequences:
-            model.add(TimeDistributed(Dense(1, activation=output_activation)))
-        else:
-            model.add(Dense(1, activation=output_activation))
-        model.reset_states()
-
-        return model
+    # TODO(felker): remove the following code or use as template for DeepHyper
+    # plugin. Formerly was only used in single-GPU runner.py with hyperopt
+
+    # TODO(alexeys): this is essentially the ModelBuilder.build_model
+    # in the long run we want to replace the space dictionary with the
+    # regular conf file - I am sure there is a way to accommodate this
+    # def hyper_build_model(self, space, predict, custom_batch_size=None):
+    #     conf = self.conf
+    #     model_conf = conf['model']
+    #     rnn_size = model_conf['rnn_size']
+    #     rnn_type = model_conf['rnn_type']
+    #     regularization = model_conf['regularization']
+
+    #     dropout_prob = model_conf['dropout_prob']
+    #     length = model_conf['length']
+    #     pred_length = model_conf['pred_length']
+    #     # skip = model_conf['skip']
+    #     stateful = model_conf['stateful']
+    #     return_sequences = model_conf['return_sequences']
+    #     # model_conf['output_activation']
+    #     output_activation = conf['data']['target'].activation
+    #     num_signals = conf['data']['num_signals']
+
+    #     batch_size = self.conf['training']['batch_size']
+    #     if predict:
+    #         batch_size = self.conf['model']['pred_batch_size']
+    #         # so we can predict with one time point at a time!
+    #         if return_sequences:
+    #             length = pred_length
+    #         else:
+    #             length = 1
+
+    #     if custom_batch_size is not None:
+    #         batch_size = custom_batch_size
+
+    #     if rnn_type == 'LSTM':
+    #         rnn_model = CuDNNLSTM
+    #     elif rnn_type == 'SimpleRNN':
+    #         rnn_model = SimpleRNN
+    #     else:
+    #         print('Unkown Model Type, exiting.')
+    #         exit(1)
+
+    #     batch_input_shape = (batch_size, length, num_signals)
+    #     model = Sequential()
+
+    #     for _ in range(model_conf['rnn_layers']):
+    #         model.add(
+    #             rnn_model(
+    #                 rnn_size,
+    #                 return_sequences=return_sequences,
+    #                 batch_input_shape=batch_input_shape,
+    #                 stateful=stateful,
+    #                 kernel_regularizer=l2(regularization),
+    #                 recurrent_regularizer=l2(regularization),
+    #                 bias_regularizer=l2(regularization),
+    #                 # dropout=dropout_prob,
+    #                 # recurrent_dropout=dropout_prob
+    #             ))
+    #         model.add(Dropout(space['Dropout']))
+    #     if return_sequences:
+    #         model.add(TimeDistributed(Dense(1, activation=output_activation)
+    # ))
+    #     else:
+    #         model.add(Dense(1, activation=output_activation))
+    #     model.reset_states()
+
+    #     return model

From b11d7596526d28bedb3df570dea39aa717502be9 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Sat, 7 Dec 2019 17:10:49 -0500
Subject: [PATCH 27/51] Delete single-GPU Keras runner; comment out hyperopt
 driver for it

---
 examples/hyper_learn.py |  95 ++++----
 examples/learn.py       |   4 +-
 plasma/models/runner.py | 521 ----------------------------------------
 3 files changed, 51 insertions(+), 569 deletions(-)
 delete mode 100644 plasma/models/runner.py

diff --git a/examples/hyper_learn.py b/examples/hyper_learn.py
index 0dc41a8a..5186bebe 100644
--- a/examples/hyper_learn.py
+++ b/examples/hyper_learn.py
@@ -1,47 +1,48 @@
-from plasma.models import runner
-from plasma.models.loader import Loader
-
-import numpy as np
-from hyperopt import Trials, tpe
-
-from plasma.conf import conf
-from pprint import pprint
-pprint(conf)
-# from plasma.primitives.shots import Shot, ShotList
-# from plasma.models.runner import train, make_predictions,make_predictions_gpu
-
-if conf['data']['normalizer'] == 'minmax':
-    from plasma.preprocessor.normalize import MinMaxNormalizer as Normalizer
-elif conf['data']['normalizer'] == 'meanvar':
-    from plasma.preprocessor.normalize import MeanVarNormalizer as Normalizer
-elif conf['data']['normalizer'] == 'var':
-    # performs !much better than minmaxnormalizer
-    from plasma.preprocessor.normalize import VarNormalizer as Normalizer
-elif conf['data']['normalizer'] == 'averagevar':
-    # performs !much better than minmaxnormalizer
-    from plasma.preprocessor.normalize import (
-        AveragingVarNormalizer as Normalizer
-    )
-else:
-    print('unkown normalizer. exiting')
-    exit(1)
-
-np.random.seed(1)
-
-print("normalization", end='')
-nn = Normalizer(conf)
-nn.train()
-loader = Loader(conf, nn)
-shot_list_train, shot_list_validate, shot_list_test = loader.load_shotlists(
-    conf)
-print("...done")
-
-print('Training on {} shots, testing on {} shots'.format(
-    len(shot_list_train), len(shot_list_test)))
-
-specific_runner = runner.HyperRunner(conf, loader, shot_list_train)
-
-best_run, best_model = specific_runner.frnn_minimize(
-    algo=tpe.suggest, max_evals=2, trials=Trials())
-print(best_run)
-print(best_model)
+# from plasma.models import runner
+# from plasma.models.loader import Loader
+
+# import numpy as np
+# from hyperopt import Trials, tpe
+
+# from plasma.conf import conf
+# from pprint import pprint
+# pprint(conf)
+#  #from plasma.primitives.shots import Shot, ShotList
+#  #from plasma.models.runner import train, make_predictions
+#  ,make_predictions_gpu
+
+# if conf['data']['normalizer'] == 'minmax':
+#     from plasma.preprocessor.normalize import MinMaxNormalizer as Normalizer
+# elif conf['data']['normalizer'] == 'meanvar':
+#     from plasma.preprocessor.normalize import MeanVarNormalizer as Normalizer
+# elif conf['data']['normalizer'] == 'var':
+#     # performs !much better than minmaxnormalizer
+#     from plasma.preprocessor.normalize import VarNormalizer as Normalizer
+# elif conf['data']['normalizer'] == 'averagevar':
+#     # performs !much better than minmaxnormalizer
+#     from plasma.preprocessor.normalize import (
+#         AveragingVarNormalizer as Normalizer
+#     )
+# else:
+#     print('unkown normalizer. exiting')
+#     exit(1)
+
+# np.random.seed(1)
+
+# print("normalization", end='')
+# nn = Normalizer(conf)
+# nn.train()
+# loader = Loader(conf, nn)
+# shot_list_train, shot_list_validate, shot_list_test = loader.load_shotlists(
+#     conf)
+# print("...done")
+
+# print('Training on {} shots, testing on {} shots'.format(
+#     len(shot_list_train), len(shot_list_test)))
+
+# specific_runner = runner.HyperRunner(conf, loader, shot_list_train)
+
+# best_run, best_model = specific_runner.frnn_minimize(
+#     algo=tpe.suggest, max_evals=2, trials=Trials())
+# print(best_run)
+# print(best_model)
diff --git a/examples/learn.py b/examples/learn.py
index b1156130..0f89c71c 100644
--- a/examples/learn.py
+++ b/examples/learn.py
@@ -39,7 +39,9 @@
         train, make_predictions_and_evaluate_gpu
         )
 else:
-    from plasma.models.runner import train, make_predictions_and_evaluate_gpu
+    print('unknown driver. exiting')
+    exit(1)
+    # from plasma.models.runner import train, make_predictions_and_evaluate_gpu
 
 if conf['data']['normalizer'] == 'minmax':
     from plasma.preprocessor.normalize import MinMaxNormalizer as Normalizer
diff --git a/plasma/models/runner.py b/plasma/models/runner.py
deleted file mode 100644
index c2008817..00000000
--- a/plasma/models/runner.py
+++ /dev/null
@@ -1,521 +0,0 @@
-from plasma.utils.state_reset import reset_states
-from plasma.utils.evaluation import get_loss_from_list
-from plasma.utils.performance import PerformanceAnalyzer
-from plasma.utils.diagnostics import print_shot_list_sizes
-from plasma.models.loader import Loader, ProcessGenerator
-from plasma.conf import conf
-import pathos.multiprocessing as mp
-from functools import partial
-import os
-import time
-from hyperopt import hp, STATUS_OK
-import numpy as np
-import sys
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt  # noqa
-
-# if sys.version_info[0] < 3:
-#     from itertools import imap
-
-
-backend = conf['model']['backend']
-
-
-def train(conf, shot_list_train, shot_list_validate, loader,
-          shot_list_test=None):
-    loader.set_inference_mode(False)
-    np.random.seed(1)
-
-    validation_losses = []
-    validation_roc = []
-    training_losses = []
-    print_shot_list_sizes(shot_list_train, shot_list_validate)
-
-    if backend == 'tf' or backend == 'tensorflow':
-        first_time = "tensorflow" not in sys.modules
-        if first_time:
-            import tensorflow as tf
-            os.environ['KERAS_BACKEND'] = 'tensorflow'
-            from keras.backend.tensorflow_backend import set_session
-            config = tf.ConfigProto(device_count={"GPU": 1})
-            set_session(tf.Session(config=config))
-    else:
-        os.environ['KERAS_BACKEND'] = 'theano'
-        os.environ['THEANO_FLAGS'] = 'device=gpu,floatX=float32'
-        # import theano
-
-    from keras.utils.generic_utils import Progbar
-    from keras import backend as K
-    from plasma.models import builder
-
-    print('Build model...', end='')
-    specific_builder = builder.ModelBuilder(conf)
-    train_model = specific_builder.build_model(False)
-    print('Compile model', end='')
-    train_model.compile(optimizer=optimizer_class(),
-                        loss=conf['data']['target'].loss)
-    print('...done')
-
-    # load the latest epoch we did. Returns -1 if none exist yet
-    e = specific_builder.load_model_weights(train_model)
-    e_start = e
-    batch_generator = partial(
-        loader.training_batch_generator_partial_reset,
-        shot_list=shot_list_train)
-    batch_iterator = ProcessGenerator(batch_generator())
-
-    num_epochs = conf['training']['num_epochs']
-    # num_at_once = conf['training']['num_shots_at_once']
-    lr_decay = conf['model']['lr_decay']
-    print('{} epochs left to go'.format(num_epochs - 1 - e))
-    num_so_far_accum = 0
-    num_so_far = 0
-    num_total = np.inf
-
-    if conf['callbacks']['mode'] == 'max':
-        best_so_far = -np.inf
-        cmp_fn = max
-    else:
-        best_so_far = np.inf
-        cmp_fn = min
-
-    while e < (num_epochs - 1):
-        e += 1
-        print('\nEpoch {}/{}'.format(e+1, num_epochs))
-        pbar = Progbar(len(shot_list_train))
-
-        # TODO(KGF): check this fix; lr, tf were undefined in neglected
-        # serial runner.py, since mpi_runner.py has been the main tool
-        lr = conf['model']['lr']
-        # decay learning rate each epoch:
-        K.set_value(train_model.optimizer.lr, lr*lr_decay**(e))
-
-        num_batches_minimum = 100
-        num_batches_current = 0
-        training_losses_tmp = []
-
-        while (num_so_far < (e - e_start)*num_total
-               or num_batches_current < num_batches_minimum):
-            num_so_far_old = num_so_far
-            try:
-                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                 num_total, is_warmup_period) = next(batch_iterator)
-            except StopIteration:
-                print("Resetting batch iterator.")
-                num_so_far_accum = num_so_far
-                batch_iterator = ProcessGenerator(batch_generator())
-                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
-                 num_total, is_warmup_period) = next(batch_iterator)
-            if np.any(batches_to_reset):
-                reset_states(train_model, batches_to_reset)
-            if not is_warmup_period:
-                num_so_far = num_so_far_accum + num_so_far_curr
-                num_batches_current += 1
-                loss = train_model.train_on_batch(batch_xs, batch_ys)
-                training_losses_tmp.append(loss)
-                pbar.add(num_so_far - num_so_far_old,
-                         values=[("train loss", loss)])
-                loader.verbose = False  # True during the first iteration
-            else:
-                _ = train_model.predict(
-                    batch_xs, batch_size=conf['training']['batch_size'])
-
-        e = e_start + 1.0*num_so_far/num_total
-        sys.stdout.flush()
-        ave_loss = np.mean(training_losses_tmp)
-        training_losses.append(ave_loss)
-        specific_builder.save_model_weights(train_model, int(round(e)))
-
-        if conf['training']['validation_frac'] > 0.0:
-            print("prediction on GPU...")
-            _, _, _, roc_area, loss = make_predictions_and_evaluate_gpu(
-                conf, shot_list_validate, loader)
-            validation_losses.append(loss)
-            validation_roc.append(roc_area)
-
-            epoch_logs = {}
-            epoch_logs['val_roc'] = roc_area
-            epoch_logs['val_loss'] = loss
-            epoch_logs['train_loss'] = ave_loss
-            best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
-                                 best_so_far)
-            # only save model weights if quantity we are tracking is improving
-            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
-                print("Not saving model weights")
-                specific_builder.delete_model_weights(train_model,
-                                                      int(round(e)))
-
-            if conf['training']['ranking_difficulty_fac'] != 1.0:
-                (_, _, _, roc_area_train,
-                 loss_train) = make_predictions_and_evaluate_gpu(
-                     conf, shot_list_train, loader)
-                batch_iterator.__exit__()
-                batch_generator = partial(
-                    loader.training_batch_generator_partial_reset,
-                    shot_list=shot_list_train)
-                batch_iterator = ProcessGenerator(batch_generator())
-                num_so_far_accum = num_so_far
-
-        print('=========Summary========')
-        print('Training Loss Numpy: {:.3e}'.format(training_losses[-1]))
-        if conf['training']['validation_frac'] > 0.0:
-            print('Validation Loss: {:.3e}'.format(validation_losses[-1]))
-            print('Validation ROC: {:.4f}'.format(validation_roc[-1]))
-            if conf['training']['ranking_difficulty_fac'] != 1.0:
-                print('Train Loss: {:.3e}'.format(loss_train))
-                print('Train ROC: {:.4f}'.format(roc_area_train))
-
-    # plot_losses(conf,[training_losses],specific_builder,name='training')
-    if conf['training']['validation_frac'] > 0.0:
-        plot_losses(conf, [training_losses, validation_losses, validation_roc],
-                    specific_builder, name='training_validation_roc')
-    batch_iterator.__exit__()
-    print('...done')
-
-
-def optimizer_class():
-    from keras.optimizers import SGD, Adam, RMSprop, Nadam, TFOptimizer
-    # TODO(KGF): check this fix; lr, tf were undefined in neglected
-    # serial runner.py, since mpi_runner.py has been the main tool
-    import tensorflow as tf
-
-    if conf['model']['optimizer'] == 'sgd':
-        return SGD(lr=conf['model']['lr'], clipnorm=conf['model']['clipnorm'])
-    elif conf['model']['optimizer'] == 'momentum_sgd':
-        return SGD(lr=conf['model']['lr'], clipnorm=conf['model']['clipnorm'],
-                   decay=1e-6, momentum=0.9)
-    elif conf['model']['optimizer'] == 'tf_momentum_sgd':
-        return TFOptimizer(tf.train.MomentumOptimizer(
-            learning_rate=conf['model']['lr'], momentum=0.9))
-    elif conf['model']['optimizer'] == 'adam':
-        return Adam(lr=conf['model']['lr'], clipnorm=conf['model']['clipnorm'])
-    elif conf['model']['optimizer'] == 'tf_adam':
-        return TFOptimizer(tf.train.AdamOptimizer(
-            learning_rate=conf['model']['lr']))
-    elif conf['model']['optimizer'] == 'rmsprop':
-        return RMSprop(lr=conf['model']['lr'],
-                       clipnorm=conf['model']['clipnorm'])
-    elif conf['model']['optimizer'] == 'nadam':
-        return Nadam(lr=conf['model']['lr'],
-                     clipnorm=conf['model']['clipnorm'])
-    else:
-        print("Optimizer not implemented yet")
-        exit(1)
-
-
-class HyperRunner(object):
-    def __init__(self, conf, loader, shot_list):
-        self.loader = loader
-        self.shot_list = shot_list
-        self.conf = conf
-
-    # FIXME setup for hyperas search
-    def keras_fmin_fnct(self, space):
-        from plasma.models import builder
-
-        specific_builder = builder.ModelBuilder(self.conf)
-
-        train_model = specific_builder.hyper_build_model(space, False)
-        train_model.compile(optimizer=optimizer_class(),
-                            loss=conf['data']['target'].loss)
-
-        np.random.seed(1)
-        validation_losses = []
-        validation_roc = []
-        training_losses = []
-        shot_list_train, shot_list_validate = self.shot_list.split_direct(
-            1.0-conf['training']['validation_frac'], do_shuffle=True)
-
-        from keras.utils.generic_utils import Progbar
-        from keras import backend as K
-
-        num_epochs = self.conf['training']['num_epochs']
-        num_at_once = self.conf['training']['num_shots_at_once']
-        lr_decay = self.conf['model']['lr_decay']
-
-        resulting_dict = {'loss': None, 'status': STATUS_OK, 'model': None}
-
-        e = -1
-        # print("Current num_epochs {}".format(e))
-        while e < num_epochs-1:
-            e += 1
-            pbar = Progbar(len(shot_list_train))
-
-            shot_list_train.shuffle()
-            shot_sublists = shot_list_train.sublists(num_at_once)[:1]
-            training_losses_tmp = []
-            # TODO(KGF): check this fix; lr, tf were undefined in neglected
-            # serial runner.py, since mpi_runner.py has been the main tool
-            lr = conf['model']['lr']
-            K.set_value(train_model.optimizer.lr, lr*lr_decay**(e))
-            for (i, shot_sublist) in enumerate(shot_sublists):
-                X_list, y_list = self.loader.load_as_X_y_list(shot_sublist)
-                for j, (X, y) in enumerate(zip(X_list, y_list)):
-                    history = builder.LossHistory()
-                    train_model.fit(X, y,
-                                    batch_size=Loader.get_batch_size(self.conf['training']['batch_size'],  prediction_mode=False),  # noqa
-                                    epochs=1, shuffle=False, verbose=0,
-                                    validation_split=0.0, callbacks=[history])
-                    train_model.reset_states()
-                    train_loss = np.mean(history.losses)
-                    training_losses_tmp.append(train_loss)
-
-                    pbar.add(1.0*len(shot_sublist)/len(X_list),
-                             values=[("train loss", train_loss)])
-                    self.loader.verbose = False
-            sys.stdout.flush()
-            training_losses.append(np.mean(training_losses_tmp))
-            specific_builder.save_model_weights(train_model, e)
-            _, _, _, roc_area, loss = make_predictions_and_evaluate_gpu(
-                self.conf, shot_list_validate, self.loader)
-            print("Epoch: {}, loss: {}, validation_losses_size: {}".format(
-                e, loss, len(validation_losses)))
-            validation_losses.append(loss)
-            validation_roc.append(roc_area)
-            resulting_dict['loss'] = loss
-            resulting_dict['model'] = train_model
-            # print("Results {}, before
-            # {}".format(resulting_dict,id(resulting_dict)))
-
-        # print("Results {}, after
-        # {}".format(resulting_dict,id(resulting_dict)))
-        return resulting_dict
-
-    def get_space(self):
-        return {'Dropout': hp.uniform('Dropout', 0, 1), }
-
-    def frnn_minimize(self, algo, max_evals, trials, rseed=1337):
-        from hyperopt import fmin
-        best_run = fmin(self.keras_fmin_fnct, space=self.get_space(),
-                        algo=algo, max_evals=max_evals, trials=trials,
-                        rstate=np.random.RandomState(rseed))
-        best_model = None
-        for trial in trials:
-            vals = trial.get('misc').get('vals')
-            for key in vals.keys():
-                vals[key] = vals[key][0]
-            if (trial.get('misc').get('vals') == best_run
-                    and 'model' in trial.get('result').keys()):
-                best_model = trial.get('result').get('model')
-
-        return best_run, best_model
-
-
-def plot_losses(conf, losses_list, specific_builder, name=''):
-    unique_id = specific_builder.get_unique_id()
-    savedir = 'losses'
-    if not os.path.exists(savedir):
-        os.makedirs(savedir)
-
-    save_path = os.path.join(savedir, '{}_loss_{}.png'.format(name, unique_id))
-    plt.figure()
-    for losses in losses_list:
-        plt.semilogy(losses)
-    plt.xlabel('Epoch')
-    plt.ylabel('Loss')
-    plt.grid()
-    plt.savefig(save_path)
-
-
-def make_predictions(conf, shot_list, loader):
-    loader.set_inference_mode(True)
-
-    use_cores = max(1, mp.cpu_count()-2)
-
-    if backend == 'tf' or backend == 'tensorflow':
-        first_time = "tensorflow" not in sys.modules
-        if first_time:
-            import tensorflow as tf
-            os.environ['KERAS_BACKEND'] = 'tensorflow'
-            from keras.backend.tensorflow_backend import set_session
-            config = tf.ConfigProto(device_count={"CPU": use_cores})
-            set_session(tf.Session(config=config))
-    else:
-        os.environ['THEANO_FLAGS'] = 'device=cpu'
-        # import theano
-
-    from plasma.models.builder import ModelBuilder
-    specific_builder = ModelBuilder(conf)
-
-    y_prime = []
-    y_gold = []
-    disruptive = []
-
-    model = specific_builder.build_model(True)
-    model.compile(optimizer=optimizer_class(),
-                  loss=conf['data']['target'].loss)
-
-    specific_builder.load_model_weights(model)
-    model_save_path = specific_builder.get_latest_save_path()
-
-    start_time = time.time()
-    pool = mp.Pool(use_cores)
-    fn = partial(make_single_prediction, builder=specific_builder,
-                 loader=loader, model_save_path=model_save_path)
-
-    print('running in parallel on {} processes'.format(pool._processes))
-    for (i, (y_p, y, is_disruptive)) in enumerate(pool.imap(fn, shot_list)):
-        print('Shot {}/{}'.format(i, len(shot_list)))
-        sys.stdout.flush()
-        y_prime.append(y_p)
-        y_gold.append(y)
-        disruptive.append(is_disruptive)
-    pool.close()
-    pool.join()
-    print('Finished Predictions in {} seconds'.format(time.time()-start_time))
-    loader.set_inference_mode(False)
-    return y_prime, y_gold, disruptive
-
-
-def make_single_prediction(shot, specific_builder, loader, model_save_path):
-    loader.set_inference_mode(True)
-    model = specific_builder.build_model(True)
-    model.compile(optimizer=optimizer_class(),
-                  loss=conf['data']['target'].loss)
-
-    model.load_weights(model_save_path)
-    model.reset_states()
-    X, y = loader.load_as_X_y(shot, prediction_mode=True)
-    assert(X.shape[0] == y.shape[0])
-    y_p = model.predict(
-        X, batch_size=Loader.get_batch_size(conf['training']['batch_size'],
-                                            prediction_mode=True), verbose=0)
-    answer_dims = y_p.shape[-1]
-    if conf['model']['return_sequences']:
-        shot_length = y_p.shape[0]*y_p.shape[1]
-    else:
-        shot_length = y_p.shape[0]
-    y_p = np.reshape(y_p, (shot_length, answer_dims))
-    y = np.reshape(y, (shot_length, answer_dims))
-    is_disruptive = shot.is_disruptive_shot()
-    model.reset_states()
-    loader.set_inference_mode(False)
-    return y_p, y, is_disruptive
-
-
-def make_predictions_gpu(conf, shot_list, loader, custom_path=None):
-    loader.set_inference_mode(True)
-    if backend == 'tf' or backend == 'tensorflow':
-        first_time = "tensorflow" not in sys.modules
-        if first_time:
-            import tensorflow as tf
-            os.environ['KERAS_BACKEND'] = 'tensorflow'
-            from keras.backend.tensorflow_backend import set_session
-            config = tf.ConfigProto(device_count={"GPU": 1})
-            set_session(tf.Session(config=config))
-    else:
-        os.environ['THEANO_FLAGS'] = 'device=gpu,floatX=float32'
-        # import theano
-
-    from keras.utils.generic_utils import Progbar
-    from plasma.models.builder import ModelBuilder
-    specific_builder = ModelBuilder(conf)
-
-    y_prime = []
-    y_gold = []
-    disruptive = []
-
-    model = specific_builder.build_model(True)
-    model.compile(optimizer=optimizer_class(),
-                  loss=conf['data']['target'].loss)
-
-    specific_builder.load_model_weights(model, custom_path)
-    model.reset_states()
-
-    pbar = Progbar(len(shot_list))
-    shot_sublists = shot_list.sublists(conf['model']['pred_batch_size'],
-                                       do_shuffle=False, equal_size=True)
-    for (i, shot_sublist) in enumerate(shot_sublists):
-        X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)
-        # load data and fit on data
-        y_p = model.predict(X,
-                            batch_size=conf['model']['pred_batch_size'])
-        model.reset_states()
-        y_p = loader.batch_output_to_array(y_p)
-        y = loader.batch_output_to_array(y)
-        # cut arrays back
-        y_p = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y_p)]
-        y = [arr[:shot_lengths[j]] for (j, arr) in enumerate(y)]
-
-        pbar.add(1.0*len(shot_sublist))
-        loader.verbose = False  # True during the first iteration
-        y_prime += y_p
-        y_gold += y
-        disruptive += disr
-    y_prime = y_prime[:len(shot_list)]
-    y_gold = y_gold[:len(shot_list)]
-    disruptive = disruptive[:len(shot_list)]
-    loader.set_inference_mode(False)
-    return y_prime, y_gold, disruptive
-
-
-def make_predictions_and_evaluate_gpu(
-        conf, shot_list, loader, custom_path=None):
-    y_prime, y_gold, disruptive = make_predictions_gpu(
-        conf, shot_list, loader, custom_path)
-    analyzer = PerformanceAnalyzer(conf=conf)
-    roc_area = analyzer.get_roc_area(y_prime, y_gold, disruptive)
-    shot_list.set_weights(analyzer.get_shot_difficulty(
-        y_prime, y_gold, disruptive))
-    loss = get_loss_from_list(y_prime, y_gold, conf['data']['target'])
-    return y_prime, y_gold, disruptive, roc_area, loss
-
-
-def make_evaluations_gpu(conf, shot_list, loader):
-    loader.set_inference_mode(True)
-
-    if backend == 'tf' or backend == 'tensorflow':
-        first_time = "tensorflow" not in sys.modules
-        if first_time:
-            import tensorflow as tf
-            os.environ['KERAS_BACKEND'] = 'tensorflow'
-            from keras.backend.tensorflow_backend import set_session
-            config = tf.ConfigProto(device_count={"GPU": 1})
-            set_session(tf.Session(config=config))
-    else:
-        os.environ['THEANO_FLAGS'] = 'device=gpu,floatX=float32'
-        # import theano
-
-    from keras.utils.generic_utils import Progbar
-    from plasma.models.builder import ModelBuilder
-    specific_builder = ModelBuilder(conf)
-
-    # y_prime = []
-    # y_gold = []
-    # disruptive = []
-    batch_size = min(len(shot_list), conf['model']['pred_batch_size'])
-
-    pbar = Progbar(len(shot_list))
-    print('evaluating {} shots using batchsize {}'.format(
-        len(shot_list), batch_size))
-
-    shot_sublists = shot_list.sublists(batch_size, equal_size=False)
-    all_metrics = []
-    all_weights = []
-    for (i, shot_sublist) in enumerate(shot_sublists):
-        batch_size = len(shot_sublist)
-        model = specific_builder.build_model(
-            True, custom_batch_size=batch_size)
-        model.compile(optimizer=optimizer_class(),
-                      loss=conf['data']['target'].loss)
-
-        specific_builder.load_model_weights(model)
-        model.reset_states()
-        X, y, shot_lengths, disr = loader.load_as_X_y_pred(
-            shot_sublist, custom_batch_size=batch_size)
-        # load data and fit on data
-        all_metrics.append(model.evaluate(X, y, batch_size=batch_size,
-                                          verbose=False))
-        all_weights.append(batch_size)
-        model.reset_states()
-
-        pbar.add(1.0*len(shot_sublist))
-        loader.verbose = False  # True during the first iteration
-
-    if len(all_metrics) > 1:
-        print('evaluations all: {}'.format(all_metrics))
-    loss = np.average(all_metrics, weights=all_weights)
-    print('Evaluation Loss: {}'.format(loss))
-    loader.set_inference_mode(False)
-    return loss

From 7a72cca059dcedd04754632822c24fd956e786d4 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Wed, 11 Dec 2019 10:58:41 -0600
Subject: [PATCH 28/51] Remove "_data_", "_data" from names of dataset
 variables

---
 examples/conf.yaml                            |  8 +--
 plasma/conf_parser.py                         | 70 ++++++++++---------
 plasma/preprocessor/augment.py                |  5 +-
 ...ements-travis.txt => requirements-cpu.yaml |  0
 4 files changed, 42 insertions(+), 41 deletions(-)
 rename requirements-travis.txt => requirements-cpu.yaml (100%)

diff --git a/examples/conf.yaml b/examples/conf.yaml
index b1b242be..8ae41bba 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -7,25 +7,22 @@
 fs_path: '/tigress'
 target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
 num_gpus: 4  # per node
-
 paths:
   signal_prepath: '/signal_data/' # /signal_data/jet/
   shot_list_dir: '/shot_lists/'
   tensorboard_save_path: '/Graph/'
-  data: d3d_data_0D
+  data: d3d_0D
   # if specific_signals: [] left empty, it will use all valid signals defined on a machine. Only use if need a custom set
   specific_signals: [] # ['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile']
   executable: "mpi_learn.py"
   shallow_executable: "learn.py"
-
 data:
   bleed_in: 0 # how many shots from the test set to use in training?
   bleed_in_repeat_fac: 1 # how many times to repeat shots in training and validation?
   bleed_in_remove_from_test: True
   bleed_in_equalize_sets: False
-  # TODO(KGF): make next parameter use 'none' instead of None for consistency
   signal_to_augment: None # 'plasma current'
-  augmentation_mode: 'none'
+  augmentation_mode: None
   augment_during_training: False
   cut_shot_ends: True
   T_min_warn: 30
@@ -57,7 +54,6 @@ data:
   # the fraction of samples with which to train the shallow model
   #  shallow_sample_prob: 0.01
   floatx: 'float32'
-
 model:
   loss_scale_factor: 1.0
   use_batch_norm: false
diff --git a/plasma/conf_parser.py b/plasma/conf_parser.py
index 9c338fb7..aa0ef096 100644
--- a/plasma/conf_parser.py
+++ b/plasma/conf_parser.py
@@ -150,41 +150,41 @@ def parameters(input_file):
         #     nstx, params['paths']['shot_list_dir'],
         #     ['disrupt_nstx.txt'], 'nstx shots (all are disruptive')
 
-        if params['paths']['data'] == 'jet_data':
+        if params['paths']['data'] == 'jet_all':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.jet_signals
-        elif params['paths']['data'] == 'jet_data_0D':
+        elif params['paths']['data'] == 'jet_0D':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.jet_signals_0D
-        elif params['paths']['data'] == 'jet_data_1D':
+        elif params['paths']['data'] == 'jet_1D':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.jet_signals_1D
-        elif params['paths']['data'] == 'jet_data_late':
+        elif params['paths']['data'] == 'jet_late':
             params['paths']['shot_files'] = [jet_iterlike_wall_late]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = sig.jet_signals
-        elif params['paths']['data'] == 'jet_data_carbon_to_late_0D':
+        elif params['paths']['data'] == 'jet_carbon_to_late_0D':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall_late]
             params['paths']['use_signals_dict'] = sig.jet_signals_0D
-        elif params['paths']['data'] == 'jet_data_temp_profile':
+        elif params['paths']['data'] == 'jet_temp_profile':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = {
                 'etemp_profile': sig.etemp_profile}
-        elif params['paths']['data'] == 'jet_data_dens_profile':
+        elif params['paths']['data'] == 'jet_dens_profile':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = {
                 'edens_profile': sig.edens_profile}
-        elif params['paths']['data'] == 'jet_carbon_data':
+        elif params['paths']['data'] == 'jet_carbon_all':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = sig.jet_signals
-        elif params['paths']['data'] == 'jet_mixed_data':
+        elif params['paths']['data'] == 'jet_mixed_all':
             params['paths']['shot_files'] = [jet_full]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = sig.jet_signals
@@ -192,18 +192,20 @@ def parameters(input_file):
             params['paths']['shot_files'] = [jenkins_jet_carbon_wall]
             params['paths']['shot_files_test'] = [jenkins_jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.jet_signals
-        # jet data but with fully defined signals
-        elif params['paths']['data'] == 'jet_data_fully_defined':
+        # JET data but with fully defined signals
+        elif params['paths']['data'] == 'jet_fully_defined':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals
-        # jet data but with fully defined signals
-        elif params['paths']['data'] == 'jet_data_fully_defined_0D':
+        # JET data but with fully defined signals
+        elif params['paths']['data'] == 'jet_fully_defined_0D':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals_0D
-
-        elif params['paths']['data'] == 'd3d_data':
+        # ==================
+        # START D3D DATASETS
+        # ==================
+        elif params['paths']['data'] == 'd3d_all':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = {
@@ -224,7 +226,7 @@ def parameters(input_file):
                 'etemp_profile': sig.etemp_profile,
                 'edens_profile': sig.edens_profile,
             }
-        elif params['paths']['data'] == 'd3d_data_1D':
+        elif params['paths']['data'] == 'd3d_1D':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = {
@@ -232,7 +234,7 @@ def parameters(input_file):
                 'etemp_profile': sig.etemp_profile,
                 'edens_profile': sig.edens_profile,
             }
-        elif params['paths']['data'] == 'd3d_data_all_profiles':
+        elif params['paths']['data'] == 'd3d_all_profiles':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = {
@@ -248,7 +250,7 @@ def parameters(input_file):
                 'bootstrap_current_profile': sig.bootstrap_current_profile,
                 'q_psi_profile': sig.q_psi_profile,
             }
-        elif params['paths']['data'] == 'd3d_data_0D':
+        elif params['paths']['data'] == 'd3d_0D':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = {
@@ -267,10 +269,12 @@ def parameters(input_file):
                 'iptarget': sig.iptarget,
                 'iperr': sig.iperr,
             }
-        elif params['paths']['data'] == 'd3d_data_all':
-            params['paths']['shot_files'] = [d3d_full]
-            params['paths']['shot_files_test'] = []
-            params['paths']['use_signals_dict'] = sig.d3d_signals
+        # TODO(KGF): rename. Unlike JET, there are probably differences between
+        # sig.d3d_signals and the manually-defined sigs in above d3d_all
+        # elif params['paths']['data'] == 'd3d_all':
+        #     params['paths']['shot_files'] = [d3d_full]
+        #     params['paths']['shot_files_test'] = []
+        #     params['paths']['use_signals_dict'] = sig.d3d_signals
         elif params['paths']['data'] == 'jenkins_d3d':
             params['paths']['shot_files'] = [d3d_jenkins]
             params['paths']['shot_files_test'] = []
@@ -293,22 +297,22 @@ def parameters(input_file):
                 'edens_profile': sig.edens_profile,
             }
         # jet data but with fully defined signals
-        elif params['paths']['data'] == 'd3d_data_fully_defined':
+        elif params['paths']['data'] == 'd3d_fully_defined':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = sig.fully_defined_signals
         # jet data but with fully defined signals
-        elif params['paths']['data'] == 'd3d_data_fully_defined_0D':
+        elif params['paths']['data'] == 'd3d_fully_defined_0D':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = sig.fully_defined_signals_0D
-        elif params['paths']['data'] == 'd3d_data_temp_profile':
+        elif params['paths']['data'] == 'd3d_temp_profile':
             # jet data but with fully defined signals
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = {
                 'etemp_profile': sig.etemp_profile}  # fully_defined_signals_0D
-        elif params['paths']['data'] == 'd3d_data_dens_profile':
+        elif params['paths']['data'] == 'd3d_dens_profile':
             # jet data but with fully defined signals
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = []
@@ -316,31 +320,31 @@ def parameters(input_file):
                 'edens_profile': sig.edens_profile}  # fully_defined_signals_0D
 
         # cross-machine
-        elif params['paths']['data'] == 'jet_to_d3d_data':
+        elif params['paths']['data'] == 'jet_to_d3d':
             params['paths']['shot_files'] = [jet_full]
             params['paths']['shot_files_test'] = [d3d_full]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals
-        elif params['paths']['data'] == 'd3d_to_jet_data':
+        elif params['paths']['data'] == 'd3d_to_jet':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals
-        elif params['paths']['data'] == 'd3d_to_late_jet_data':
+        elif params['paths']['data'] == 'd3d_to_late_jet':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = [jet_iterlike_wall_late]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals
-        elif params['paths']['data'] == 'jet_to_d3d_data_0D':
+        elif params['paths']['data'] == 'jet_to_d3d_0D':
             params['paths']['shot_files'] = [jet_full]
             params['paths']['shot_files_test'] = [d3d_full]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals_0D
-        elif params['paths']['data'] == 'd3d_to_jet_data_0D':
+        elif params['paths']['data'] == 'd3d_to_jet_0D':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals_0D
-        elif params['paths']['data'] == 'jet_to_d3d_data_1D':
+        elif params['paths']['data'] == 'jet_to_d3d_1D':
             params['paths']['shot_files'] = [jet_full]
             params['paths']['shot_files_test'] = [d3d_full]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals_1D
-        elif params['paths']['data'] == 'd3d_to_jet_data_1D':
+        elif params['paths']['data'] == 'd3d_to_jet_1D':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals_1D
diff --git a/plasma/preprocessor/augment.py b/plasma/preprocessor/augment.py
index 9ecb4d43..fef7ae01 100644
--- a/plasma/preprocessor/augment.py
+++ b/plasma/preprocessor/augment.py
@@ -135,8 +135,9 @@ def augment(self, signal, strength=10):
         elif self.conf['data']['augmentation_mode'] == "zero":
             # if "set to zero" augmentation. Can control in conf.
             return signal*0.0
-        elif self.conf['data']['augmentation_mode'] == "none":
-            return signal  # if no augmentation. Should be the default in conf.
+        elif self.conf['data']['augmentation_mode'] is None:
+            # no augmentation should be the default in conf.yaml
+            return signal
         else:
             print("Unknown augmentation mode. Exiting")
             exit(-1)
diff --git a/requirements-travis.txt b/requirements-cpu.yaml
similarity index 100%
rename from requirements-travis.txt
rename to requirements-cpu.yaml

From 036b9fbb194f78d040c65e40cd9647aae2354387 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Wed, 11 Dec 2019 11:09:24 -0600
Subject: [PATCH 29/51] Consistency in cross-machine data variables

---
 plasma/conf_parser.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/plasma/conf_parser.py b/plasma/conf_parser.py
index aa0ef096..e29cb5fb 100644
--- a/plasma/conf_parser.py
+++ b/plasma/conf_parser.py
@@ -149,7 +149,9 @@ def parameters(input_file):
         # nstx_full = ShotListFiles(
         #     nstx, params['paths']['shot_list_dir'],
         #     ['disrupt_nstx.txt'], 'nstx shots (all are disruptive')
-
+        # ==================
+        # JET DATASETS
+        # ==================
         if params['paths']['data'] == 'jet_all':
             params['paths']['shot_files'] = [jet_carbon_wall]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
@@ -203,7 +205,7 @@ def parameters(input_file):
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals_0D
         # ==================
-        # START D3D DATASETS
+        # D3D DATASETS
         # ==================
         elif params['paths']['data'] == 'd3d_all':
             params['paths']['shot_files'] = [d3d_full]
@@ -318,13 +320,14 @@ def parameters(input_file):
             params['paths']['shot_files_test'] = []
             params['paths']['use_signals_dict'] = {
                 'edens_profile': sig.edens_profile}  # fully_defined_signals_0D
-
-        # cross-machine
-        elif params['paths']['data'] == 'jet_to_d3d':
+        # ======================
+        # CROSS-MACHINE DATASETS
+        # ======================
+        elif params['paths']['data'] == 'jet_to_d3d_all':
             params['paths']['shot_files'] = [jet_full]
             params['paths']['shot_files_test'] = [d3d_full]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals
-        elif params['paths']['data'] == 'd3d_to_jet':
+        elif params['paths']['data'] == 'd3d_to_jet_all':
             params['paths']['shot_files'] = [d3d_full]
             params['paths']['shot_files_test'] = [jet_iterlike_wall]
             params['paths']['use_signals_dict'] = sig.fully_defined_signals

From 61cd42cfd9219bd6c1a94aa0d48e3dad5d82d329 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 16 Dec 2019 11:44:45 -0600
Subject: [PATCH 30/51] Start storing machine-dependent module .cmd and Conda
 YAML in envs/

---
 envs/requirements-traverse.yaml | 23 +++++++++++++++++++++++
 envs/traverse-env.cmd           | 11 +++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 envs/requirements-traverse.yaml
 create mode 100644 envs/traverse-env.cmd

diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
new file mode 100644
index 00000000..2e003e0f
--- /dev/null
+++ b/envs/requirements-traverse.yaml
@@ -0,0 +1,23 @@
+name: frnn
+channels:
+  - https:////public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda
+  - defaults
+channel_priority: strict
+dependencies:
+  - python==3.6.8
+      - scipy
+      - pandas
+      - flake8
+      - h5py
+      - pyparsing
+      - pyyaml
+      - tensorflow-gpu>=1.3,<2.0.0
+  - pip:
+      - keras>=2.0.5
+      - pathos
+      - matplotlib>=2.0.2
+      - hyperopt  # TODO(KGF): remove
+      - mpi4py
+      - xgboost
+      - scikit-learn
+      - joblib
diff --git a/envs/traverse-env.cmd b/envs/traverse-env.cmd
new file mode 100644
index 00000000..b6131c88
--- /dev/null
+++ b/envs/traverse-env.cmd
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+module load anaconda3
+# must activate conda env before module loads
+conda activate frnn
+export OMPI_MCA_btl="tcp,self,vader"
+
+module load cudatoolkit
+module load cudnn/cuda-10.1/7.6.1
+module load openmpi/gcc/3.1.4/64
+module load hdf5/gcc/openmpi-3.1.4/1.10.5

From 212b0b0650f1e2b26c18c2e2b95de05ad0c17d52 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 16 Dec 2019 11:54:49 -0600
Subject: [PATCH 31/51] Reorganize CI files

---
 .travis.yml                                         | 6 ++++--
 {jenkins-ci => ci/jenkins}/jenkins.sh               | 0
 {jenkins-ci => ci/jenkins}/run_jenkins.py           | 0
 {jenkins-ci => ci/jenkins}/validate_jenkins.py      | 0
 {jenkins-ci => ci/jenkins}/validate_jenkins.sh      | 0
 install-mpi.sh => ci/travis/install-mpi.sh          | 0
 requirements-cpu.yaml => envs/requirements-cpu.yaml | 0
 envs/requirements-travis.txt                        | 8 ++++++++
 8 files changed, 12 insertions(+), 2 deletions(-)
 rename {jenkins-ci => ci/jenkins}/jenkins.sh (100%)
 rename {jenkins-ci => ci/jenkins}/run_jenkins.py (100%)
 rename {jenkins-ci => ci/jenkins}/validate_jenkins.py (100%)
 rename {jenkins-ci => ci/jenkins}/validate_jenkins.sh (100%)
 rename install-mpi.sh => ci/travis/install-mpi.sh (100%)
 rename requirements-cpu.yaml => envs/requirements-cpu.yaml (100%)
 create mode 100644 envs/requirements-travis.txt

diff --git a/.travis.yml b/.travis.yml
index 31c7f70b..d9e54e9c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,12 +34,12 @@ env:
 # before_install:
 
 install:
-  - sh install-mpi.sh
+  - sh ci/travis/install-mpi.sh
   - export MPI_PREFIX="${HOME}/opt/${MPI_LIBRARY}-${MPI_LIBRARY_VERSION}"
   - export PATH="${HOME}/.local/bin:${MPI_PREFIX}/bin${PATH:+":${PATH}"}"
   - export LD_LIBRARY_PATH="${MPI_PREFIX}/lib${LD_LIBRARY_PATH:+":${LD_LIBRARY_PATH}"}"
   - pip install --upgrade pip
-  - pip install -r requirements-travis.txt
+  - pip install -r envs/requirements-travis.txt
 
 # before_script:
 
@@ -54,6 +54,8 @@ stages:
 
 notifications:
   email:
+    recipients:
+      - felker@anl.gov
     on_success: change
     on_failure: always
   slack:
diff --git a/jenkins-ci/jenkins.sh b/ci/jenkins/jenkins.sh
similarity index 100%
rename from jenkins-ci/jenkins.sh
rename to ci/jenkins/jenkins.sh
diff --git a/jenkins-ci/run_jenkins.py b/ci/jenkins/run_jenkins.py
similarity index 100%
rename from jenkins-ci/run_jenkins.py
rename to ci/jenkins/run_jenkins.py
diff --git a/jenkins-ci/validate_jenkins.py b/ci/jenkins/validate_jenkins.py
similarity index 100%
rename from jenkins-ci/validate_jenkins.py
rename to ci/jenkins/validate_jenkins.py
diff --git a/jenkins-ci/validate_jenkins.sh b/ci/jenkins/validate_jenkins.sh
similarity index 100%
rename from jenkins-ci/validate_jenkins.sh
rename to ci/jenkins/validate_jenkins.sh
diff --git a/install-mpi.sh b/ci/travis/install-mpi.sh
similarity index 100%
rename from install-mpi.sh
rename to ci/travis/install-mpi.sh
diff --git a/requirements-cpu.yaml b/envs/requirements-cpu.yaml
similarity index 100%
rename from requirements-cpu.yaml
rename to envs/requirements-cpu.yaml
diff --git a/envs/requirements-travis.txt b/envs/requirements-travis.txt
new file mode 100644
index 00000000..75815e68
--- /dev/null
+++ b/envs/requirements-travis.txt
@@ -0,0 +1,8 @@
+# pip dependencies for Travis CI builds
+scipy
+pandas
+flake8
+h5py
+pyparsing
+pyyaml
+tensorflow-gpu>=1.3,<2.0.0

From 44772f9215401c4be8373ad6e18a983c3a6d9339 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 16 Dec 2019 12:00:12 -0600
Subject: [PATCH 32/51] Add TigerGPU build specs to envs/

The only meaningful difference from the Traverse Conda YAML is the
removal of the ppc64le IBM AI Conda channel.
---
 envs/requirements-tigergpu.yaml         | 22 ++++++++++++++++++++++
 envs/tigergpu.cmd                       | 11 +++++++++++
 envs/{traverse-env.cmd => traverse.cmd} |  0
 3 files changed, 33 insertions(+)
 create mode 100644 envs/requirements-tigergpu.yaml
 create mode 100644 envs/tigergpu.cmd
 rename envs/{traverse-env.cmd => traverse.cmd} (100%)

diff --git a/envs/requirements-tigergpu.yaml b/envs/requirements-tigergpu.yaml
new file mode 100644
index 00000000..fecc0ab9
--- /dev/null
+++ b/envs/requirements-tigergpu.yaml
@@ -0,0 +1,22 @@
+name: frnn
+channels:
+  - defaults
+channel_priority: strict
+dependencies:
+  - python==3.6.8
+      - scipy
+      - pandas
+      - flake8
+      - h5py
+      - pyparsing
+      - pyyaml
+      - tensorflow-gpu>=1.3,<2.0.0
+  - pip:
+      - keras>=2.0.5
+      - pathos
+      - matplotlib>=2.0.2
+      - hyperopt  # TODO(KGF): remove
+      - mpi4py
+      - xgboost
+      - scikit-learn
+      - joblib
diff --git a/envs/tigergpu.cmd b/envs/tigergpu.cmd
new file mode 100644
index 00000000..e0f05368
--- /dev/null
+++ b/envs/tigergpu.cmd
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+module load anaconda3
+# must activate conda env before module loads
+conda activate frnn
+export OMPI_MCA_btl="tcp,self,vader"  #sm"
+module load cudatoolkit
+module load cudnn
+
+module load openmpi/gcc/3.1.3/64
+module load hdf5/gcc/openmpi-1.10.2/1.10.0
diff --git a/envs/traverse-env.cmd b/envs/traverse.cmd
similarity index 100%
rename from envs/traverse-env.cmd
rename to envs/traverse.cmd

From 336135be2c4d495106726f3f645874cbd186ae24 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Mon, 16 Dec 2019 13:32:23 -0500
Subject: [PATCH 33/51] Fix 2x Conda YAML errors; comment-out channel_priority

channel_priority is not currently a valid field in a Conda environment
YAML file; see https://github.com/conda/conda/issues/8675 for status.
Until it is supported, set it in .condarc instead by running:
conda config --set channel_priority strict
---
 envs/requirements-tigergpu.yaml | 16 ++++++++--------
 envs/requirements-traverse.yaml | 19 ++++++++++---------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/envs/requirements-tigergpu.yaml b/envs/requirements-tigergpu.yaml
index fecc0ab9..88d79988 100644
--- a/envs/requirements-tigergpu.yaml
+++ b/envs/requirements-tigergpu.yaml
@@ -1,16 +1,16 @@
 name: frnn
 channels:
   - defaults
-channel_priority: strict
+#channel_priority: strict
 dependencies:
   - python==3.6.8
-      - scipy
-      - pandas
-      - flake8
-      - h5py
-      - pyparsing
-      - pyyaml
-      - tensorflow-gpu>=1.3,<2.0.0
+  - scipy
+  - pandas
+  - flake8
+  - h5py
+  - pyparsing
+  - pyyaml
+  - tensorflow-gpu>=1.3,<2.0.0
   - pip:
       - keras>=2.0.5
       - pathos
diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index 2e003e0f..6dd367da 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -1,17 +1,18 @@
 name: frnn
 channels:
-  - https:////public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda
+  - https://public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda
   - defaults
-channel_priority: strict
+# channel_priority: strict   # set in .condarc
 dependencies:
   - python==3.6.8
-      - scipy
-      - pandas
-      - flake8
-      - h5py
-      - pyparsing
-      - pyyaml
-      - tensorflow-gpu>=1.3,<2.0.0
+  - pip
+  - scipy
+  - pandas
+  - flake8
+  - h5py
+  - pyparsing
+  - pyyaml
+  - tensorflow-gpu>=1.3,<2.0.0
   - pip:
       - keras>=2.0.5
       - pathos

From 85939447936e5c29f00be917c2b99f30ba3b0f48 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 16 Dec 2019 12:40:42 -0600
Subject: [PATCH 34/51] Add pip to TigerGPU dependencies

Intentionally add PEP 8 style error in order to test Travis CI email
notifications on failed builds
---
 envs/requirements-tigergpu.yaml | 1 +
 plasma/global_vars.py           | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/envs/requirements-tigergpu.yaml b/envs/requirements-tigergpu.yaml
index 88d79988..e1232552 100644
--- a/envs/requirements-tigergpu.yaml
+++ b/envs/requirements-tigergpu.yaml
@@ -4,6 +4,7 @@ channels:
 #channel_priority: strict
 dependencies:
   - python==3.6.8
+  - pip
   - scipy
   - pandas
   - flake8
diff --git a/plasma/global_vars.py b/plasma/global_vars.py
index a07b0d7e..68e5ce8f 100644
--- a/plasma/global_vars.py
+++ b/plasma/global_vars.py
@@ -13,7 +13,7 @@
 
 def init_MPI():
     from mpi4py import MPI
-    global comm, task_index, num_workers
+    global comm, task_index, num_workers#
     comm = MPI.COMM_WORLD
     task_index = comm.Get_rank()
     num_workers = comm.Get_size()

From 5b44e442b120585849140331652e9a6bb312c5a6 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 16 Dec 2019 13:10:31 -0600
Subject: [PATCH 35/51] Fix intentional style error

---
 plasma/global_vars.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plasma/global_vars.py b/plasma/global_vars.py
index 68e5ce8f..a07b0d7e 100644
--- a/plasma/global_vars.py
+++ b/plasma/global_vars.py
@@ -13,7 +13,7 @@
 
 def init_MPI():
     from mpi4py import MPI
-    global comm, task_index, num_workers#
+    global comm, task_index, num_workers
     comm = MPI.COMM_WORLD
     task_index = comm.Get_rank()
     num_workers = comm.Get_size()

From f68f2f0c342656939b170311ca2984c73deceafa Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 16 Dec 2019 13:31:39 -0600
Subject: [PATCH 36/51] Do not install mpi4py via pip when building Conda env

---
 envs/requirements-tigergpu.yaml | 2 +-
 envs/requirements-traverse.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/envs/requirements-tigergpu.yaml b/envs/requirements-tigergpu.yaml
index e1232552..a78e5aef 100644
--- a/envs/requirements-tigergpu.yaml
+++ b/envs/requirements-tigergpu.yaml
@@ -17,7 +17,7 @@ dependencies:
       - pathos
       - matplotlib>=2.0.2
       - hyperopt  # TODO(KGF): remove
-      - mpi4py
+      # - mpi4py   # must reload MPI library modules before installing via pip
       - xgboost
       - scikit-learn
       - joblib
diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index 6dd367da..847317bb 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -18,7 +18,7 @@ dependencies:
       - pathos
       - matplotlib>=2.0.2
       - hyperopt  # TODO(KGF): remove
-      - mpi4py
+      # - mpi4py   # must reload MPI library modules before installing via pip
       - xgboost
       - scikit-learn
       - joblib

From 03f62395a0081f6ecbd0258af41b4a87a9bdbb7d Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 16 Dec 2019 13:33:20 -0600
Subject: [PATCH 37/51] TigerGPU conda env is generic for x86_64 Linux
 platforms with GPUs

---
 .travis.yml                                                     | 2 +-
 envs/{requirements-travis.txt => pip-requirements-travis.txt}   | 0
 ...equirements-tigergpu.yaml => requirements-linux-64-gpu.yaml} | 0
 3 files changed, 1 insertion(+), 1 deletion(-)
 rename envs/{requirements-travis.txt => pip-requirements-travis.txt} (100%)
 rename envs/{requirements-tigergpu.yaml => requirements-linux-64-gpu.yaml} (100%)

diff --git a/.travis.yml b/.travis.yml
index d9e54e9c..087fdb66 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -39,7 +39,7 @@ install:
   - export PATH="${HOME}/.local/bin:${MPI_PREFIX}/bin${PATH:+":${PATH}"}"
   - export LD_LIBRARY_PATH="${MPI_PREFIX}/lib${LD_LIBRARY_PATH:+":${LD_LIBRARY_PATH}"}"
   - pip install --upgrade pip
-  - pip install -r envs/requirements-travis.txt
+  - pip install -r envs/pip-requirements-travis.txt
 
 # before_script:
 
diff --git a/envs/requirements-travis.txt b/envs/pip-requirements-travis.txt
similarity index 100%
rename from envs/requirements-travis.txt
rename to envs/pip-requirements-travis.txt
diff --git a/envs/requirements-tigergpu.yaml b/envs/requirements-linux-64-gpu.yaml
similarity index 100%
rename from envs/requirements-tigergpu.yaml
rename to envs/requirements-linux-64-gpu.yaml

From 4a6ceaa7ebb5c865faebdc276a154ae4ccada502 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Mon, 16 Dec 2019 17:41:43 -0500
Subject: [PATCH 38/51] Install Cython on Traverse (unique req. for this
 PICSciE cluster)

---
 envs/requirements-traverse.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index 847317bb..efa9b4a1 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -5,6 +5,7 @@ channels:
 # channel_priority: strict   # set in .condarc
 dependencies:
   - python==3.6.8
+  - cython
   - pip
   - scipy
   - pandas

From c522d3be7a53638676d87af97960219816ae6679 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Wed, 18 Dec 2019 22:57:58 -0600
Subject: [PATCH 39/51] Add tcn.py from @ge-dong's fork

Modified version of https://github.com/philipperemy/keras-tcn
---
 plasma/models/mpi_runner.py |   3 +-
 plasma/models/tcn.py        | 240 ++++++++++++++++++++++++++++++++++++
 2 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 plasma/models/tcn.py

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 273fd226..706c85e9 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -700,7 +700,8 @@ def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
     model = specific_builder.build_model(True)
     specific_builder.load_model_weights(model, custom_path)
 
-    # broadcast model weights then set it explicitely: fix for Py3.6
+    # broadcast model weights then set it explicitly: fix for Py3.6
+    # TODO(KGF): remove if we no longer support Py2
     if sys.version_info[0] > 2:
         if g.task_index == 0:
             new_weights = model.get_weights()
diff --git a/plasma/models/tcn.py b/plasma/models/tcn.py
new file mode 100644
index 00000000..78425edd
--- /dev/null
+++ b/plasma/models/tcn.py
@@ -0,0 +1,240 @@
+from typing import List, Tuple
+
+import keras.backend as K
+import keras.layers
+from keras import optimizers
+from keras.engine.topology import Layer
+from keras.layers import Activation, Lambda
+from keras.layers import Conv1D, SpatialDropout1D
+from keras.layers import Dense, BatchNormalization
+from keras.models import Input, Model
+
+
+def residual_block(x, dilation_rate, nb_filters, kernel_size, padding, activation='relu', dropout_rate=0,
+                   kernel_initializer='he_normal', use_batch_norm=False):
+    # type: (Layer, int, int, int, str, str, float, str, bool) -> Tuple[Layer, Layer]
+    """Defines the residual block for the WaveNet TCN
+
+    Args:
+        x: The previous layer in the model
+        dilation_rate: The dilation power of 2 we are using for this residual block
+        nb_filters: The number of convolutional filters to use in this block
+        kernel_size: The size of the convolutional kernel
+        padding: The padding used in the convolutional layers, 'same' or 'causal'.
+        activation: The final activation used in o = Activation(x + F(x))
+        dropout_rate: Float between 0 and 1. Fraction of the input units to drop.
+        kernel_initializer: Initializer for the kernel weights matrix (Conv1D).
+        use_batch_norm: Whether to use batch normalization in the residual layers or not.
+    Returns:
+        A tuple where the first element is the residual model layer, and the second
+        is the skip connection.
+    """
+    prev_x = x
+    for k in range(2):
+        x = Conv1D(filters=nb_filters,
+                   kernel_size=kernel_size,
+                   dilation_rate=dilation_rate,
+                   kernel_initializer=kernel_initializer,
+                   padding=padding)(x)
+        if use_batch_norm:
+            x = BatchNormalization()(x)  # TODO should be WeightNorm here, but using batchNorm instead
+        x = Activation('relu')(x)
+        x = SpatialDropout1D(rate=dropout_rate)(x)
+
+    # 1x1 conv to match the shapes (channel dimension).
+    prev_x = Conv1D(nb_filters, 1, padding='same')(prev_x)
+    res_x = keras.layers.add([prev_x, x])
+    res_x = Activation(activation)(res_x)
+    return res_x, x
+
+
+def process_dilations(dilations):
+    def is_power_of_two(num):
+        return num != 0 and ((num & (num - 1)) == 0)
+
+    if all([is_power_of_two(i) for i in dilations]):
+        return dilations
+
+    else:
+        new_dilations = [2 ** i for i in dilations]
+        return new_dilations
+
+
+class TCN:
+    """Creates a TCN layer.
+
+        Input shape:
+            A tensor of shape (batch_size, timesteps, input_dim).
+
+        Args:
+            nb_filters: The number of filters to use in the convolutional layers.
+            kernel_size: The size of the kernel to use in each convolutional layer.
+            dilations: The list of the dilations. Example is: [1, 2, 4, 8, 16, 32, 64].
+            nb_stacks : The number of stacks of residual blocks to use.
+            padding: The padding to use in the convolutional layers, 'causal' or 'same'.
+            use_skip_connections: Boolean. If we want to add skip connections from input to each residual block.
+            return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence.
+            activation: The activation used in the residual blocks o = Activation(x + F(x)).
+            dropout_rate: Float between 0 and 1. Fraction of the input units to drop.
+            name: Name of the model. Useful when having multiple TCN.
+            kernel_initializer: Initializer for the kernel weights matrix (Conv1D).
+            use_batch_norm: Whether to use batch normalization in the residual layers or not.
+
+        Returns:
+            A TCN layer.
+        """
+
+    def __init__(self,
+                 nb_filters=25,
+                 kernel_size=5,
+                 nb_stacks=1,
+                 num_layers=10,#[1, 2, 4, 8, 16, 32,64,128,256,512],
+                 padding='causal',
+                 use_skip_connections=True,
+                 dropout_rate=0.0,
+                 return_sequences=True,
+                 activation='linear',
+                 name='tcn',
+                 kernel_initializer='he_normal',
+                 use_batch_norm=False):
+        dilations=[2**i for i in range(0,num_layers)]
+        self.name = name
+        self.return_sequences = return_sequences
+        self.dropout_rate = dropout_rate
+        self.use_skip_connections = use_skip_connections
+        self.dilations = dilations
+        self.nb_stacks = nb_stacks
+        self.kernel_size = kernel_size
+        self.nb_filters = nb_filters
+        self.activation = activation
+        self.padding = padding
+        self.kernel_initializer = kernel_initializer
+        self.use_batch_norm = use_batch_norm
+
+        if padding != 'causal' and padding != 'same':
+            raise ValueError("Only 'causal' or 'same' padding are compatible for this layer.")
+
+        if not isinstance(nb_filters, int):
+            print('An interface change occurred after the version 2.1.2.')
+            print('Before: tcn.TCN(x, return_sequences=False, ...)')
+            print('Now should be: tcn.TCN(return_sequences=False, ...)(x)')
+            print('The alternative is to downgrade to 2.1.2 (pip install keras-tcn==2.1.2).')
+            raise Exception()
+
+    def __call__(self, inputs):
+        x = inputs
+        # 1D FCN.
+        x = Conv1D(self.nb_filters, 1, padding=self.padding, kernel_initializer=self.kernel_initializer)(x)
+        skip_connections = []
+        for s in range(self.nb_stacks):
+            for d in self.dilations:
+                x, skip_out = residual_block(x,
+                                             dilation_rate=d,
+                                             nb_filters=self.nb_filters,
+                                             kernel_size=self.kernel_size,
+                                             padding=self.padding,
+                                             activation=self.activation,
+                                             dropout_rate=self.dropout_rate,
+                                             kernel_initializer=self.kernel_initializer,
+                                             use_batch_norm=self.use_batch_norm)
+                skip_connections.append(skip_out)
+        if self.use_skip_connections:
+            x = keras.layers.add(skip_connections)
+        if not self.return_sequences:
+            x = Lambda(lambda tt: tt[:, -1, :])(x)
+        return x
+
+
+def compiled_tcn(num_feat,  # type: int
+                 num_classes,  # type: int
+                 nb_filters,  # type: int
+                 kernel_size,  # type: int
+                 dilations,  # type: List[int]
+                 nb_stacks,  # type: int
+                 max_len,  # type: int
+                 padding='causal',  # type: str
+                 use_skip_connections=True,  # type: bool
+                 return_sequences=True,
+                 regression=False,  # type: bool
+                 dropout_rate=0.05,  # type: float
+                 name='tcn',  # type: str,
+                 kernel_initializer='he_normal',  # type: str,
+                 activation='linear',  # type:str,
+                 opt='adam',
+                 lr=0.002,
+                 use_batch_norm=False):
+    # type: (...) -> keras.Model
+    """Creates a compiled TCN model for a given task (i.e. regression or classification).
+    Classification uses a sparse categorical loss. Please input class ids and not one-hot encodings.
+
+    Args:
+        num_feat: The number of features of your input, i.e. the last dimension of: (batch_size, timesteps, input_dim).
+        num_classes: The size of the final dense layer, how many classes we are predicting.
+        nb_filters: The number of filters to use in the convolutional layers.
+        kernel_size: The size of the kernel to use in each convolutional layer.
+        dilations: The list of the dilations. Example is: [1, 2, 4, 8, 16, 32, 64].
+        nb_stacks : The number of stacks of residual blocks to use.
+        max_len: The maximum sequence length, use None if the sequence length is dynamic.
+        padding: The padding to use in the convolutional layers.
+        use_skip_connections: Boolean. If we want to add skip connections from input to each residual block.
+        return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence.
+        regression: Whether the output should be continuous or discrete.
+        dropout_rate: Float between 0 and 1. Fraction of the input units to drop.
+        activation: The activation used in the residual blocks o = Activation(x + F(x)).
+        name: Name of the model. Useful when having multiple TCN.
+        kernel_initializer: Initializer for the kernel weights matrix (Conv1D).
+        opt: Optimizer name.
+        lr: Learning rate.
+        use_batch_norm: Whether to use batch normalization in the residual layers or not.
+    Returns:
+        A compiled keras TCN.
+    """
+
+    dilations = process_dilations(dilations)
+
+    input_layer = Input(shape=(max_len, num_feat))
+
+    x = TCN(nb_filters, kernel_size, nb_stacks, dilations, padding,
+            use_skip_connections, dropout_rate, return_sequences,
+            activation, name, kernel_initializer, use_batch_norm)(input_layer)
+
+    print('x.shape=', x.shape)
+
+    def get_opt():
+        if opt == 'adam':
+            return optimizers.Adam(lr=lr, clipnorm=1.)
+        elif opt == 'rmsprop':
+            return optimizers.RMSprop(lr=lr, clipnorm=1.)
+        else:
+            raise Exception('Only Adam and RMSProp are available here')
+
+    if not regression:
+        # classification
+        x = Dense(num_classes)(x)
+        x = Activation('softmax')(x)
+        output_layer = x
+        model = Model(input_layer, output_layer)
+
+        # https://github.com/keras-team/keras/pull/11373
+        # It's now in Keras@master but still not available with pip.
+        # TODO remove later.
+        def accuracy(y_true, y_pred):
+            # reshape in case it's in shape (num_samples, 1) instead of (num_samples,)
+            if K.ndim(y_true) == K.ndim(y_pred):
+                y_true = K.squeeze(y_true, -1)
+            # convert dense predictions to labels
+            y_pred_labels = K.argmax(y_pred, axis=-1)
+            y_pred_labels = K.cast(y_pred_labels, K.floatx())
+            return K.cast(K.equal(y_true, y_pred_labels), K.floatx())
+
+        model.compile(get_opt(), loss='sparse_categorical_crossentropy', metrics=[accuracy])
+    else:
+        # regression
+        x = Dense(1)(x)
+        x = Activation('linear')(x)
+        output_layer = x
+        model = Model(input_layer, output_layer)
+        model.compile(get_opt(), loss='mean_squared_error')
+    print('model.x = {}'.format(input_layer.shape))
+    print('model.y = {}'.format(output_layer.shape))
+    return model

From ae84a222f769b73b91de1256c62e7153436dddda Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Thu, 19 Dec 2019 11:05:01 -0600
Subject: [PATCH 40/51] Fix PEP8 style errors

---
 plasma/models/targets.py  | 11 ++++----
 plasma/models/tcn.py      | 55 +++++++++++++++++----------------------
 plasma/primitives/data.py |  2 +-
 setup.cfg                 |  2 ++
 4 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/plasma/models/targets.py b/plasma/models/targets.py
index 567fb43b..3db4fead 100644
--- a/plasma/models/targets.py
+++ b/plasma/models/targets.py
@@ -150,7 +150,9 @@ def loss(y_true, y_pred):
     @staticmethod
     def loss_np(y_true, y_pred):
         from plasma.conf import conf
-        fac = MaxHingeTarget.fac
+        # TODO(KGF): fac = positive_example_penalty is only used in this class,
+        # only in above loss() fn, which only this class has (besides loss_np)
+        # fac = MaxHingeTarget.fac
         # print(y_pred.shape)
         overall_fac = np.prod(np.array(y_pred.shape).astype(np.float32))
         max_val = np.max(y_pred, axis=-2)  # temporal axis!
@@ -160,10 +162,8 @@ def loss_np(y_true, y_pred):
         mask = np.equal(max_val, y_pred)
         mask = mask.astype(np.float32)
         y_pred = mask * y_pred + (1-mask) * y_true
-        weight_mask = np.greater(
-            y_true, 0.0).astype(
-            np.float32)  # positive label!
-        weight_mask = fac*weight_mask + (1 - weight_mask)
+        # positive label! weight_mask = fac*weight_mask + (1 - weight_mask):
+        weight_mask = np.greater(y_true, 0.0).astype(np.float32)
         # return np.mean(
         #  weight_mask*np.square(np.maximum(1. - y_true * y_pred, 0.)))
         # , axis=-1)
@@ -196,7 +196,6 @@ def threshold_range(T_warning):
 
 class HingeTarget(Target):
     activation = 'linear'
-
     loss = 'hinge'  # hinge
 
     @staticmethod
diff --git a/plasma/models/tcn.py b/plasma/models/tcn.py
index 78425edd..9e9355b2 100644
--- a/plasma/models/tcn.py
+++ b/plasma/models/tcn.py
@@ -1,5 +1,4 @@
 from typing import List, Tuple
-
 import keras.backend as K
 import keras.layers
 from keras import optimizers
@@ -10,7 +9,8 @@
 from keras.models import Input, Model
 
 
-def residual_block(x, dilation_rate, nb_filters, kernel_size, padding, activation='relu', dropout_rate=0,
+def residual_block(x, dilation_rate, nb_filters, kernel_size, padding,
+                   activation='relu', dropout_rate=0,
                    kernel_initializer='he_normal', use_batch_norm=False):
     # type: (Layer, int, int, int, str, str, float, str, bool) -> Tuple[Layer, Layer]
     """Defines the residual block for the WaveNet TCN
@@ -37,7 +37,8 @@ def residual_block(x, dilation_rate, nb_filters, kernel_size, padding, activatio
                    kernel_initializer=kernel_initializer,
                    padding=padding)(x)
         if use_batch_norm:
-            x = BatchNormalization()(x)  # TODO should be WeightNorm here, but using batchNorm instead
+            # TODO should be WeightNorm here, but using batchNorm instead
+            x = BatchNormalization()(x)
         x = Activation('relu')(x)
         x = SpatialDropout1D(rate=dropout_rate)(x)
 
@@ -84,20 +85,12 @@ class TCN:
             A TCN layer.
         """
 
-    def __init__(self,
-                 nb_filters=25,
-                 kernel_size=5,
-                 nb_stacks=1,
-                 num_layers=10,#[1, 2, 4, 8, 16, 32,64,128,256,512],
-                 padding='causal',
-                 use_skip_connections=True,
-                 dropout_rate=0.0,
-                 return_sequences=True,
-                 activation='linear',
-                 name='tcn',
-                 kernel_initializer='he_normal',
-                 use_batch_norm=False):
-        dilations=[2**i for i in range(0,num_layers)]
+    def __init__(self, nb_filters=25, kernel_size=5, nb_stacks=1,
+                 num_layers=10,  # [1, 2, 4, 8, 16, 32,64,128,256,512],
+                 padding='causal', use_skip_connections=True, dropout_rate=0.0,
+                 return_sequences=True, activation='linear', name='tcn',
+                 kernel_initializer='he_normal', use_batch_norm=False):
+        dilations = [2**i for i in range(0, num_layers)]
         self.name = name
         self.return_sequences = return_sequences
         self.dropout_rate = dropout_rate
@@ -112,31 +105,29 @@ def __init__(self,
         self.use_batch_norm = use_batch_norm
 
         if padding != 'causal' and padding != 'same':
-            raise ValueError("Only 'causal' or 'same' padding are compatible for this layer.")
+            raise ValueError("Only 'causal' or 'same' padding are compatible for this layer.")  # noqa
 
         if not isinstance(nb_filters, int):
             print('An interface change occurred after the version 2.1.2.')
             print('Before: tcn.TCN(x, return_sequences=False, ...)')
             print('Now should be: tcn.TCN(return_sequences=False, ...)(x)')
-            print('The alternative is to downgrade to 2.1.2 (pip install keras-tcn==2.1.2).')
+            print('The alternative is to downgrade to 2.1.2 (pip install keras-tcn==2.1.2).')  # noqa
             raise Exception()
 
     def __call__(self, inputs):
         x = inputs
         # 1D FCN.
-        x = Conv1D(self.nb_filters, 1, padding=self.padding, kernel_initializer=self.kernel_initializer)(x)
+        x = Conv1D(self.nb_filters, 1, padding=self.padding,
+                   kernel_initializer=self.kernel_initializer)(x)
         skip_connections = []
         for s in range(self.nb_stacks):
             for d in self.dilations:
-                x, skip_out = residual_block(x,
-                                             dilation_rate=d,
-                                             nb_filters=self.nb_filters,
-                                             kernel_size=self.kernel_size,
-                                             padding=self.padding,
-                                             activation=self.activation,
-                                             dropout_rate=self.dropout_rate,
-                                             kernel_initializer=self.kernel_initializer,
-                                             use_batch_norm=self.use_batch_norm)
+                x, skip_out = residual_block(
+                    x, dilation_rate=d, nb_filters=self.nb_filters,
+                    kernel_size=self.kernel_size, padding=self.padding,
+                    activation=self.activation, dropout_rate=self.dropout_rate,
+                    kernel_initializer=self.kernel_initializer,
+                    use_batch_norm=self.use_batch_norm)
                 skip_connections.append(skip_out)
         if self.use_skip_connections:
             x = keras.layers.add(skip_connections)
@@ -219,7 +210,8 @@ def get_opt():
         # It's now in Keras@master but still not available with pip.
         # TODO remove later.
         def accuracy(y_true, y_pred):
-            # reshape in case it's in shape (num_samples, 1) instead of (num_samples,)
+            # reshape in case it's in shape (num_samples, 1) instead of
+            # (num_samples,)
             if K.ndim(y_true) == K.ndim(y_pred):
                 y_true = K.squeeze(y_true, -1)
             # convert dense predictions to labels
@@ -227,7 +219,8 @@ def accuracy(y_true, y_pred):
             y_pred_labels = K.cast(y_pred_labels, K.floatx())
             return K.cast(K.equal(y_true, y_pred_labels), K.floatx())
 
-        model.compile(get_opt(), loss='sparse_categorical_crossentropy', metrics=[accuracy])
+        model.compile(get_opt(), loss='sparse_categorical_crossentropy',
+                      metrics=[accuracy])
     else:
         # regression
         x = Dense(1)(x)
diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py
index d1b3e6c3..91a2aed0 100644
--- a/plasma/primitives/data.py
+++ b/plasma/primitives/data.py
@@ -109,7 +109,7 @@ def load_data(self, prepath, shot, dtype='float32'):
             first_idx = region[0]
             last_idx = region[-1]
             # add 50 ms to cover possible disruption event
-            last_time = t[last_idx]+5e-2
+            last_time = t[last_idx] + 5e-2
             last_indices = np.where(t > last_time)[0]
             if len(last_indices) == 0:
                 last_idx = -1
diff --git a/setup.cfg b/setup.cfg
index db3e245d..32fe63a5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -27,5 +27,7 @@ ignore =
        # W503: line break before binary operator (use mutually exclusive W504)
        W503
 # suppress linter warning about MPI init fn call before module-level imports
+# and long comment lines in externally-written tcn.py
 per-file-ignores =
        mpi_learn.py:E402
+       tcn.py:E501       

From 6add7f50fbc451d5bd7a8cf4b16fcb4f3054bd70 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Thu, 19 Dec 2019 16:34:47 -0600
Subject: [PATCH 41/51] Make keras2onnx, onnx optional modules

---
 plasma/models/builder.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index 96ee84f5..ac2db521 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -27,9 +27,16 @@
 from plasma.utils.downloading import makedirs_process_safe
 from plasma.utils.hashing import general_object_hash
 from plasma.models.tcn import TCN
-# TODO(KGF): perhaps relax the requirement of thse dependencies with try/except
-import keras2onnx
-import onnx
+# TODO(KGF): consider using importlib.util.find_spec() instead (Py>3.4)
+try:
+    import keras2onnx
+    import onnx
+except ImportError:  # as e:
+    _has_onnx = False
+    # onnx = None
+    # keras2onnx = None
+else:
+    _has_onnx = True
 
 # Synchronize 2x stderr msg from TensorFlow initialization via Keras backend
 # "Succesfully opened dynamic library... libcudart" "Using TensorFlow backend."
@@ -355,14 +362,15 @@ def build_train_test_models(self):
     def save_model_weights(self, model, epoch):
         save_path = self.get_save_path(epoch)
         model.save_weights(save_path, overwrite=True)
-        try:
+        # try:
+        if _has_onnx:
             save_path = self.get_save_path(epoch, ext='onnx')
             onnx_model = keras2onnx.convert_keras(model, model.name,
                                                   target_opset=10)
             onnx.save_model(onnx_model, save_path)
-        except Exception as e:
-            print(e)
-            return
+        # except Exception as e:
+        #     print(e)
+        return
 
     def delete_model_weights(self, model, epoch):
         save_path = self.get_save_path(epoch)

From 458a7653135eccf5a3929f568031abbd9cde2f11 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 2 Jan 2020 16:08:31 -0500
Subject: [PATCH 42/51] Restrict Keras to <v2.3.0

Both Keras v2.3.0 and v2.3.1 on Traverse (and at least the latter on
TigerGPU) die with:

WARNING:tensorflow:From
/home/kfelker/.conda/envs/frnn/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630:
calling BaseReso\
urceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops)
with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Printing out pre_rnn model...
Traceback (most recent call last):
  File "mpi_learn.py", line 111, in <module>
      shot_list_test=shot_list_test)

  File
  "/home/kfelker/.conda/envs/frnn/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py",
  line 1229, in __imul__
      raise RuntimeError("Variable *= value not supported. Use "
      RuntimeError: Variable *= value not supported. Use
      `var.assign(var * value)` to modify the variable or `var = var *
      value` to get a new Tensor object.

Incompatibility likely fixed in TF >= v2.0 and/or TF's internal Keras
https://github.com/tensorflow/tensorflow/issues/27829

Re-check this after moving to TensorFlow's internal Keras in #43
---
 envs/requirements-linux-64-gpu.yaml | 2 +-
 envs/requirements-traverse.yaml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/envs/requirements-linux-64-gpu.yaml b/envs/requirements-linux-64-gpu.yaml
index a78e5aef..4b23c827 100644
--- a/envs/requirements-linux-64-gpu.yaml
+++ b/envs/requirements-linux-64-gpu.yaml
@@ -13,7 +13,7 @@ dependencies:
   - pyyaml
   - tensorflow-gpu>=1.3,<2.0.0
   - pip:
-      - keras>=2.0.5
+      - keras>=2.0.5,<2.3.0
       - pathos
       - matplotlib>=2.0.2
       - hyperopt  # TODO(KGF): remove
diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index efa9b4a1..6fda2884 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -15,7 +15,7 @@ dependencies:
   - pyyaml
   - tensorflow-gpu>=1.3,<2.0.0
   - pip:
-      - keras>=2.0.5
+      - keras>=2.0.5,<2.3.0
       - pathos
       - matplotlib>=2.0.2
       - hyperopt  # TODO(KGF): remove

From 9b58f638a30565fb523a40eabc06cbfcab6b6c5a Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 2 Jan 2020 16:21:14 -0500
Subject: [PATCH 43/51] Loosen precise Python version requirement in Conda YAML
 files

Add conda-forge to channels above Anaconda Cloud defaults

Need to reevaluate these choices later on
---
 envs/requirements-linux-64-gpu.yaml | 3 ++-
 envs/requirements-traverse.yaml     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/envs/requirements-linux-64-gpu.yaml b/envs/requirements-linux-64-gpu.yaml
index 4b23c827..21bdcf2d 100644
--- a/envs/requirements-linux-64-gpu.yaml
+++ b/envs/requirements-linux-64-gpu.yaml
@@ -1,9 +1,10 @@
 name: frnn
 channels:
+  - conda-forge
   - defaults
 #channel_priority: strict
 dependencies:
-  - python==3.6.8
+  - python>=3.6.8
   - pip
   - scipy
   - pandas
diff --git a/envs/requirements-traverse.yaml b/envs/requirements-traverse.yaml
index 6fda2884..a305ea17 100644
--- a/envs/requirements-traverse.yaml
+++ b/envs/requirements-traverse.yaml
@@ -1,10 +1,11 @@
 name: frnn
 channels:
   - https://public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda
+  - conda-forge
   - defaults
 # channel_priority: strict   # set in .condarc
 dependencies:
-  - python==3.6.8
+  - python>=3.6.8
   - cython
   - pip
   - scipy

From 46afd737723e3e140fbbc540ec3fef1890b15c9e Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 2 Jan 2020 16:34:01 -0500
Subject: [PATCH 44/51] Remove version info from dependency list in setup.py

---
 setup.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index da722f07..46b365fe 100644
--- a/setup.py
+++ b/setup.py
@@ -19,22 +19,26 @@
       long_description="""Add description here""",
       author="Julian Kates-Harbeck, Alexey Svyatkovskiy",
       author_email="jkatesharbeck@g.harvard.edu",
-      maintainer="Alexey Svyatkovskiy",
-      maintainer_email="alexeys@princeton.edu",
+      maintainer="Kyle Gerard Felker",
+      maintainer_email="felker@anl.gov",
       # url = "http://",
       download_url="https://github.com/PPPLDeepLearning/plasma-python",
       # license = "Apache Software License v2",
       test_suite="tests",
+      # TODO(KGF): continue specifying "mininmum reqs" of deps w/o any version
+      # info in this file in conjunction with specific reqs in Conda YAML?
       install_requires=[
-          'keras>=2.0.5',
+          'keras',
           'pathos',
-          'matplotlib>=2.0.2',
+          'matplotlib',
           'hyperopt',
           'mpi4py',
           'xgboost',
           'scikit-learn',
           'joblib',
           ],
+      # TODO(KGF): add optional feature specs for [deephyper,balsam,
+      # readthedocs,onnx,keras2onnx]
       tests_require=[],
       classifiers=["Development Status :: 3 - Alpha",
                    "Environment :: Console",

From 057535f738675382709e0741283ece2dc36d7597 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Fri, 3 Jan 2020 14:31:08 -0600
Subject: [PATCH 45/51] Add docstrings to cumulative moving Averager()

Use "sync", not "synch", for "synchronization" abbreviation
---
 plasma/models/mpi_runner.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 706c85e9..5aead668 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -167,7 +167,7 @@ def get_deltas(self, raw_deltas):
             self.m_list = [np.zeros_like(grad) for grad in raw_deltas]
             self.v_list = [np.zeros_like(grad) for grad in raw_deltas]
         t = self.iterations + 1
-        lr_t = self.lr * np.sqrt(1-self.beta_2**t)/(1-self.beta_1**t)
+        lr_t = self.lr * np.sqrt(1 - self.beta_2**t)/(1 - self.beta_1**t)
         deltas = []
         for (i, grad) in enumerate(raw_deltas):
             m_t = (self.beta_1 * self.m_list[i]) + (1-self.beta_1) * grad
@@ -182,16 +182,20 @@ def get_deltas(self, raw_deltas):
 
 
 class Averager(object):
+    """Compute and store a cumulative moving average (CMA).
+
+    """
+
     def __init__(self):
         self.steps = 0
-        self.val = 0.0
+        self.cma = 0.0
 
-    def add_val(self, val):
-        self.val = (self.steps * self.val + 1.0 * val)/(self.steps + 1.0)
+    def add_val(self, new_val):
+        self.cma = (self.steps * self.cma + 1.0 * new_val)/(self.steps + 1.0)
         self.steps += 1
 
-    def get_val(self):
-        return self.val
+    def get_ave(self):
+        return self.cma
 
 
 class MPIModel():
@@ -432,7 +436,7 @@ def build_callbacks(self, conf, callbacks_list):
         val_loss this should be min, etc. In auto mode, the direction is
         automatically inferred from the name of the monitored quantity.
 
-        -monitor: Quantity used for early stopping, has to
+        - monitor: Quantity used for early stopping, has to
         be from the list of metrics
 
         - patience: Number of epochs used to decide on whether to apply early
@@ -579,7 +583,7 @@ def train_epoch(self):
                 curr_loss = self.mpi_average_scalars(1.0*loss, num_replicas)
                 # g.print_unique(self.model.get_weights()[0][0][:4])
                 loss_averager.add_val(curr_loss)
-                ave_loss = loss_averager.get_val()
+                ave_loss = loss_averager.get_ave()
                 eta = self.estimate_remaining_time(
                     t0 - t_start, self.num_so_far - self.epoch*num_total,
                     num_total)
@@ -641,7 +645,7 @@ def calculate_speed(self, t0, t_after_deltas, t_after_update, num_replicas,
 
         print_str = ('{:.2E} Examples/sec | {:.2E} sec/batch '.format(
             examples_per_sec, t_tot)
-                     + '[{:.1%} calc., {:.1%} synch.]'.format(
+                     + '[{:.1%} calc., {:.1%} sync.]'.format(
                          frac_calculate, frac_sync))
         print_str += '[batch = {} = {}*{}] [lr = {:.2E} = {:.2E}*{}]'.format(
             effective_batch_size, self.batch_size, num_replicas,

From 856c14709dbeedc1f11d0eb0ffa61036c9794662 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 6 Jan 2020 09:40:25 -0600
Subject: [PATCH 46/51] Comment-out pre-RNN diagnostics

Added by @ge-dong in #49 (refactored in #50). It is the only
MPI-specific code within builder.py
---
 plasma/models/builder.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/plasma/models/builder.py b/plasma/models/builder.py
index ac2db521..95307efe 100644
--- a/plasma/models/builder.py
+++ b/plasma/models/builder.py
@@ -276,17 +276,18 @@ def slicer_output_shape(input_shape, indices):
                 activity_regularizer=l2(dense_regularization))(pre_rnn)
 
         pre_rnn_model = Model(inputs=pre_rnn_input, outputs=pre_rnn)
-        from mpi4py import MPI
-        comm = MPI.COMM_WORLD
-        task_index = comm.Get_rank()
-        if not predict and task_index == 0:
-            print('Printing out pre_rnn model...')
-            fr = open('model_architecture.log', 'w')
-            ori = sys.stdout
-            sys.stdout = fr
-            pre_rnn_model.summary()
-            sys.stdout = ori
-            fr.close()
+        # TODO(KGF): uncomment following lines to get summary of pre-RNN model
+        # from mpi4py import MPI
+        # comm = MPI.COMM_WORLD
+        # task_index = comm.Get_rank()
+        # if not predict and task_index == 0:
+        #     print('Printing out pre_rnn model...')
+        #     fr = open('model_architecture.log', 'w')
+        #     ori = sys.stdout
+        #     sys.stdout = fr
+        #     pre_rnn_model.summary()
+        #     sys.stdout = ori
+        #     fr.close()
         # pre_rnn_model.summary()
         x_input = Input(batch_shape=batch_input_shape)
         # TODO(KGF): Ge moved this inside a new conditional in Dec 2019. check

From 1867bd2b4b2ab9f830318da45eaba2f057afb06b Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 6 Jan 2020 13:24:32 -0600
Subject: [PATCH 47/51] Fix typo

---
 plasma/conf_parser.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/plasma/conf_parser.py b/plasma/conf_parser.py
index 59923e56..a3d879b6 100644
--- a/plasma/conf_parser.py
+++ b/plasma/conf_parser.py
@@ -98,7 +98,8 @@ def parameters(input_file):
         elif params['target'] == 'ttdlinear':
             params['data']['target'] = TTDLinearTarget
         else:
-            g.print_unique('Unkown type of target. Exiting')
+            # TODO(KGF): "Target" base class is unused here
+            g.print_unique('Unknown type of target. Exiting')
             exit(1)
 
         # params['model']['output_activation'] =

From f292e9ecaae0e2639a9b92d09ecba5ae1af6d5dc Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 6 Jan 2020 13:35:57 -0600
Subject: [PATCH 48/51] Deprecate and remove old documentation

---
 docs/ALCF.md               |  19 +++++++++++++++++++
 docs/ANL_Theta.md          |  21 ---------------------
 docs/PrincetonUTutorial.md |   2 +-
 docs/Targets.md            |  12 ------------
 docs/{ => images}/tb.png   | Bin
 docs/{ => old}/Model.md    |   0
 docs/{ => old}/Titan.md    |   0
 7 files changed, 20 insertions(+), 34 deletions(-)
 delete mode 100644 docs/ANL_Theta.md
 delete mode 100644 docs/Targets.md
 rename docs/{ => images}/tb.png (100%)
 rename docs/{ => old}/Model.md (100%)
 rename docs/{ => old}/Titan.md (100%)

diff --git a/docs/ALCF.md b/docs/ALCF.md
index 63877423..828e4aae 100644
--- a/docs/ALCF.md
+++ b/docs/ALCF.md
@@ -362,5 +362,24 @@ sudo singularity build centos7-cuda-tf1.12.0-plasma.simg docker://rjzamora/cento
 
 
 
+# First time setup on Theta (Fall 2017)
 
+```bash
+mkdir PPPL
+cd PPPL/
+git clone https://github.com/PPPLDeepLearning/plasma-python
 
+wget https://repo.continuum.io/archive/Anaconda3-4.4.0-Linux-x86_64.sh
+sh Anaconda3-4.4.0-Linux-x86_64.sh
+cd plasma-python/
+
+conda create --name PPPL_dev --file=requirements-travis.txt
+#~/.bashrc
+export PATH="/home/alexeys/anaconda3/bin:$PATH"
+conda create --name PPPL_dev --file=requirements-travis.txt
+source activate PPPL_dev
+
+python setup.py install
+module load PrgEnv-intel/6.0.4
+#which mpicc
+env MPICC=/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpicc pip install --user mpi4py
diff --git a/docs/ANL_Theta.md b/docs/ANL_Theta.md
deleted file mode 100644
index abc2f2eb..00000000
--- a/docs/ANL_Theta.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# First time setup on Theta, Argonne
-
-```bash
-mkdir PPPL
-cd PPPL/
-git clone https://github.com/PPPLDeepLearning/plasma-python
-
-wget https://repo.continuum.io/archive/Anaconda3-4.4.0-Linux-x86_64.sh
-sh Anaconda3-4.4.0-Linux-x86_64.sh 
-PPPL/plasma-python/
-
-conda create --name PPPL_dev --file=requirements-travis.txt 
-#~/.bashrc
-export PATH="/home/alexeys/anaconda3/bin:$PATH"
-conda create --name PPPL_dev --file=requirements-travis.txt 
-source activate PPPL_dev
-
-python setup.py install
-module load PrgEnv-intel/6.0.4
-#which mpicc
-env MPICC=/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpicc pip install --user mpi4py
diff --git a/docs/PrincetonUTutorial.md b/docs/PrincetonUTutorial.md
index 6493b7a1..5a4a2954 100644
--- a/docs/PrincetonUTutorial.md
+++ b/docs/PrincetonUTutorial.md
@@ -245,7 +245,7 @@ A URL should be emitted to the console output. Navigate to this link in your bro
 
 You should see something like:
 
-![tensorboard example](https://github.com/PPPLDeepLearning/plasma-python/blob/master/docs/tb.png)
+![tensorboard example](https://github.com/PPPLDeepLearning/plasma-python/blob/master/docs/images/tb.png)
 
 When you are finished with analyzing the summaries in TensorBoard, you may wish to unmount the remote filesystem:
 ```
diff --git a/docs/Targets.md b/docs/Targets.md
deleted file mode 100644
index 75c83917..00000000
--- a/docs/Targets.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# Understanding targets
-
-An abstract base class implemented using Python ABC library and  a set of classes derived from it.
-
-## Data members
-
-activation and loss, type string
-
-
-## Static methods
-
-remapper and threshold_range
diff --git a/docs/tb.png b/docs/images/tb.png
similarity index 100%
rename from docs/tb.png
rename to docs/images/tb.png
diff --git a/docs/Model.md b/docs/old/Model.md
similarity index 100%
rename from docs/Model.md
rename to docs/old/Model.md
diff --git a/docs/Titan.md b/docs/old/Titan.md
similarity index 100%
rename from docs/Titan.md
rename to docs/old/Titan.md

From 63b8d2d7a13fb5911ebd600eab43620655ad65ba Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 6 Jan 2020 13:51:47 -0600
Subject: [PATCH 49/51] Note unused functions in targets.py

See #54
---
 plasma/models/mpi_runner.py | 14 ++++++++------
 plasma/models/targets.py    |  6 ++++--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/plasma/models/mpi_runner.py b/plasma/models/mpi_runner.py
index 5aead668..51301d2c 100644
--- a/plasma/models/mpi_runner.py
+++ b/plasma/models/mpi_runner.py
@@ -485,12 +485,14 @@ def add_noise(self, X):
 
     def train_epoch(self):
         '''
-        The purpose of the method is to perform distributed mini-batch SGD for
+        Perform distributed mini-batch SGD for
         one epoch.  It takes the batch iterator function and a NN model from
         MPIModel object, fetches mini-batches in a while-loop until number of
         samples seen by the ensemble of workers (num_so_far) exceeds the
         training dataset size (num_total).
 
+        NOTE: "sample" = "an entire shot" within this description
+
         During each iteration, the gradient updates (deltas) and the loss are
         calculated for each model replica in the ensemble, weights are averaged
         over ensemble, and the new weights are set.
@@ -501,11 +503,11 @@ def train_epoch(self):
         Argument list: Empty
 
         Returns:
-          - step: epoch number
-          - ave_loss: training loss averaged over replicas
-          - curr_loss:
-          - num_so_far: the number of samples seen by ensemble of replicas to a
-        current epoch (step)
+          - step: final iteration number
+          - ave_loss: model loss averaged over iterations within this epoch
+          - curr_loss: training loss averaged over replicas at final iteration
+          - num_so_far: the cumulative number of samples seen by the ensemble
+        of replicas up to the end of the final iteration (step) of this epoch
 
         Intermediate outputs and logging: debug printout of task_index (MPI),
         epoch number, number of samples seen to a current epoch, average
diff --git a/plasma/models/targets.py b/plasma/models/targets.py
index 3db4fead..fe1351e4 100644
--- a/plasma/models/targets.py
+++ b/plasma/models/targets.py
@@ -130,6 +130,7 @@ class MaxHingeTarget(Target):
 
     @staticmethod
     def loss(y_true, y_pred):
+        # TODO(KGF): this function is unused and unique to this class
         from plasma.conf import conf
         fac = MaxHingeTarget.fac
         # overall_fac =
@@ -151,9 +152,10 @@ def loss(y_true, y_pred):
     def loss_np(y_true, y_pred):
         from plasma.conf import conf
         # TODO(KGF): fac = positive_example_penalty is only used in this class,
-        # only in above loss() fn, which only this class has (besides loss_np)
+        # only in above (unused) loss() fn, which only this class has, and is
+        # never called. 2 lines related to fac commented-out in this fn.
+        #
         # fac = MaxHingeTarget.fac
-        # print(y_pred.shape)
         overall_fac = np.prod(np.array(y_pred.shape).astype(np.float32))
         max_val = np.max(y_pred, axis=-2)  # temporal axis!
         max_val = np.reshape(

From 8a36a58f8b6228aedf2026bc16bd0317a4d28354 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 9 Jan 2020 11:39:29 -0500
Subject: [PATCH 50/51] Group "T_*" timing parameters close together in
 conf.yaml

---
 examples/conf.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/conf.yaml b/examples/conf.yaml
index 8ae41bba..edbabeeb 100644
--- a/examples/conf.yaml
+++ b/examples/conf.yaml
@@ -25,7 +25,6 @@ data:
   augmentation_mode: None
   augment_during_training: False
   cut_shot_ends: True
-  T_min_warn: 30
   recompute: False
   recompute_normalization: False
   # specifies which of the signals in the signals_dirs order contains the plasma current info
@@ -36,10 +35,11 @@ data:
   positive_example_penalty: 1.0 # by what factor to upweight positive examples?
   # normalization timescale
   dt: 0.001
+  T_min_warn: 30
   # maximum TTD considered
   T_max: 1000.0
   # warning time in seconds
-  # The shortest works best so far: less overfitting. log TTd prediction also works well. 0.5 better than 0.2
+  # The shortest works best so far: less overfitting. log(TTD) prediction also works well. 0.5s better than 0.2s
   T_warning: 1.024 # 0.512 # 0.25 # 1.0
   current_thresh: 750000
   current_end_thresh: 10000

From eb8f66484bec58ec3a1c13da4f503e5147ae8d3d Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <kfelker@princeton.edu>
Date: Thu, 9 Jan 2020 11:43:36 -0500
Subject: [PATCH 51/51] Fix bug: MaxHingeTarget was inheriting loss='mse' from
 parent Target

---
 plasma/models/targets.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/plasma/models/targets.py b/plasma/models/targets.py
index fe1351e4..fc6729f9 100644
--- a/plasma/models/targets.py
+++ b/plasma/models/targets.py
@@ -126,6 +126,7 @@ def threshold_range(T_warning):
 # time sequence is punished. Also implements class weighting
 class MaxHingeTarget(Target):
     activation = 'linear'
+    loss = 'hinge'
     fac = 1.0
 
     @staticmethod
@@ -198,7 +199,7 @@ def threshold_range(T_warning):
 
 class HingeTarget(Target):
     activation = 'linear'
-    loss = 'hinge'  # hinge
+    loss = 'hinge'
 
     @staticmethod
     def loss_np(y_true, y_pred):