diff --git a/.travis.yml b/.travis.yml index 31c7f70b..087fdb66 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,12 +34,12 @@ env: # before_install: install: - - sh install-mpi.sh + - sh ci/travis/install-mpi.sh - export MPI_PREFIX="${HOME}/opt/${MPI_LIBRARY}-${MPI_LIBRARY_VERSION}" - export PATH="${HOME}/.local/bin:${MPI_PREFIX}/bin${PATH:+":${PATH}"}" - export LD_LIBRARY_PATH="${MPI_PREFIX}/lib${LD_LIBRARY_PATH:+":${LD_LIBRARY_PATH}"}" - pip install --upgrade pip - - pip install -r requirements-travis.txt + - pip install -r envs/pip-requirements-travis.txt # before_script: @@ -54,6 +54,8 @@ stages: notifications: email: + recipients: + - felker@anl.gov on_success: change on_failure: always slack: diff --git a/jenkins-ci/jenkins.sh b/ci/jenkins/jenkins.sh similarity index 100% rename from jenkins-ci/jenkins.sh rename to ci/jenkins/jenkins.sh diff --git a/jenkins-ci/run_jenkins.py b/ci/jenkins/run_jenkins.py similarity index 100% rename from jenkins-ci/run_jenkins.py rename to ci/jenkins/run_jenkins.py diff --git a/jenkins-ci/validate_jenkins.py b/ci/jenkins/validate_jenkins.py similarity index 100% rename from jenkins-ci/validate_jenkins.py rename to ci/jenkins/validate_jenkins.py diff --git a/jenkins-ci/validate_jenkins.sh b/ci/jenkins/validate_jenkins.sh similarity index 100% rename from jenkins-ci/validate_jenkins.sh rename to ci/jenkins/validate_jenkins.sh diff --git a/install-mpi.sh b/ci/travis/install-mpi.sh similarity index 100% rename from install-mpi.sh rename to ci/travis/install-mpi.sh diff --git a/data/signals.py b/data/signals.py index 12e26e4b..18e4e521 100644 --- a/data/signals.py +++ b/data/signals.py @@ -85,12 +85,9 @@ def get_units(str): if found: if rank > 1: xdata = c.get('dim_of(_s,1)').data() - # xunits = get_units('dim_of(_s,1)') ydata = c.get('dim_of(_s)').data() - # yunits = get_units('dim_of(_s)') else: xdata = c.get('dim_of(_s)').data() - # xunits = get_units('dim_of(_s)') # MDSplus seems to return 2-D arrays transposed. Change them back. if np.ndim(data) == 2: @@ -406,6 +403,11 @@ def fetch_nstx_data(signal_path, shot_num, c): # 'tmamp1':tmamp1, 'tmamp2':tmamp2, 'tmfreq1':tmfreq1, 'tmfreq2':tmfreq2, # 'pechin':pechin, # 'rho_profile_spatial':rho_profile_spatial, 'etemp':etemp, + # ----- + # TODO(KGF): replace this hacky workaround + # IMPORTANT: must comment-out the following line when preprocessing for + # training on JET CW and testing on JET ILW (FRNN 0D). + # Otherwise 1K+ CW shots are excluded due to missing profile data 'etemp_profile': etemp_profile, 'edens_profile': edens_profile, # 'itemp_profile':itemp_profile, 'zdens_profile':zdens_profile, # 'trot_profile':trot_profile, 'pthm_profile':pthm_profile, diff --git a/docs/ALCF.md b/docs/ALCF.md new file mode 100644 index 00000000..828e4aae --- /dev/null +++ b/docs/ALCF.md @@ -0,0 +1,385 @@ +# ALCF Theta `plasma-python` FRNN Notes + +**Author: Rick Zamora (rzamora@anl.gov)** + +This document is intended to act as a tutorial for running the [plasma-python](https://github.com/PPPLDeepLearning/plasma-python) implementation of the Fusion recurrent neural network (FRNN) on the ALCF Theta supercomputer (Cray XC40; Intel KNL processors). The steps followed in these notes are based on the Princeton [Tiger-GPU tutorial](https://github.com/PPPLDeepLearning/plasma-python/blob/master/docs/PrincetonUTutorial.md#location-of-the-data-on-tigress), hosted within the main GitHub repository for the project. 
## Environment Setup

Choose a *root* directory for FRNN-related installations on Theta:

```
export FRNN_ROOT=<root-directory-path>
cd $FRNN_ROOT
```

*Personal Note: Using FRNN_ROOT=/home/zamora/ESP*

Create a simple directory structure allowing experimental *builds* of the `plasma-python` Python code/library:

```
mkdir build
mkdir build/miniconda-3.6-4.5.4
cd build/miniconda-3.6-4.5.4
```

### Custom Miniconda Environment Setup

Copy the Miniconda installation script to your working directory and run it:

```
cp /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/install_miniconda-3.6-4.5.4.sh .
./install_miniconda-3.6-4.5.4.sh
```

The `install_miniconda-3.6-4.5.4.sh` script will install `miniconda-4.5.4` (using `Python-3.6`), as well as `Tensorflow-1.12.0` and `Keras 2.2.4`.

Update your environment variables to use this Miniconda installation:

```
export PATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/bin:$PATH
export PYTHONPATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/lib/python3.6/site-packages/:$PYTHONPATH
```

Note that the previous lines (as well as the definition of `FRNN_ROOT`) can be appended to your `$HOME/.bashrc` file if you want to use this environment on Theta by default.

## Installing `plasma-python`

Here, we assume the installation is performed within the custom Miniconda environment set up in the previous section. We also assume the following commands have already been executed:

```
export FRNN_ROOT=<root-directory-path>
export PATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/bin:$PATH
export PYTHONPATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/lib/python3.6/site-packages/:$PYTHONPATH
```

*Personal Note: Using `export FRNN_ROOT=/lus/theta-fs0/projects/fusiondl_aesp/zamora/FRNN_project`*

If the environment is set up correctly, installation of `plasma-python` is straightforward:

```
cd ${FRNN_ROOT}/build/miniconda-3.6-4.5.4
git clone https://github.com/PPPLDeepLearning/plasma-python.git
cd plasma-python
python setup.py build
python setup.py install
```

## Data Access

Sample data and metadata are available in `/lus/theta-fs0/projects/FRNN/tigress/alexeys/signal_data` and `/lus/theta-fs0/projects/FRNN/tigress/alexeys/shot_lists`, respectively. It is recommended that users create their own symbolic links to these directories. I recommend doing this within a directory called `/lus/theta-fs0/projects/fusiondl_aesp/<username>/`. For example:

```
ln -s /lus/theta-fs0/projects/fusiondl_aesp/FRNN/tigress/alexeys/shot_lists /lus/theta-fs0/projects/fusiondl_aesp/<username>/shot_lists
ln -s /lus/theta-fs0/projects/fusiondl_aesp/FRNN/tigress/alexeys/signal_data /lus/theta-fs0/projects/fusiondl_aesp/<username>/signal_data
```

For the examples included in `plasma-python`, there is a configuration file that specifies the root directory of the raw data. Change the `fs_path: '/tigress'` line in `examples/conf.yaml` to the following:

```
fs_path: '/lus/theta-fs0/projects/fusiondl_aesp'
```

It is also a good idea to change `num_gpus: 4` to `num_gpus: 1`. I am also using the `jet_data_0D` dataset:

```
paths:
  data: jet_data_0D
```
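Before moving on to preprocessing, it is worth confirming that these edits are actually picked up. The following is a minimal sanity-check sketch; it assumes PyYAML is available and that the key names match the snippets above (`fs_path`, `num_gpus`, and `data` under `paths`), which may differ between versions of `conf.yaml`:

```python
# Re-read examples/conf.yaml and print the values edited above.
# Illustrative check only; adjust the file path and key names to your setup.
import yaml

with open("examples/conf.yaml") as f:
    conf = yaml.safe_load(f)

print(conf.get("fs_path"))                 # expect: /lus/theta-fs0/projects/fusiondl_aesp
print(conf.get("num_gpus"))                # expect: 1
print(conf.get("paths", {}).get("data"))   # expect: jet_data_0D
```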
### Data Preprocessing

#### The SLOW Way (On Theta)

Theta is KNL-based and is **not** the best resource for processing many text files in Python. However, the preprocessing step *can* be run on Theta using the following steps (although it may need to be repeated several times to get through the whole dataset within the 60-minute debug queue):

```
cd ${FRNN_ROOT}/build/miniconda-3.6-4.5.4/plasma-python/examples
cp /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/submit_guarantee_preprocessed.sh .
```

Modify the paths defined in `submit_guarantee_preprocessed.sh` to match your environment.

Note that the preprocessing module uses Pathos multiprocessing (not MPI/mpi4py). Therefore, the script will see every compute core (all 256 per node) as an available resource. Since the Lustre file system is unlikely to perform well with 256 processes (on the same node) opening/closing/creating files at once, it might improve performance to make a slight change to line 85 of `~/plasma-python/plasma/preprocessor/preprocess.py`, capping the number of worker processes:

```
line 85: use_cores = min( <max-processes>, max(1,mp.cpu_count()-2) )
```

Here `<max-processes>` is whatever upper bound you want to place on the number of processes hitting the file system at once. After optionally re-building and installing `plasma-python` with this change, submit the preprocessing job:

```
qsub submit_guarantee_preprocessed.sh
```

#### The FAST Way (On Cooley)

You will find it much less painful to preprocess the data on Cooley, because its Haswell processors are much better suited to this task. Log on to the ALCF Cooley machine:

```
ssh <username>@cooley.alcf.anl.gov
```

Copy my `cooley_preprocess` example directory to whatever directory you choose to work in:

```
cp -r /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/cooley_preprocess .
cd cooley_preprocess
```

This directory has a Singularity image with everything you need to run the code on Cooley. Assuming you have created symbolic links to the `shot_lists` and `signal_data` directories in `/lus/theta-fs0/projects/fusiondl_aesp/<username>/`, you can simply submit the included COBALT script (to specify the data you want to process, just modify the included `conf.yaml` file):

```
qsub submit.sh
```

For me, this finishes in less than 10 minutes, and creates 5523 `.npz` files in the `/lus/theta-fs0/projects/fusiondl_aesp/<username>/processed_shots/` directory. The output file of the COBALT submission ends with the following message:

```
5522/5523Finished Preprocessing 5523 files in 406.94421911239624 seconds
Omitted 5523 shots of 5523 total.
0/0 disruptive shots
WARNING: All shots were omitted, please ensure raw data is complete and available at /lus/theta-fs0/projects/fusiondl_aesp/zamora/signal_data/.
4327 1196
```

# Notes on Revisiting Preprocessing

## Preprocessing Information

To understand what might be going wrong with the preprocessing step, let's walk through what the code is actually doing (a minimal driver sketch follows these steps).

**Step 1** Call `guarantee_preprocessed(conf)`, which is defined in `plasma/preprocessor/preprocess.py`. This function first initializes a `Preprocessor()` object (whose class definition is in the same file), and then checks whether the preprocessing was already done (by looking for a file). The preprocessor object is called `pp`.

**Step 2** Assuming preprocessing is needed, we call `pp.clean_shot_lists()`, which loops through each file in the `shot_lists` directory and calls `self.clean_shot_list()` (not plural) for each text-file item. I do not believe this function is doing anything when I run it, because all the shot-list files have already been "cleaned." Cleaning a shot-list file just means the data is corrected to have two columns, and the file is renamed (to include "clear" in the name).

**Step 3** We call `pp.preprocess_all()`, which parses some of the config file, and ultimately calls `self.preprocess_from_files(shot_files_all, use_shots)` (where I believe `shot_files_all` is the output directory, and `use_shots` is the number of shots to use).

**Step 4** The `preprocess_from_files()` function does the actual preprocessing. It does this by creating a multiprocessing pool and mapping the work to the `self.preprocess_single_file` function (note that the code for the `ShotList` class is in `plasma/primitives/shots.py`, and the preprocessing code is still in `plasma/preprocessor/preprocess.py`).

**Important:** It looks like the code uses the path definitions in `data/shot_lists/signals.py` to define the location/path of the signal data. I believe that some of the signal data is missing, which is causing every "shot" to be labeled as incomplete (and consequently thrown out).
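For reference, this entire chain is kicked off by a single call, so a tiny driver script is enough to reproduce the preprocessing step interactively. The sketch below assumes `plasma-python` is installed and that `examples/conf.yaml` has been edited as described above; the `from plasma.conf import conf` import is an assumption based on how the bundled example scripts load their configuration:

```python
# Minimal preprocessing driver (a sketch of the call chain in Steps 1-4).
# Assumes plasma-python is installed and conf.yaml points at your data.
from plasma.conf import conf
from plasma.preprocessor.preprocess import guarantee_preprocessed

if __name__ == "__main__":
    # Step 1: builds a Preprocessor and checks whether preprocessing already ran.
    # Steps 2-4: cleans the shot lists, then preprocesses all shots in parallel
    # using a Pathos multiprocessing pool (no MPI is involved).
    guarantee_preprocessed(conf)
```

Because the parallelism here is node-local multiprocessing, this is also where the `use_cores` cap discussed earlier takes effect.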
### Possible Issues

From the preprocessing output, it is clear that the *Signal Radiated Power Core* data was not downloaded correctly. According to the `data/shot_lists/signals.py` file, the data *should* be in `/lus/theta-fs0/projects/fusiondl_aesp/<username>/signal_data/jet/ppf/bolo/kb5h/channel14`. However, the only subdirectory of `~/jet/ppf/` is `~/jet/ppf/efit`.

Another possible issue is that the `data/shot_lists/signals.py` file specifies the **name** of the directory containing the *Radiated Power* data incorrectly (*I THINK*). Instead of the following line:

`pradtot = Signal("Radiated Power",['jpf/db/b5r-ptot>out'],[jet])`

We might need this:

`pradtot = Signal("Radiated Power",['jpf/db/b5r-ptot\>out'],[jet])`

The issue has to do with the `>` character in the directory name (without a `\` escape character, Python may be looking in the wrong path). **NOTE: I need to confirm that there is actually an issue with the way the code is actually using the string.** (In a plain Python string literal, `>` needs no escaping and `'\>'` is just a backslash followed by `>`, so an escape should only matter if the path is later passed through a shell or some other parser.)

## Singularity/Docker Notes

Recall that the data preprocessing step was PAINFULLY slow on Theta, and so I decided to use Cooley. To simplify the process of using Cooley, I created a Docker image with the necessary environment. **Personal Note:** I performed this work on my local machine (Mac) in `/Users/rzamora/container-recipes`.

In order to use a Docker image within a Singularity container (required on ALCF machines), it is useful to build the image on your local machine and push it to Docker Hub:

**Step 1:** Install Docker if you don't have it. [Docker-Mac](https://www.docker.com/docker-mac) works well for Mac.

**Step 2:** Build a Docker image using the recipe discussed below.

```
export IMAGENAME="test_image"
export RECIPENAME="Docker.centos7-cuda-tf1.12.0"
docker build -t $IMAGENAME -f $RECIPENAME .
```

You can check that the image is functional by starting an interactive shell session and checking that the necessary Python modules are available. For example (using `-it` for an interactive session):

```
docker run --rm -it -v $PWD:/tmp -w /tmp $IMAGENAME:latest bash
# python -c "import keras; import plasma; print(plasma.__file__)"
```

Note that the `plasma-python` source code will be located in `/root/plasma-python/` for the recipe described below.

**Step 3:** Push the image to [Docker Hub](https://hub.docker.com/).

Using your Docker Hub username:

```
docker login --username=<username>
```

Then, "tag" the image using the `IMAGE ID` value displayed by `docker image ls`:

```
docker tag <IMAGE ID> <username>/<repository>:<tag>