From 3a3a9f27444ea77c17034fa8740c520bc7d870fb Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 14:05:08 -0500 Subject: [PATCH 01/51] Print out total shot counts in preprocess.py --- plasma/preprocessor/preprocess.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py index e93e7309..2c6384e4 100644 --- a/plasma/preprocessor/preprocess.py +++ b/plasma/preprocessor/preprocess.py @@ -257,6 +257,11 @@ def guarantee_preprocessed(conf, verbose=False): shot_list_train, shot_list_validate, shot_list_test = apply_bleed_in( conf, shot_list_train, shot_list_validate, shot_list_test) if verbose: + g.print_unique('total: {} shots, {} disruptive'.format( + len(shot_list_validate)+len(shot_list_train)+len(shot_list_test), + shot_list_validate.num_disruptive() + + shot_list_train.num_disruptive() + + shot_list_test.num_disruptive())) g.print_unique('validate: {} shots, {} disruptive'.format( len(shot_list_validate), shot_list_validate.num_disruptive())) g.print_unique('training: {} shots, {} disruptive'.format( From 09c4d6771c7883c195b3f1b26fb6155e88b4469e Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 14:25:12 -0500 Subject: [PATCH 02/51] Always print # train shots before # validate shots in diagnostics Canonical order: train, validate, test --- plasma/models/runner.py | 4 ++-- plasma/models/shallow_runner.py | 6 +++--- plasma/models/torch_runner.py | 5 ++--- plasma/preprocessor/preprocess.py | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/plasma/models/runner.py b/plasma/models/runner.py index 5b3f1d5a..29c42c3c 100644 --- a/plasma/models/runner.py +++ b/plasma/models/runner.py @@ -29,10 +29,10 @@ def train(conf, shot_list_train, shot_list_validate, loader, validation_losses = [] validation_roc = [] training_losses = [] - print('validate: {} shots, {} disruptive'.format( - len(shot_list_validate), shot_list_validate.num_disruptive())) 
print('training: {} shots, {} disruptive'.format( len(shot_list_train), shot_list_train.num_disruptive())) + print('validate: {} shots, {} disruptive'.format( + len(shot_list_validate), shot_list_validate.num_disruptive())) if backend == 'tf' or backend == 'tensorflow': first_time = "tensorflow" not in sys.modules diff --git a/plasma/models/shallow_runner.py b/plasma/models/shallow_runner.py index 5d459a43..cdfe2c32 100644 --- a/plasma/models/shallow_runner.py +++ b/plasma/models/shallow_runner.py @@ -324,12 +324,12 @@ def build_callbacks(conf): def train(conf, shot_list_train, shot_list_validate, loader, shot_list_test=None): np.random.seed(1) - print('validate: {} shots, {} disruptive'.format( - len(shot_list_validate), - shot_list_validate.num_disruptive())) print('training: {} shots, {} disruptive'.format( len(shot_list_train), shot_list_train.num_disruptive())) + print('validate: {} shots, {} disruptive'.format( + len(shot_list_validate), + shot_list_validate.num_disruptive())) num_samples = conf['model']['shallow_model']['num_samples'] feature_extractor = FeatureExtractor(loader) diff --git a/plasma/models/torch_runner.py b/plasma/models/torch_runner.py index 5c85e134..a4fde559 100644 --- a/plasma/models/torch_runner.py +++ b/plasma/models/torch_runner.py @@ -414,11 +414,10 @@ def train(conf, shot_list_train, shot_list_validate, loader): data_gen = partial( loader.training_batch_generator_full_shot_partial_reset, shot_list=shot_list_train)() - print('validate: {} shots, {} disruptive'.format( - len(shot_list_validate), shot_list_validate.num_disruptive())) print('training: {} shots, {} disruptive'.format( len(shot_list_train), shot_list_train.num_disruptive())) - + print('validate: {} shots, {} disruptive'.format( + len(shot_list_validate), shot_list_validate.num_disruptive())) loader.set_inference_mode(False) train_model = build_torch_model(conf) diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py index 2c6384e4..deaccee3 
100644 --- a/plasma/preprocessor/preprocess.py +++ b/plasma/preprocessor/preprocess.py @@ -262,10 +262,10 @@ def guarantee_preprocessed(conf, verbose=False): shot_list_validate.num_disruptive() + shot_list_train.num_disruptive() + shot_list_test.num_disruptive())) - g.print_unique('validate: {} shots, {} disruptive'.format( - len(shot_list_validate), shot_list_validate.num_disruptive())) g.print_unique('training: {} shots, {} disruptive'.format( len(shot_list_train), shot_list_train.num_disruptive())) + g.print_unique('validate: {} shots, {} disruptive'.format( + len(shot_list_validate), shot_list_validate.num_disruptive())) g.print_unique('testing: {} shots, {} disruptive'.format( len(shot_list_test), shot_list_test.num_disruptive())) g.print_unique("...done") From df7228ea17915ffb5d7c34f7449b421ab3768861 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 14:40:33 -0500 Subject: [PATCH 03/51] Encapsulate printing of shot set sizes in new fn, in new file diagnostics.py --- plasma/models/runner.py | 6 ++---- plasma/models/shallow_runner.py | 2 ++ plasma/models/torch_runner.py | 6 ++---- plasma/preprocessor/preprocess.py | 16 ++++------------ plasma/utils/diagnostics.py | 28 ++++++++++++++++++++++++++++ 5 files changed, 38 insertions(+), 20 deletions(-) create mode 100644 plasma/utils/diagnostics.py diff --git a/plasma/models/runner.py b/plasma/models/runner.py index 29c42c3c..c2008817 100644 --- a/plasma/models/runner.py +++ b/plasma/models/runner.py @@ -1,6 +1,7 @@ from plasma.utils.state_reset import reset_states from plasma.utils.evaluation import get_loss_from_list from plasma.utils.performance import PerformanceAnalyzer +from plasma.utils.diagnostics import print_shot_list_sizes from plasma.models.loader import Loader, ProcessGenerator from plasma.conf import conf import pathos.multiprocessing as mp @@ -29,10 +30,7 @@ def train(conf, shot_list_train, shot_list_validate, loader, validation_losses = [] validation_roc = [] training_losses = 
[] - print('training: {} shots, {} disruptive'.format( - len(shot_list_train), shot_list_train.num_disruptive())) - print('validate: {} shots, {} disruptive'.format( - len(shot_list_validate), shot_list_validate.num_disruptive())) + print_shot_list_sizes(shot_list_train, shot_list_validate) if backend == 'tf' or backend == 'tensorflow': first_time = "tensorflow" not in sys.modules diff --git a/plasma/models/shallow_runner.py b/plasma/models/shallow_runner.py index cdfe2c32..0ea4962b 100644 --- a/plasma/models/shallow_runner.py +++ b/plasma/models/shallow_runner.py @@ -10,6 +10,7 @@ # from plasma.utils.state_reset import reset_states from plasma.utils.evaluation import get_loss_from_list from plasma.utils.performance import PerformanceAnalyzer +from plasma.utils.diagnostics import print_shot_list_sizes # from plasma.models.loader import Loader, ProcessGenerator # from plasma.conf import conf from sklearn.neural_network import MLPClassifier @@ -324,6 +325,7 @@ def build_callbacks(conf): def train(conf, shot_list_train, shot_list_validate, loader, shot_list_test=None): np.random.seed(1) + print_shot_list_sizes(shot_list_train, shot_list_validate) print('training: {} shots, {} disruptive'.format( len(shot_list_train), shot_list_train.num_disruptive())) diff --git a/plasma/models/torch_runner.py b/plasma/models/torch_runner.py index a4fde559..195b5275 100644 --- a/plasma/models/torch_runner.py +++ b/plasma/models/torch_runner.py @@ -5,6 +5,7 @@ from torch.autograd import Variable import torch.nn as nn import torch +from plasma.utils.diagnostics import print_shot_list_sizes from plasma.utils.downloading import makedirs_process_safe from plasma.utils.performance import PerformanceAnalyzer from plasma.utils.evaluation import get_loss_from_list @@ -414,10 +415,7 @@ def train(conf, shot_list_train, shot_list_validate, loader): data_gen = partial( loader.training_batch_generator_full_shot_partial_reset, shot_list=shot_list_train)() - print('training: {} shots, {} 
disruptive'.format( - len(shot_list_train), shot_list_train.num_disruptive())) - print('validate: {} shots, {} disruptive'.format( - len(shot_list_validate), shot_list_validate.num_disruptive())) + print_shot_list_sizes(shot_list_train, shot_list_validate) loader.set_inference_mode(False) train_model = build_torch_model(conf) diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py index deaccee3..0e649c2d 100644 --- a/plasma/preprocessor/preprocess.py +++ b/plasma/preprocessor/preprocess.py @@ -1,6 +1,6 @@ ''' ######################################################### -This file containts classes to handle data processing +This file contains classes to handle data processing Author: Julian Kates-Harbeck, jkatesharbeck@g.harvard.edu @@ -19,6 +19,7 @@ import pathos.multiprocessing as mp from plasma.utils.processing import append_to_filename +from plasma.utils.diagnostics import print_shot_list_sizes from plasma.primitives.shots import ShotList from plasma.utils.downloading import mkdirdepth @@ -257,16 +258,7 @@ def guarantee_preprocessed(conf, verbose=False): shot_list_train, shot_list_validate, shot_list_test = apply_bleed_in( conf, shot_list_train, shot_list_validate, shot_list_test) if verbose: - g.print_unique('total: {} shots, {} disruptive'.format( - len(shot_list_validate)+len(shot_list_train)+len(shot_list_test), - shot_list_validate.num_disruptive() - + shot_list_train.num_disruptive() - + shot_list_test.num_disruptive())) - g.print_unique('training: {} shots, {} disruptive'.format( - len(shot_list_train), shot_list_train.num_disruptive())) - g.print_unique('validate: {} shots, {} disruptive'.format( - len(shot_list_validate), shot_list_validate.num_disruptive())) - g.print_unique('testing: {} shots, {} disruptive'.format( - len(shot_list_test), shot_list_test.num_disruptive())) + print_shot_list_sizes(shot_list_train, shot_list_validate, + shot_list_test) g.print_unique("...done") return shot_list_train, shot_list_validate, 
shot_list_test diff --git a/plasma/utils/diagnostics.py b/plasma/utils/diagnostics.py new file mode 100644 index 00000000..6f632887 --- /dev/null +++ b/plasma/utils/diagnostics.py @@ -0,0 +1,28 @@ +''' +######################################################### +This file contains fns for printing diagnostic messages +######################################################### +''' + +from __future__ import print_function +import plasma.global_vars as g + + +def print_shot_list_sizes(shot_list_train, shot_list_validate, + shot_list_test=None): + nshots = len(shot_list_train) + len(shot_list_validate) + nshots_disrupt = (shot_list_train.num_disruptive() + + shot_list_validate.num_disruptive()) + if shot_list_test is not None: + nshots += len(shot_list_test) + nshots_disrupt += shot_list_test.num_disruptive() + g.print_unique('total: {} shots, {} disruptive'.format(nshots, + nshots_disrupt) + g.print_unique('training: {} shots, {} disruptive'.format( + len(shot_list_train), shot_list_train.num_disruptive())) + g.print_unique('validate: {} shots, {} disruptive'.format( + len(shot_list_validate), shot_list_validate.num_disruptive())) + if shot_list_test is not None: + g.print_unique('testing: {} shots, {} disruptive'.format( + len(shot_list_test), shot_list_test.num_disruptive())) + return From cbc31b42aac55e1d2fce646925176a309e1ccb5f Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 14:42:02 -0500 Subject: [PATCH 04/51] Add missing bracket --- plasma/utils/diagnostics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plasma/utils/diagnostics.py b/plasma/utils/diagnostics.py index 6f632887..d38d6e78 100644 --- a/plasma/utils/diagnostics.py +++ b/plasma/utils/diagnostics.py @@ -17,7 +17,7 @@ def print_shot_list_sizes(shot_list_train, shot_list_validate, nshots += len(shot_list_test) nshots_disrupt += shot_list_test.num_disruptive() g.print_unique('total: {} shots, {} disruptive'.format(nshots, - nshots_disrupt) + nshots_disrupt)) 
g.print_unique('training: {} shots, {} disruptive'.format( len(shot_list_train), shot_list_train.num_disruptive())) g.print_unique('validate: {} shots, {} disruptive'.format( From 366de2d8084e3bd5386c417790e5f4c2a143f98c Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 14:11:17 -0600 Subject: [PATCH 05/51] Comment out unused fn --- plasma/utils/processing.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/plasma/utils/processing.py b/plasma/utils/processing.py index d6995bcb..12938c1b 100644 --- a/plasma/utils/processing.py +++ b/plasma/utils/processing.py @@ -90,15 +90,15 @@ def train_test_split_robust(x, frac, do_shuffle=False): return train, test -def train_test_split_all(x, frac, do_shuffle=True): - groups = [] - length = len(x[0]) - mask = np.array(range(length)) < frac*length - if do_shuffle: - np.random.shuffle(mask) - for item in x: - groups.append((item[mask], item[~mask])) - return groups +# def train_test_split_all(x, frac, do_shuffle=True): +# groups = [] +# length = len(x[0]) +# mask = np.array(range(length)) < frac*length +# if do_shuffle: +# np.random.shuffle(mask) +# for item in x: +# groups.append((item[mask], item[~mask])) +# return groups def concatenate_sublists(superlist): From e527c38d215610ce714135d8bb0d050ce8c13fbb Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 15:21:11 -0500 Subject: [PATCH 06/51] Add details about how many omitted shots were disruptive to preprocess.py --- plasma/preprocessor/preprocess.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py index 0e649c2d..871a27fc 100644 --- a/plasma/preprocessor/preprocess.py +++ b/plasma/preprocessor/preprocess.py @@ -101,12 +101,16 @@ def preprocess_from_files(self, shot_files, use_shots): pool.close() pool.join() - print('Finished Preprocessing {} files in {} seconds'.format( + print('Finished 
preprocessing {} files in {} seconds'.format( len(shot_list_picked), time.time() - start_time)) + print('Using {}/{} disruptive shots'.format( + used_shots.num_disruptive(), len(used_shots))) print('Omitted {} shots of {} total.'.format( len(shot_list_picked) - len(used_shots), len(shot_list_picked))) - print('{}/{} disruptive shots'.format(used_shots.num_disruptive(), - len(used_shots))) + print('Omitted {} disruptive shots of {} total disruptive.'.format( + shot_list_picked.num_disruptive() - used_shots.num_disruptive, + shot_list_picked.num_disruptive())) + if len(used_shots) == 0: print("WARNING: All shots were omitted, please ensure raw data " " is complete and available at {}.".format( From 286dc8793feda0bd58ddd9c0cf7d2fc37ae7ed08 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 15:32:28 -0500 Subject: [PATCH 07/51] Typo --- plasma/preprocessor/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py index 871a27fc..eaf99604 100644 --- a/plasma/preprocessor/preprocess.py +++ b/plasma/preprocessor/preprocess.py @@ -108,7 +108,7 @@ def preprocess_from_files(self, shot_files, use_shots): print('Omitted {} shots of {} total.'.format( len(shot_list_picked) - len(used_shots), len(shot_list_picked))) print('Omitted {} disruptive shots of {} total disruptive.'.format( - shot_list_picked.num_disruptive() - used_shots.num_disruptive, + shot_list_picked.num_disruptive() - used_shots.num_disruptive(), shot_list_picked.num_disruptive())) if len(used_shots) == 0: From 300966252320fd792fc7616bc5962bdf69270139 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 15:52:43 -0600 Subject: [PATCH 08/51] Remove stray diagnostic print --- plasma/primitives/shots.py | 1 - 1 file changed, 1 deletion(-) diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py index cafb76ed..a9e95df9 100644 --- a/plasma/primitives/shots.py +++ 
b/plasma/primitives/shots.py @@ -121,7 +121,6 @@ def split_train_test(self, conf): shot_numbers_train = [shot.number for shot in shot_list_train] shot_numbers_test = [shot.number for shot in shot_list_test] - print(len(shot_numbers_train), len(shot_numbers_test)) # make sure we only use pre-filtered valid shots shots_train = self.filter_by_number(shot_numbers_train) shots_test = self.filter_by_number(shot_numbers_test) From 0e4715b78e5e73bb0de220c1e7818a0648d1ed47 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 21 Nov 2019 17:10:36 -0500 Subject: [PATCH 09/51] Reformat diagnostics --- plasma/preprocessor/preprocess.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py index eaf99604..18cf3114 100644 --- a/plasma/preprocessor/preprocess.py +++ b/plasma/preprocessor/preprocess.py @@ -101,15 +101,17 @@ def preprocess_from_files(self, shot_files, use_shots): pool.close() pool.join() - print('Finished preprocessing {} files in {} seconds'.format( + print('\nFinished preprocessing {} files in {} seconds'.format( len(shot_list_picked), time.time() - start_time)) - print('Using {}/{} disruptive shots'.format( - used_shots.num_disruptive(), len(used_shots))) - print('Omitted {} shots of {} total.'.format( + print('Using {} shots ({} disruptive shots)'.format( + len(used_shots), used_shots.num_disruptive())) + print('Omitted {} shots of {} total shots'.format( len(shot_list_picked) - len(used_shots), len(shot_list_picked))) - print('Omitted {} disruptive shots of {} total disruptive.'.format( - shot_list_picked.num_disruptive() - used_shots.num_disruptive(), - shot_list_picked.num_disruptive())) + print( + 'Omitted {} disruptive shots of {} total disruptive shots'.format( + shot_list_picked.num_disruptive() + - used_shots.num_disruptive(), + shot_list_picked.num_disruptive())) if len(used_shots) == 0: print("WARNING: All shots were omitted, please ensure raw 
data " From a99c01acf94d71a75d4249ba20a8d2cdbee94688 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Fri, 22 Nov 2019 10:25:44 -0600 Subject: [PATCH 10/51] Reduce number of lines --- plasma/primitives/data.py | 16 ++++------- plasma/primitives/shots.py | 57 +++++++++++++------------------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/plasma/primitives/data.py b/plasma/primitives/data.py index 63d30abc..5d84fb0b 100644 --- a/plasma/primitives/data.py +++ b/plasma/primitives/data.py @@ -5,7 +5,6 @@ import re from scipy.interpolate import UnivariateSpline - from plasma.utils.processing import get_individual_shot_file from plasma.utils.downloading import get_missing_value_array from plasma.utils.hashing import myhash @@ -123,17 +122,14 @@ def load_data(self, prepath, shot, dtype='float32'): if self.is_ip: print('shot {} has no current'.format(shot.number)) else: - print( - 'Signal {}, shot {} contains no data'.format( - self.description, shot.number)) + print('Signal {}, shot {} contains no data'.format( + self.description, shot.number)) return None, None, False # make sure data doesn't contain nan if np.any(np.isnan(t)) or np.any(np.isnan(sig)): - print( - 'Signal {}, shot {} contains NAN'.format( - self.description, - shot.number)) + print('Signal {}, shot {} contains NAN'.format( + self.description, shot.number)) return None, None, False return t, sig, True @@ -278,8 +274,8 @@ def load_data(self, prepath, shot, dtype='float32'): return t, sig_interp, True def fetch_data(self, machine, shot_num, c): - time, data, mapping, success = self.fetch_data_basic(machine, shot_num, - c) + time, data, mapping, success = self.fetch_data_basic( + machine, shot_num, c) path = self.get_path(machine) mapping_path = self.get_mapping_path(machine) diff --git a/plasma/primitives/shots.py b/plasma/primitives/shots.py index a9e95df9..3079b8b2 100644 --- a/plasma/primitives/shots.py +++ b/plasma/primitives/shots.py @@ -13,7 +13,6 @@ import os.path import 
sys import random as rnd - import numpy as np from plasma.utils.processing import train_test_split, cut_and_resample_signal @@ -36,11 +35,9 @@ def __repr__(self): return self.__str__() def get_single_shot_numbers_and_disruption_times(self, full_path): - data = np.loadtxt( - full_path, ndmin=1, dtype={ - 'names': ( - 'num', 'disrupt_times'), 'formats': ( - 'i4', 'f4')}) + data = np.loadtxt(full_path, ndmin=1, + dtype={'names': ('num', 'disrupt_times'), + 'formats': ('i4', 'f4')}) shots = np.array(list(zip(*data))[0]) disrupt_times = np.array(list(zip(*data))[1]) return shots, disrupt_times @@ -77,21 +74,18 @@ def __init__(self, shots=None): assert(all([isinstance(shot, Shot) for shot in shots])) self.shots = [shot for shot in shots] - def load_from_shot_list_files_object( - self, shot_list_files_object, signals): + def load_from_shot_list_files_object(self, shot_list_files_object, + signals): machine = shot_list_files_object.machine shot_numbers, disruption_times = ( shot_list_files_object.get_shot_numbers_and_disruption_times()) for number, t in list(zip(shot_numbers, disruption_times)): - self.append( - Shot(number=number, t_disrupt=t, machine=machine, - signals=[s for s in signals if - s.is_defined_on_machine(machine)] - ) - ) - - def load_from_shot_list_files_objects( - self, shot_list_files_objects, signals): + self.append(Shot(number=number, t_disrupt=t, machine=machine, + signals=[s for s in signals if + s.is_defined_on_machine(machine)])) + + def load_from_shot_list_files_objects(self, shot_list_files_objects, + signals): for obj in shot_list_files_objects: self.load_from_shot_list_files_object(obj, signals) @@ -276,16 +270,9 @@ class Shot(object): property. 
''' - def __init__( - self, - number=None, - machine=None, - signals=None, - signals_dict=None, - ttd=None, - valid=None, - is_disruptive=None, - t_disrupt=None): + def __init__(self, number=None, machine=None, signals=None, + signals_dict=None, ttd=None, valid=None, is_disruptive=None, + t_disrupt=None): ''' Shot objects contain following attributes: @@ -415,8 +402,7 @@ def get_signals_and_times_from_file(self, conf): if self.is_disruptive and self.t_disrupt > np.max(t): t_max_total = ( np.max(t) + signal.get_data_avail_tolerance( - self.machine) - ) + self.machine)) if (self.t_disrupt > t_max_total): print('Shot {}: disruption event '.format(self.number), 'is not contained in valid time region of ', @@ -425,8 +411,8 @@ def get_signals_and_times_from_file(self, conf): self.t_disrupt - np.max(t))) valid = False else: - t_max = np.max( - t) + signal.get_data_avail_tolerance(self.machine) + t_max = np.max(t) + signal.get_data_avail_tolerance( + self.machine) else: t_max = min(t_max, np.max(t)) @@ -449,13 +435,8 @@ def get_signals_and_times_from_file(self, conf): return time_arrays, signal_arrays, t_min, t_max, valid - def cut_and_resample_signals( - self, - time_arrays, - signal_arrays, - t_min, - t_max, - conf): + def cut_and_resample_signals(self, time_arrays, signal_arrays, t_min, + t_max, conf): dt = conf['data']['dt'] signals_dict = dict() From 31fafa34e0e4aeafc5ad04e40c7d5e61ecc65892 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Fri, 22 Nov 2019 11:35:43 -0600 Subject: [PATCH 11/51] Drop Rick Zamora's ALCF notes into docs/ --- docs/ALCF.md | 366 +++++++++++++++++++++++++++++++++++++++++++++ examples/conf.yaml | 6 +- 2 files changed, 369 insertions(+), 3 deletions(-) create mode 100644 docs/ALCF.md diff --git a/docs/ALCF.md b/docs/ALCF.md new file mode 100644 index 00000000..63877423 --- /dev/null +++ b/docs/ALCF.md @@ -0,0 +1,366 @@ +# ALCF Theta `plasma-python` FRNN Notes + +**Author: Rick Zamora (rzamora@anl.gov)** + +This document is intended to 
act as a tutorial for running the [plasma-python](https://github.com/PPPLDeepLearning/plasma-python) implementation of the Fusion recurrent neural network (FRNN) on the ALCF Theta supercomputer (Cray XC40; Intel KNL processors). The steps followed in these notes are based on the Princeton [Tiger-GPU tutorial](https://github.com/PPPLDeepLearning/plasma-python/blob/master/docs/PrincetonUTutorial.md#location-of-the-data-on-tigress), hosted within the main GitHub repository for the project. + +## Environment Setup + + +Choose a *root* directory for FRNN-related installations on Theta: + +``` +export FRNN_ROOT= +cd $FRNN_ROOT +``` + +*Personal Note: Using FRNN_ROOT=/home/zamora/ESP* + +Create a simple directory structure allowing experimental *builds* of the `plasma-python` python code/library: + +``` +mkdir build +mkdir build/miniconda-3.6-4.5.4 +cd build/miniconda-3.6-4.5.4 +``` + +### Custom Miniconda Environment Setup + +Copy miniconda installation script to working directory (and install): + +``` +cp /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/install_miniconda-3.6-4.5.4.sh . +./install_miniconda-3.6-4.5.4.sh +``` + +The `install_miniconda-3.6-4.5.4.sh` script will install `miniconda-4.5.4` (using `Python-3.6`), as well as `Tensorflow-1.12.0` and `Keras 2.2.4`. + + +Update your environment variables to use miniconda: + +``` +export PATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/bin:$PATH +export PYTHONPATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/lib/python3.6/site-packages/:$PYTHONPATH +``` + +Note that the previous lines (as well as the definition of `FRNN_ROOT`) can be appended to your `$HOME/.bashrc` file if you want to use this environment on Theta by default. + + +## Installing `plasma-python` + +Here, we assume the installation is within the custom miniconda environment installed in the previous steps. 
We also assume the following commands have already been executed: + +``` +export FRNN_ROOT=<desired-root-directory> +export PATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/bin:$PATH +export PYTHONPATH=${FRNN_ROOT}/build/miniconda-3.6-4.5.4/miniconda3/4.5.4/lib/python3.6/site-packages/:$PYTHONPATH +``` + +*Personal Note: Using `export FRNN_ROOT=/lus/theta-fs0/projects/fusiondl_aesp/zamora/FRNN_project`* + +If the environment is set up correctly, installation of `plasma-python` is straightforward: + +``` +cd ${FRNN_ROOT}/build/miniconda-3.6-4.5.4 +git clone https://github.com/PPPLDeepLearning/plasma-python.git +cd plasma-python +python setup.py build +python setup.py install +``` + +## Data Access + +Sample data and metadata are available in `/lus/theta-fs0/projects/FRNN/tigress/alexeys/signal_data` and `/lus/theta-fs0/projects/FRNN/tigress/alexeys/shot_lists`, respectively. It is recommended that users create their own symbolic links to these directories. I recommend that you do this within a directory called `/lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/`. For example: + +``` +ln -s /lus/theta-fs0/projects/fusiondl_aesp/FRNN/tigress/alexeys/shot_lists  /lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/shot_lists +ln -s /lus/theta-fs0/projects/fusiondl_aesp/FRNN/tigress/alexeys/signal_data  /lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/signal_data +``` + +For the examples included in `plasma-python`, there is a configuration file that specifies the root directory of the raw data. Change the `fs_path: '/tigress'` line in `examples/conf.yaml` to reflect the following: + +``` +fs_path: '/lus/theta-fs0/projects/fusiondl_aesp' +``` + +It's also a good idea to change `num_gpus: 4` to `num_gpus: 1`. I am also using the `jet_data_0D` dataset: + +``` +paths: + data: jet_data_0D +``` + + +### Data Preprocessing + +#### The SLOW Way (On Theta) + +Theta is KNL-based, and is **not** the best resource for processing many text files in python. 
However, the preprocessing step *can* be run on Theta using the following steps (although it may need to be repeated many times to get through the whole dataset in a 60-minute debug queue): + +``` +cd ${FRNN_ROOT}/build/miniconda-3.6-4.5.4/plasma-python/examples +cp /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/submit_guarantee_preprocessed.sh . +``` + +Modify the paths defined in `submit_guarantee_preprocessed.sh` to match your environment. + +Note that the preprocessing module will use Pathos multiprocessing (not MPI/mpi4py). Therefore, the script will see every compute core (all 256 per node) as an available resource. Since the LUSTRE file system is unlikely to perform well with 256 processes (on the same node) opening/closing/creating files at once, it might improve performance if you make a slight change to line 85 in the `~/plasma-python/plasma/preprocessor/preprocess.py` file: + +``` +line 85: use_cores = min( <desired-maximum-process-count>, max(1,mp.cpu_count()-2) ) +``` + +After optionally re-building and installing `plasma-python` with this change, submit the preprocessing job: + +``` +qsub submit_guarantee_preprocessed.sh +``` + +#### The FAST Way (On Cooley) + +You will find it much less painful to preprocess the data on Cooley, because the Haswell processors are much better suited for this... Log onto the ALCF Cooley Machine: + +``` +ssh <your-alcf-username>@cooley.alcf.anl.gov +``` + +Copy my `cooley_preprocess` example directory to whatever directory you choose to work in: + +``` +cp -r /lus/theta-fs0/projects/fusiondl_aesp/FRNN/rzamora/scripts/cooley_preprocess . +cd cooley_preprocess +``` + +This directory has a Singularity image with everything you need to run your code on Cooley. 
Assuming you have created symbolic links to the `shot_lists` and `signal_data` directories in `/lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/`, you can just submit the included `COBALT` script (to specify the data you want to process, just modify the included `conf.yaml` file): + +``` +qsub submit.sh +``` + +For me, this finishes in less than 10 minutes, and creates 5523 `.npz` files in the `/lus/theta-fs0/projects/fusiondl_aesp/<your-alcf-username>/processed_shots/` directory. The output file of the COBALT submission ends with the following message: + +``` +5522/5523Finished Preprocessing 5523 files in 406.94421911239624 seconds +Omitted 5523 shots of 5523 total. +0/0 disruptive shots +WARNING: All shots were omitted, please ensure raw data is complete and available at /lus/theta-fs0/projects/fusiondl_aesp/zamora/signal_data/. +4327 1196 +``` + + +# Notes on Revisiting Pre-Processes + +## Preprocessing Information + +To understand what might be going wrong with the preprocessing step, let's investigate what the code is actually doing. + +**Step 1** Call `guarantee_preprocessed(conf)`, which is defined in `plasma/preprocessor/preprocess.py`. This function first initializes a `Preprocessor()` object (whose class definition is in the same file), and then checks if the preprocessing was already done (by looking for a file). The preprocessor object is called `pp`. + +**Step 2** Assuming preprocessing is needed, we call `pp.clean_shot_lists()`, which loops through each file in the `shot_lists` directory and calls `self.clean_shot_list()` (not plural) for each text-file item. I do not believe this function is doing anything when I run it, because all the shot list files have been "cleaned." The cleaning of a shot-list file just means the data is corrected to have two columns, and the file is renamed (to have "clear" in the name). 
+ +**Step 3** We call `pp.preprocess_all()`, which parses some of the config file, and ultimately calls `self.preprocess_from_files(shot_files_all,use_shots)` (where I believe `shot_files_all` is the output directory, and `use_shots` is the number of shots to use). + +**Step 4** The `preprocess_from_files()` function is used to do the actual preprocessing. It does this by creating a multiprocessing pool, and mapping the processes to the `self.preprocess_single_file` function (note that the code for `ShotList` class is in `plasma/primitives/shots.py`, and the preprocessing code is still in `plasma/preprocessor/preprocess.py`). + +**Important:** It looks like the code uses the path definitions in `data/shot_lists/signals.py` to define the location/path of signal data. I believe that some of the signal data is missing, which is causing every "shot" to be labeled as incomplete (and consequently thrown out). + +### Possible Issues + +From the preprocessing output, it is clear that the *Signal Radiated Power Core* data was not downloaded correctly. According to the `data/shot_lists/signals.py` file, the data *should* be in `/lus/theta-fs0/projects/fusiondl_aesp//signal_data/jet/ppf/bolo/kb5h/channel14`. However, the only subdirectory of `~/jet/ppf/` is `~/jet/ppf/efit` + +Another possible issue is that the `data/shot_lists/signals.py` file specifies the **name** of the directory containing the *Radiated Power* data incorrectly (*I THINK*). Instead of the following line: + +`pradtot = Signal("Radiated Power",['jpf/db/b5r-ptot>out'],[jet])` + +We might need this: + +`pradtot = Signal("Radiated Power",['jpf/db/b5r-ptot\>out'],[jet])` + +The issue has to do with the `>` character in the directory name (without the proper `\` escape character, python may be looking in the wrong path). 
**NOTE: I need to confirm that there is actually an issue with the way the code is actually using the string.** + + +## Singularity/Docker Notes + +Recall that the data preprocessing step was PAINFULLY slow on Theta, and so I decided to use Cooley. To simplify the process of using Cooley, I created a Docker image with the necessary environment. **Personal Note:** I performed this work on my local machine (Mac) in `/Users/rzamora/container-recipes`. + + +In order to use a Docker image within a Singularity container (required on ALCF machines), it is useful to build the image on your local machine and push it to "Docker Hub": + + +**Step 1:** Install Docker if you don't have it. [Docker-Mac](https://www.docker.com/docker-mac) works well for Mac. + +**Step 2:** Build a Docker image using the recipe discussed below. + +``` +export IMAGENAME="test_image" +export RECIPENAME="Docker.centos7-cuda-tf1.12.0" +docker build -t $IMAGENAME -f $RECIPENAME . +``` + +You can check that the image is functional by starting an interactive shell session, and checking that the necessary python modules are available. For example (using `-it` for an interactive session): + +``` +docker run --rm -it -v $PWD:/tmp -w /tmp $IMAGENAME:latest bash +# python -c "import keras; import plasma; print(plasma.__file__)" +``` + +Note that the `plasma-python` source code will be located in `/root/plasma-python/` for the recipe described below. + +**Step 3:** Push the image to [Docker Hub](https://hub.docker.com/). + +Using your docker-hub username: + +``` +docker login --username= +``` + +Then, "tag" the image using the `IMAGE ID` value displayed with `docker image ls`: + +``` +docker tag /: