diff --git a/tests/end2end_pytorch/run.sh b/tests/end2end_pytorch/run.sh
index 3333a66..c9acb32 100755
--- a/tests/end2end_pytorch/run.sh
+++ b/tests/end2end_pytorch/run.sh
@@ -24,6 +24,30 @@ pytest .
 # run the example
 cd examples/local
 sh run.sh
+mv output output_OLD
+# re-run the example to check reproducibility
+sh run.sh
+# check results are the same
+DIFF_LINES=`grep "best_dev_metric" output*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | uniq | wc -l`
+if [ ${DIFF_LINES} -gt 1 ]; then
+    echo "ERROR: two identical runs produced different output results - review seed implementation"
+    exit 1
+else
+    echo "PASS: two identical runs produced the same output results."
+fi
+
+# run Orion
 cd ../..
 cd examples/local_orion
 sh run.sh
+mv orion_working_dir orion_working_dir_OLD
+# re-run the example to check reproducibility
+rm -fr orion_db*
+sh run.sh
+DIFF_LINES=`grep "best_dev_metric" orion_working_dir*/*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | sort | uniq | wc -l`
+if [ ${DIFF_LINES} -gt 2 ]; then
+    echo "ERROR: two identical Orion runs produced different output results - review seed implementation"
+    exit 1
+else
+    echo "PASS: two identical Orion runs produced the same output results."
+fi
diff --git a/tests/end2end_tensorflow/run.sh b/tests/end2end_tensorflow/run.sh
index 98dea4f..03c9dec 100755
--- a/tests/end2end_tensorflow/run.sh
+++ b/tests/end2end_tensorflow/run.sh
@@ -23,6 +23,30 @@ pytest .
 # run the examples
 cd examples/local
 sh run.sh
-cd ../../
+mv output output_OLD
+# re-run the example to check reproducibility
+sh run.sh
+# check results are the same
+DIFF_LINES=`grep "best_dev_metric" output*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | uniq | wc -l`
+if [ ${DIFF_LINES} -gt 1 ]; then
+    echo "ERROR: two identical runs produced different output results - review seed implementation"
+    exit 1
+else
+    echo "PASS: two identical runs produced the same output results."
+fi
+
+# run Orion
+cd ../..
 cd examples/local_orion
 sh run.sh
+mv orion_working_dir orion_working_dir_OLD
+# re-run the example to check reproducibility
+rm -fr orion_db*
+sh run.sh
+DIFF_LINES=`grep "best_dev_metric" orion_working_dir*/*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | sort | uniq | wc -l`
+if [ ${DIFF_LINES} -gt 2 ]; then
+    echo "ERROR: two identical Orion runs produced different output results - review seed implementation"
+    exit 1
+else
+    echo "PASS: two identical Orion runs produced the same output results."
+fi
diff --git a/{{cookiecutter.project_slug}}/examples/local/config.yaml b/{{cookiecutter.project_slug}}/examples/local/config.yaml
index 4b47520..2e58acc 100644
--- a/{{cookiecutter.project_slug}}/examples/local/config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/local/config.yaml
@@ -6,6 +6,9 @@ patience: 5
 architecture: my_model
 max_epoch: 99
 exp_name: my_exp_1
+# set to null to avoid setting a seed (can speed up GPU computation, but
+# results will not be reproducible)
+seed: 1234
 
 # architecture
 size: 10
diff --git a/{{cookiecutter.project_slug}}/examples/local_orion/config.yaml b/{{cookiecutter.project_slug}}/examples/local_orion/config.yaml
index 28fdd59..9c05efd 100644
--- a/{{cookiecutter.project_slug}}/examples/local_orion/config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/local_orion/config.yaml
@@ -6,6 +6,9 @@ patience: 5
 architecture: my_model
 max_epoch: 99
 exp_name: my_exp_1
+# set to null to avoid setting a seed (can speed up GPU computation, but
+# results will not be reproducible)
+seed: 1234
 
 # architecture
 size: 'orion~uniform(1,10,discrete=True)'
diff --git a/{{cookiecutter.project_slug}}/examples/local_orion/orion_config.yaml b/{{cookiecutter.project_slug}}/examples/local_orion/orion_config.yaml
index 263ef7b..713cc95 100644
--- a/{{cookiecutter.project_slug}}/examples/local_orion/orion_config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/local_orion/orion_config.yaml
@@ -1,11 +1,12 @@
 experiment:
   name: my_exp
-  max_trials: 1
+  max_trials: 2
   working_dir: orion_working_dir
-  algorithm:
-    random
+  algorithms:
+    random:
+      seed: 1234
 
 evc:
   non_monitored_arguments:
     - output
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_cc/config.yaml b/{{cookiecutter.project_slug}}/examples/slurm_cc/config.yaml
index 4b47520..2e58acc 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_cc/config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/slurm_cc/config.yaml
@@ -6,6 +6,9 @@ patience: 5
 architecture: my_model
 max_epoch: 99
 exp_name: my_exp_1
+# set to null to avoid setting a seed (can speed up GPU computation, but
+# results will not be reproducible)
+seed: 1234
 
 # architecture
 size: 10
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_cc/to_submit.sh b/{{cookiecutter.project_slug}}/examples/slurm_cc/to_submit.sh
index bb32622..5e33447 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_cc/to_submit.sh
+++ b/{{cookiecutter.project_slug}}/examples/slurm_cc/to_submit.sh
@@ -13,4 +13,4 @@ export MLFLOW_TRACKING_URI='mlruns'
 
-main --data ../data --output output --config config.yaml --disable-progressbar
+main --data ../data --output output --config config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/config.yaml b/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/config.yaml
index ed4a831..5c0028c 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/config.yaml
@@ -6,6 +6,9 @@ patience: 5
 architecture: my_model
 max_epoch: 99
 exp_name: my_exp_1
+# set to null to avoid setting a seed (can speed up GPU computation, but
+# results will not be reproducible)
+seed: 1234
 
 # architecture
 size: 'orion~uniform(1,100,discrete=True)'
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/orion_config.yaml b/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/orion_config.yaml
index 84a1677..f6bd2e1 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/orion_config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/orion_config.yaml
@@ -4,11 +4,13 @@ experiment:
   max_trials: 2
   working_dir: orion_working_dir
-  algorithm:
-    random
+  algorithms:
+    random:
+      seed: 1234
 
 evc:
   non_monitored_arguments:
     - output
     - data
+    - tmp-folder
   ignore_code_changes: true
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/to_submit.sh b/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/to_submit.sh
index f73cc75..209fecb 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/to_submit.sh
+++ b/{{cookiecutter.project_slug}}/examples/slurm_cc_orion/to_submit.sh
@@ -19,4 +19,5 @@ export ORION_DB_TYPE='pickleddb'
 orion -v hunt --config orion_config.yaml \
     main --data ../data --config config.yaml --disable-progressbar \
     --output '{exp.working_dir}/{exp.name}_{trial.id}/' \
-    --log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log'
+    --log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log' \
+    --tmp-folder ${SLURM_TMPDIR}
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_mila/config.yaml b/{{cookiecutter.project_slug}}/examples/slurm_mila/config.yaml
index 4b47520..2e58acc 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_mila/config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/slurm_mila/config.yaml
@@ -6,6 +6,9 @@ patience: 5
 architecture: my_model
 max_epoch: 99
 exp_name: my_exp_1
+# set to null to avoid setting a seed (can speed up GPU computation, but
+# results will not be reproducible)
+seed: 1234
 
 # architecture
 size: 10
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_mila/to_submit.sh b/{{cookiecutter.project_slug}}/examples/slurm_mila/to_submit.sh
index 464cd60..fed5b77 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_mila/to_submit.sh
+++ b/{{cookiecutter.project_slug}}/examples/slurm_mila/to_submit.sh
@@ -16,4 +16,4 @@ export MLFLOW_TRACKING_URI='mlruns'
 
-main --data ../data --output output --config config.yaml --disable-progressbar
+main --data ../data --output output --config config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/config.yaml b/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/config.yaml
index ed4a831..5c0028c 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/config.yaml
@@ -6,6 +6,9 @@ patience: 5
 architecture: my_model
 max_epoch: 99
 exp_name: my_exp_1
+# set to null to avoid setting a seed (can speed up GPU computation, but
+# results will not be reproducible)
+seed: 1234
 
 # architecture
 size: 'orion~uniform(1,100,discrete=True)'
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/orion_config.yaml b/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/orion_config.yaml
index 84a1677..f6bd2e1 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/orion_config.yaml
+++ b/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/orion_config.yaml
@@ -4,11 +4,13 @@ experiment:
   max_trials: 2
   working_dir: orion_working_dir
-  algorithm:
-    random
+  algorithms:
+    random:
+      seed: 1234
 
 evc:
   non_monitored_arguments:
     - output
     - data
+    - tmp-folder
   ignore_code_changes: true
diff --git a/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/to_submit.sh b/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/to_submit.sh
index 7e4c341..a6e669e 100644
--- a/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/to_submit.sh
+++ b/{{cookiecutter.project_slug}}/examples/slurm_mila_orion/to_submit.sh
@@ -22,4 +22,5 @@ export ORION_DB_TYPE='pickleddb'
 orion -v hunt --config orion_config.yaml \
     main --data ../data --config config.yaml --disable-progressbar \
     --output '{exp.working_dir}/{exp.name}_{trial.id}/' \
-    --log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log'
+    --log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log' \
+    --tmp-folder ${SLURM_TMPDIR}
diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/data/data_loader.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/data/data_loader.py
index b663f51..42916c9 100644
--- a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/data/data_loader.py
+++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/data/data_loader.py
@@ -37,11 +37,11 @@ def get_data(data_folder, prefix):  # pragma: no cover
 
 
 {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}
-def load_data(args, hyper_params):  # pragma: no cover
+def load_data(data_dir, hyper_params):  # pragma: no cover
     """Prepare the data into datasets.
 
     Args:
-        args (list): arguments passed from the cli
+        data_dir (str): path to the folder containing the data
         hyper_params (dict): hyper parameters from the config file
 
     Retruns:
@@ -51,9 +51,9 @@ def load_data(args, hyper_params):  # pragma: no cover
     """
     # __TODO__ load the data
-    train_examples, train_labels = get_data(args.data, 'train')
+    train_examples, train_labels = get_data(data_dir, 'train')
     train_dataset = tf.data.Dataset.from_tensor_slices((train_examples, train_labels))
-    dev_examples, dev_labels = get_data(args.data, 'dev')
+    dev_examples, dev_labels = get_data(data_dir, 'dev')
     dev_dataset = tf.data.Dataset.from_tensor_slices((dev_examples, dev_labels))
     train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(batch_size=hyper_params['batch_size'])
     dev_dataset = dev_dataset.batch(batch_size=hyper_params['batch_size'])
@@ -90,11 +90,11 @@ def __getitem__(self, index):
         return input_example, target_example
 
 
-def load_data(args, hyper_params):  # pragma: no cover
+def load_data(data_dir, hyper_params):  # pragma: no cover
     """Prepare the data into datasets.
 
     Args:
-        args (list): arguments passed from the cli
+        data_dir (str): path to the folder containing the data
         hyper_params (dict): hyper parameters from the config file
 
     Retruns:
@@ -104,9 +104,9 @@ def load_data(args, hyper_params):  # pragma: no cover
     """
     # __TODO__ load the data
-    train_input, train_target = get_data(args.data, 'train')
+    train_input, train_target = get_data(data_dir, 'train')
     train_data = MyDataset(train_input, train_target)
-    dev_input, dev_target = get_data(args.data, 'dev')
+    dev_input, dev_target = get_data(data_dir, 'dev')
     dev_data = MyDataset(dev_input, dev_target)
     train_loader = DataLoader(train_data, batch_size=hyper_params['batch_size'], shuffle=True)
     dev_loader = DataLoader(dev_data, batch_size=hyper_params['batch_size'], shuffle=False)
diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py
index 23723dc..54699e5 100755
--- a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py
+++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py
@@ -15,7 +15,9 @@
 from {{cookiecutter.project_slug}}.models.model_loader import load_model
 from {{cookiecutter.project_slug}}.models.model_loader import load_optimizer
 from {{cookiecutter.project_slug}}.models.model_loader import load_loss
+from {{cookiecutter.project_slug}}.utils.file_utils import rsync_folder
 from {{cookiecutter.project_slug}}.utils.logging_utils import LoggerWriter, log_exp_details
+from {{cookiecutter.project_slug}}.utils.reproducibility_utils import set_seed
 
 logger = logging.getLogger(__name__)
 
@@ -35,6 +37,10 @@ def main():
                         help='config file with generic hyper-parameters, such as optimizer, '
                              'batch_size, ... - in yaml format')
     parser.add_argument('--data', help='path to data', required=True)
+    parser.add_argument('--tmp-folder',
+                        help='will use this folder as working folder - it will copy the input data '
+                             'here, generate results here, and then copy them back to the output '
+                             'folder')
     parser.add_argument('--output', help='path to outputs - will store files here', required=True)
     parser.add_argument('--disable-progressbar', action='store_true',
                         help='will disable the progressbar while going over the mini-batch')
@@ -43,7 +49,21 @@ def main():
     parser.add_argument('--debug', action='store_true')
     args = parser.parse_args()
 
-    logging.basicConfig(level=logging.INFO)
+    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+    if not os.path.exists(args.output):
+        os.makedirs(args.output)
+
+    if args.tmp_folder is not None:
+        data_folder_name = os.path.basename(os.path.normpath(args.data))
+        rsync_folder(args.data, args.tmp_folder)
+        data_dir = os.path.join(args.tmp_folder, data_folder_name)
+        output_dir = os.path.join(args.tmp_folder, 'output')
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+    else:
+        data_dir = args.data
+        output_dir = args.output
 
     # will log to a file if provided (useful for orion on cluster)
     if args.log is not None:
@@ -72,37 +92,41 @@ def main():
         mlflow.start_run(run_id=mlflow_run_id)
     else:
         mlflow.start_run()
-    run(args, hyper_params)
+    run(args, data_dir, output_dir, hyper_params)
     mlflow.end_run()
 
+    if args.tmp_folder is not None:
+        rsync_folder(output_dir + os.path.sep, args.output)
+
 
-def run(args, hyper_params):
+def run(args, data_dir, output_dir, hyper_params):
     """Setup and run the dataloaders, training loops, etc.
 
     Args:
         args (list): arguments passed from the cli
+        data_dir (str): path to input folder
+        output_dir (str): path to output folder
         hyper_params (dict): hyper parameters from the config file
     """
-    log_exp_details(os.path.realpath(__file__), args)
-
-    if not os.path.exists(args.output):
-        os.makedirs(args.output)
-
     # __TODO__ change the hparam that are used from the training algorithm
     # (and NOT the model - these will be specified in the model itself)
     logger.info('List of hyper-parameters:')
     check_and_log_hp(
-        ['batch_size', 'optimizer', 'patience', 'architecture', 'max_epoch',
-         'exp_name'],
+        ['architecture', 'batch_size', 'exp_name', 'max_epoch', 'optimizer', 'patience', 'seed'],
         hyper_params)
 
-    train_loader, dev_loader = load_data(args, hyper_params)
+    if hyper_params["seed"] is not None:
+        set_seed(hyper_params["seed"])
+
+    log_exp_details(os.path.realpath(__file__), args)
+
+    train_loader, dev_loader = load_data(data_dir, hyper_params)
     model = load_model(hyper_params)
     optimizer = load_optimizer(hyper_params, model)
     loss_fun = load_loss(hyper_params)
 
     train(model, optimizer, loss_fun, train_loader, dev_loader, hyper_params['patience'],
-          args.output, max_epoch=hyper_params['max_epoch'],
+          output_dir, max_epoch=hyper_params['max_epoch'],
           use_progress_bar=not args.disable_progressbar,
           start_from_scratch=args.start_from_scratch)
diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/file_utils.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/file_utils.py
new file mode 100644
index 0000000..ead1653
--- /dev/null
+++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/file_utils.py
@@ -0,0 +1,18 @@
+import logging
+import os
+import subprocess
+
+logger = logging.getLogger(__name__)
+
+
+def rsync_folder(source, target):  # pragma: no cover
+    """Uses rsync to copy the content of source into target.
+
+    :param source: (str) path to the source folder.
+    :param target: (str) path to the target folder.
+    """
+    if not os.path.exists(target):
+        os.makedirs(target)
+
+    logger.info('rsyncing {} to {}'.format(source, target))
+    subprocess.check_call(["rsync", "-avzq", source, target])
diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/reproducibility_utils.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/reproducibility_utils.py
new file mode 100644
index 0000000..a64204a
--- /dev/null
+++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/reproducibility_utils.py
@@ -0,0 +1,27 @@
+import random
+
+import numpy as np
+{%- if cookiecutter.dl_framework == 'pytorch' %}
+import torch
+{%- endif %}
+{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}
+import tensorflow as tf
+{%- endif %}
+
+
+def set_seed(seed):  # pragma: no cover
+    """Set the provided seed in python/numpy/DL framework.
+
+    :param seed: (int) the seed
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+{%- if cookiecutter.dl_framework == 'pytorch' %}
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+{%- endif %}
+{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}
+    tf.random.set_seed(seed)
+{%- endif %}
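
For reference, the seeding contract this patch relies on can be exercised outside the template. The sketch below is a minimal standalone rendering of `set_seed` with the jinja conditionals of `reproducibility_utils.py` resolved for the pytorch case; the self-check at the end is illustrative only and is not part of the patch. It demonstrates the same property the end2end `run.sh` scripts verify at full scale: two runs seeded identically must produce identical results (there, identical `best_dev_metric` values in `stats.yaml`).

```python
# Standalone rendering of set_seed (pytorch branch), plus an
# illustrative self-check that is not part of the patch.
import random

import numpy as np
import torch


def set_seed(seed):
    """Seed python, numpy and torch, and force cuDNN into deterministic mode."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed, sample, re-seed, re-sample: identical seeds must give identical draws.
set_seed(1234)
first = torch.rand(3)
set_seed(1234)
second = torch.rand(3)
assert torch.equal(first, second), "identical seeds produced different draws"
print("PASS: seeded draws are reproducible")
```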