release 1.1 (#9)
* feat: add option to specify a temporary folder for the experiment. (#5)

* added option to rsync input and output data

* added docstring

* logging to stdout now

* fixed script for clusters - now using slurm tmpdir to write temp results

* fixing travis

* added missing docstring

* fixed tensorflow part (method signature change)

* renamed variables

* Seed for reproducibility (#6)

* added option to rsync input and output data

* added docstring

* logging to stdout now

* fixed script for clusters - now using slurm tmpdir to write temp results

* fixing travis

* added missing docstring

* fixed tensorflow part (method signature change)

* added seed for pytorch

* fixed typo

* added comment on how to use seed

* fixed flake8

* added test on reproducibility

* removed pytorch part from tensorflow

* fixed cookiecutter syntax

* added check for tensorflow

* fixed typo in test file

* added command to set the seed in tensorflow

* fixed flake8 error

* fixed typos

* removed duplicate log

* typo in docstring

* better error message in test

* added test to check repro using Orion (#8)

* added test to check repro using Orion

* more log into travis

* more info to debug travis

* running two trials for orion

* added seed to orion

* added orion test to tensorflow part

* better log messages in travis
mirkobronzi authored Jul 29, 2020
1 parent 298c140 commit 26dee14
Showing 19 changed files with 174 additions and 32 deletions.
24 changes: 24 additions & 0 deletions tests/end2end_pytorch/run.sh
@@ -24,6 +24,30 @@ pytest .
# run the example
cd examples/local
sh run.sh
mv output output_OLD
# re-run the example to check reproducibility
sh run.sh
# check results are the same
DIFF_LINES=`grep "best_dev_metric" output*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | uniq | wc -l`
if [ ${DIFF_LINES} -gt 1 ]; then
echo "ERROR: two identical runs produced different output results - review seed implementation"
exit 1
else
echo "PASS: two identical runs produced the same output results."
fi

# run Orion
cd ../..
cd examples/local_orion
sh run.sh
mv orion_working_dir orion_working_dir_OLD
# re-run the example to check reproducibility
rm -fr orion_db*
sh run.sh
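# each run launches two trials, so up to two distinct metric values are legitimate;
# more than two means the same trial gave different results across the two runs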
DIFF_LINES=`grep "best_dev_metric" orion_working_dir*/*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | sort | uniq | wc -l`
if [ ${DIFF_LINES} -gt 2 ]; then
echo "ERROR: two identical Orion runs produced different output results - review seed implementation"
exit 1
else
echo "PASS: two identical Orion runs produced the same output results."
fi
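
Both checks reduce to counting distinct best_dev_metric values across the runs' stats files. As a minimal sketch, here is the same comparison in Python — assuming stats.yaml parses as YAML with a top-level best_dev_metric key (the grep/sed pipeline above does not need that assumption):

import glob
import yaml  # PyYAML

metrics = set()
for path in glob.glob('output*/stats.yaml'):
    with open(path) as stream:
        metrics.add(yaml.safe_load(stream)['best_dev_metric'])
if len(metrics) > 1:
    raise SystemExit('ERROR: two identical runs produced different output results')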
26 changes: 25 additions & 1 deletion tests/end2end_tensorflow/run.sh
@@ -23,6 +23,30 @@ pytest .
# run the examples
cd examples/local
sh run.sh
cd ../../
mv output output_OLD
# re-run the example to check reproducibility
sh run.sh
# check results are the same
DIFF_LINES=`grep "best_dev_metric" output*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | uniq | wc -l`
if [ ${DIFF_LINES} -gt 1 ]; then
echo "ERROR: two identical runs produced different output results - review seed implementation"
exit 1
else
echo "PASS: two identical runs produced the same output results."
fi

# run Orion
cd ../..
cd examples/local_orion
sh run.sh
mv orion_working_dir orion_working_dir_OLD
# re-run the example to check reproducibility
rm -fr orion_db*
sh run.sh
DIFF_LINES=`grep "best_dev_metric" orion_working_dir*/*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | sort | uniq | wc -l`
if [ ${DIFF_LINES} -gt 2 ]; then
echo "ERROR: two identical Orion runs produced different output results - review seed implementation"
exit 1
else
echo "PASS: two identical Orion runs produced the same output results."
fi
3 changes: 3 additions & 0 deletions {{cookiecutter.project_slug}}/examples/local/config.yaml
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 10
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 'orion~uniform(1,10,discrete=True)'
@@ -1,11 +1,12 @@
experiment:
name:
my_exp
max_trials: 1
max_trials: 2
working_dir:
orion_working_dir
algorithm:
random
algorithms:
random:
seed: 1234
evc:
non_monitored_arguments:
- output
3 changes: 3 additions & 0 deletions {{cookiecutter.project_slug}}/examples/slurm_cc/config.yaml
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 10
@@ -13,4 +13,4 @@

export MLFLOW_TRACKING_URI='mlruns'

main --data ../data --output output --config config.yaml --disable-progressbar
main --data ../data --output output --config config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 'orion~uniform(1,100,discrete=True)'
@@ -4,11 +4,13 @@ experiment:
max_trials: 2
working_dir:
orion_working_dir
algorithm:
random
algorithms:
random:
seed: 1234
evc:
non_monitored_arguments:
- output
- data
- tmp-folder
ignore_code_changes:
true
@@ -19,4 +19,5 @@ export ORION_DB_TYPE='pickleddb'
orion -v hunt --config orion_config.yaml \
main --data ../data --config config.yaml --disable-progressbar \
--output '{exp.working_dir}/{exp.name}_{trial.id}/' \
--log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log'
--log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log' \
--tmp-folder ${SLURM_TMPDIR}
3 changes: 3 additions & 0 deletions {{cookiecutter.project_slug}}/examples/slurm_mila/config.yaml
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 10
@@ -16,4 +16,4 @@

export MLFLOW_TRACKING_URI='mlruns'

main --data ../data --output output --config config.yaml --disable-progressbar
main --data ../data --output output --config config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 'orion~uniform(1,100,discrete=True)'
@@ -4,11 +4,13 @@ experiment:
max_trials: 2
working_dir:
orion_working_dir
algorithm:
random
algorithms:
random:
seed: 1234
evc:
non_monitored_arguments:
- output
- data
- tmp-folder
ignore_code_changes:
true
@@ -22,4 +22,5 @@ export ORION_DB_TYPE='pickleddb'
orion -v hunt --config orion_config.yaml \
main --data ../data --config config.yaml --disable-progressbar \
--output '{exp.working_dir}/{exp.name}_{trial.id}/' \
--log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log'
--log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log' \
--tmp-folder ${SLURM_TMPDIR}
@@ -37,11 +37,11 @@ def get_data(data_folder, prefix): # pragma: no cover
{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}


def load_data(args, hyper_params): # pragma: no cover
def load_data(data_dir, hyper_params): # pragma: no cover
"""Prepare the data into datasets.
Args:
args (list): arguments passed from the cli
data_dir (str): path to the folder containing the data
hyper_params (dict): hyper parameters from the config file
Returns:
@@ -51,9 +51,9 @@ def load_data(args, hyper_params): # pragma: no cover
"""
# __TODO__ load the data
train_examples, train_labels = get_data(args.data, 'train')
train_examples, train_labels = get_data(data_dir, 'train')
train_dataset = tf.data.Dataset.from_tensor_slices((train_examples, train_labels))
dev_examples, dev_labels = get_data(args.data, 'dev')
dev_examples, dev_labels = get_data(data_dir, 'dev')
dev_dataset = tf.data.Dataset.from_tensor_slices((dev_examples, dev_labels))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(batch_size=hyper_params['batch_size'])
dev_dataset = dev_dataset.batch(batch_size=hyper_params['batch_size'])
@@ -90,11 +90,11 @@ def __getitem__(self, index):
return input_example, target_example


def load_data(args, hyper_params): # pragma: no cover
def load_data(data_dir, hyper_params): # pragma: no cover
"""Prepare the data into datasets.
Args:
args (list): arguments passed from the cli
data_dir (str): path to the folder containing the data
hyper_params (dict): hyper parameters from the config file
Returns:
@@ -104,9 +104,9 @@ def load_data(args, hyper_params): # pragma: no cover
"""
# __TODO__ load the data
train_input, train_target = get_data(args.data, 'train')
train_input, train_target = get_data(data_dir, 'train')
train_data = MyDataset(train_input, train_target)
dev_input, dev_target = get_data(args.data, 'dev')
dev_input, dev_target = get_data(data_dir, 'dev')
dev_data = MyDataset(dev_input, dev_target)
train_loader = DataLoader(train_data, batch_size=hyper_params['batch_size'], shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=hyper_params['batch_size'], shuffle=False)
48 changes: 36 additions & 12 deletions {{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py
@@ -15,7 +15,9 @@
from {{cookiecutter.project_slug}}.models.model_loader import load_model
from {{cookiecutter.project_slug}}.models.model_loader import load_optimizer
from {{cookiecutter.project_slug}}.models.model_loader import load_loss
from {{cookiecutter.project_slug}}.utils.file_utils import rsync_folder
from {{cookiecutter.project_slug}}.utils.logging_utils import LoggerWriter, log_exp_details
from {{cookiecutter.project_slug}}.utils.reproducibility_utils import set_seed

logger = logging.getLogger(__name__)

@@ -35,6 +37,10 @@ def main():
help='config file with generic hyper-parameters, such as optimizer, '
'batch_size, ... - in yaml format')
parser.add_argument('--data', help='path to data', required=True)
parser.add_argument('--tmp-folder',
help='will use this folder as the working folder - it will copy the input data '
'here, generate results here, and then copy them back to the output '
'folder')
parser.add_argument('--output', help='path to outputs - will store files here', required=True)
parser.add_argument('--disable-progressbar', action='store_true',
help='will disable the progressbar while going over the mini-batch')
@@ -43,7 +49,21 @@
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()

logging.basicConfig(level=logging.INFO)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

if not os.path.exists(args.output):
os.makedirs(args.output)

if args.tmp_folder is not None:
data_folder_name = os.path.basename(os.path.normpath(args.data))
rsync_folder(args.data, args.tmp_folder)
data_dir = os.path.join(args.tmp_folder, data_folder_name)
output_dir = os.path.join(args.tmp_folder, 'output')
if not os.path.exists(output_dir):
os.makedirs(output_dir)
else:
data_dir = args.data
output_dir = args.output

# will log to a file if provided (useful for orion on cluster)
if args.log is not None:
@@ -72,37 +92,41 @@ def main():
mlflow.start_run(run_id=mlflow_run_id)
else:
mlflow.start_run()
run(args, hyper_params)
run(args, data_dir, output_dir, hyper_params)
mlflow.end_run()

if args.tmp_folder is not None:
rsync_folder(output_dir + os.path.sep, args.output)


def run(args, hyper_params):
def run(args, data_dir, output_dir, hyper_params):
"""Setup and run the dataloaders, training loops, etc.
Args:
args (list): arguments passed from the cli
data_dir (str): path to input folder
output_dir (str): path to output folder
hyper_params (dict): hyper parameters from the config file
"""
log_exp_details(os.path.realpath(__file__), args)

if not os.path.exists(args.output):
os.makedirs(args.output)

# __TODO__ change the hparam that are used from the training algorithm
# (and NOT the model - these will be specified in the model itself)
logger.info('List of hyper-parameters:')
check_and_log_hp(
['batch_size', 'optimizer', 'patience', 'architecture', 'max_epoch',
'exp_name'],
['architecture', 'batch_size', 'exp_name', 'max_epoch', 'optimizer', 'patience', 'seed'],
hyper_params)

train_loader, dev_loader = load_data(args, hyper_params)
if hyper_params["seed"] is not None:
set_seed(hyper_params["seed"])

log_exp_details(os.path.realpath(__file__), args)

train_loader, dev_loader = load_data(data_dir, hyper_params)
model = load_model(hyper_params)
optimizer = load_optimizer(hyper_params, model)
loss_fun = load_loss(hyper_params)

train(model, optimizer, loss_fun, train_loader, dev_loader, hyper_params['patience'],
args.output, max_epoch=hyper_params['max_epoch'],
output_dir, max_epoch=hyper_params['max_epoch'],
use_progress_bar=not args.disable_progressbar, start_from_scratch=args.start_from_scratch)


18 changes: 18 additions & 0 deletions {{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/file_utils.py
@@ -0,0 +1,18 @@
import logging
import os
import subprocess

logger = logging.getLogger(__name__)


def rsync_folder(source, target): # pragma: no cover
"""Uses rsync to copy the content of source into target.
:param source: (str) path to the source folder.
:param target: (str) path to the target folder.
"""
if not os.path.exists(target):
os.makedirs(target)

logger.info('rsyncing {} to {}'.format(source, target))
subprocess.check_call(["rsync", "-avzq", source, target])
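
For context, a sketch of the --tmp-folder round trip that main.py drives with this helper; the scratch path below is illustrative:

from {{cookiecutter.project_slug}}.utils.file_utils import rsync_folder

rsync_folder('../data', '/tmp/scratch')  # stage the input data on local scratch disk
# ... training writes its results under /tmp/scratch/output ...
rsync_folder('/tmp/scratch/output/', 'output')  # copy the results back

The trailing separator on the source (output_dir + os.path.sep in main.py) makes rsync copy the folder's contents rather than the folder itself.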
27 changes: 27 additions & 0 deletions {{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/reproducibility_utils.py
@@ -0,0 +1,27 @@
import random

import numpy as np
{%- if cookiecutter.dl_framework == 'pytorch' %}
import torch
{%- endif %}
{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}
import tensorflow as tf
{%- endif %}


def set_seed(seed): # pragma: no cover
"""Set the provided seed in python/numpy/DL framework.
:param seed: (int) the seed
"""
random.seed(seed)
np.random.seed(seed)
{%- if cookiecutter.dl_framework == 'pytorch' %}
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
{%- endif %}
{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}
tf.random.set_seed(seed)
{%- endif %}
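
A quick sanity check of what set_seed provides, sketched for the pytorch flavour of the template (the tensor draw simply stands in for any seeded random operation):

import torch

from {{cookiecutter.project_slug}}.utils.reproducibility_utils import set_seed

set_seed(1234)
first_draw = torch.rand(3)
set_seed(1234)
second_draw = torch.rand(3)
assert torch.equal(first_draw, second_draw), 'same seed must yield the same draws'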
