release 1.1 (#9)
* feat: add option to specify a temporary folder for the experiment. (#5)

* added option to rsync input and output data

* added docstring

* logging to stdout now

* fixed script for clusters - now using slurm tmpdir to write temp results

* fixing travis

* added missing docstring

* fixed tensorflow part (method signature change)

* renamed variables

* Seed for reproducibility (#6)

* added option to rsync input and output data

* added docstring

* logging to stdout now

* fixed script for clusters - now using slurm tmpdir to write temp results

* fixing travis

* added missing docstring

* fixed tensorflow part (method signature change)

* added seed for pytorch

* fixed typo

* added comment on how to use seed

* fixed flake8

* added test on reproducibility

* removed pytorch part from tensorflow

* fixed cookiecutter syntax

* added check for tensorflow

* fixed typo in test file

* added command to set the seed in tensorflow

* fixed flake8 error

* fixed typos

* removed duplicate log

* typo in docstring

* better error message in test

* added test to check repro using Orion (#8)

* added test to check repro using Orion

* more log into travis

* more info to debug travis

* running two trials for orion

* added seed to orion

* added orion test to tensorflow part

* better log messages in travis
mirkobronzi authored Jul 29, 2020
1 parent 298c140 commit 26dee14
Showing 19 changed files with 174 additions and 32 deletions.
24 changes: 24 additions & 0 deletions tests/end2end_pytorch/run.sh
@@ -24,6 +24,30 @@ pytest .
# run the example
cd examples/local
sh run.sh
mv output output_OLD
# re-run the example to check reproducibility
sh run.sh
# check results are the same
DIFF_LINES=`grep "best_dev_metric" output*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | uniq | wc -l`
if [ ${DIFF_LINES} -gt 1 ]; then
echo "ERROR: two identical runs produced different output results - review seed implementation"
exit 1
else
echo "PASS: two identical runs produced the same output results."
fi

# run Orion
cd ../..
cd examples/local_orion
sh run.sh
mv orion_working_dir orion_working_dir_OLD
# re-run the example to check reproducibility
rm -fr orion_db*
sh run.sh
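# each run launches two trials, so up to two distinct metric values are legitimate;
# more than two means the same trial gave different results across the two runs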
DIFF_LINES=`grep "best_dev_metric" orion_working_dir*/*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | sort | uniq | wc -l`
if [ ${DIFF_LINES} -gt 2 ]; then
echo "ERROR: two identical Orion runs produced different output results - review seed implementation"
exit 1
else
echo "PASS: two identical Orion runs produced the same output results."
fi
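
Both checks reduce to counting distinct best_dev_metric values across the runs' stats files. As a minimal sketch, here is the same comparison in Python — assuming stats.yaml parses as YAML with a top-level best_dev_metric key (the grep/sed pipeline above does not need that assumption):

import glob
import yaml  # PyYAML

metrics = set()
for path in glob.glob('output*/stats.yaml'):
    with open(path) as stream:
        metrics.add(yaml.safe_load(stream)['best_dev_metric'])
if len(metrics) > 1:
    raise SystemExit('ERROR: two identical runs produced different output results')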
26 changes: 25 additions & 1 deletion tests/end2end_tensorflow/run.sh
@@ -23,6 +23,30 @@ pytest .
# run the examples
cd examples/local
sh run.sh
cd ../../
mv output output_OLD
# re-run the example to check reproducibility
sh run.sh
# check results are the same
DIFF_LINES=`grep "best_dev_metric" output*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | uniq | wc -l`
if [ ${DIFF_LINES} -gt 1 ]; then
echo "ERROR: two identical runs produced different output results - review seed implementation"
exit 1
else
echo "PASS: two identical runs produced the same output results."
fi

# run Orion
cd ../..
cd examples/local_orion
sh run.sh
mv orion_working_dir orion_working_dir_OLD
# re-run the example to check reproducibility
rm -fr orion_db*
sh run.sh
DIFF_LINES=`grep "best_dev_metric" orion_working_dir*/*/stats.yaml | sed 's@^.*best_dev_metric: @@g' | sort | uniq | wc -l`
if [ ${DIFF_LINES} -gt 2 ]; then
echo "ERROR: two identical Orion runs produced different output results - review seed implementation"
exit 1
else
echo "PASS: two identical Orion runs produced the same output results."
fi
3 changes: 3 additions & 0 deletions {{cookiecutter.project_slug}}/examples/local/config.yaml
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 10
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 'orion~uniform(1,10,discrete=True)'
@@ -1,11 +1,12 @@
experiment:
name:
my_exp
max_trials: 1
max_trials: 2
working_dir:
orion_working_dir
algorithm:
random
algorithms:
random:
seed: 1234
evc:
non_monitored_arguments:
- output
3 changes: 3 additions & 0 deletions {{cookiecutter.project_slug}}/examples/slurm_cc/config.yaml
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 10
@@ -13,4 +13,4 @@

export MLFLOW_TRACKING_URI='mlruns'

main --data ../data --output output --config config.yaml --disable-progressbar
main --data ../data --output output --config config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 'orion~uniform(1,100,discrete=True)'
@@ -4,11 +4,13 @@ experiment:
max_trials: 2
working_dir:
orion_working_dir
algorithm:
random
algorithms:
random:
seed: 1234
evc:
non_monitored_arguments:
- output
- data
- tmp-folder
ignore_code_changes:
true
@@ -19,4 +19,5 @@ export ORION_DB_TYPE='pickleddb'
orion -v hunt --config orion_config.yaml \
main --data ../data --config config.yaml --disable-progressbar \
--output '{exp.working_dir}/{exp.name}_{trial.id}/' \
--log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log'
--log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log' \
--tmp-folder ${SLURM_TMPDIR}
3 changes: 3 additions & 0 deletions {{cookiecutter.project_slug}}/examples/slurm_mila/config.yaml
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 10
@@ -16,4 +16,4 @@

export MLFLOW_TRACKING_URI='mlruns'

main --data ../data --output output --config config.yaml --disable-progressbar
main --data ../data --output output --config config.yaml --tmp-folder ${SLURM_TMPDIR} --disable-progressbar
@@ -6,6 +6,9 @@ patience: 5
architecture: my_model
max_epoch: 99
exp_name: my_exp_1
# set to null to avoid setting a seed (can speed up GPU computation, but
# results will not be reproducible)
seed: 1234

# architecture
size: 'orion~uniform(1,100,discrete=True)'
@@ -4,11 +4,13 @@ experiment:
max_trials: 2
working_dir:
orion_working_dir
algorithm:
random
algorithms:
random:
seed: 1234
evc:
non_monitored_arguments:
- output
- data
- tmp-folder
ignore_code_changes:
true
@@ -22,4 +22,5 @@ export ORION_DB_TYPE='pickleddb'
orion -v hunt --config orion_config.yaml \
main --data ../data --config config.yaml --disable-progressbar \
--output '{exp.working_dir}/{exp.name}_{trial.id}/' \
--log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log'
--log '{exp.working_dir}/{exp.name}_{trial.id}/exp.log' \
--tmp-folder ${SLURM_TMPDIR}
@@ -37,11 +37,11 @@ def get_data(data_folder, prefix): # pragma: no cover
{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}


def load_data(args, hyper_params): # pragma: no cover
def load_data(data_dir, hyper_params): # pragma: no cover
"""Prepare the data into datasets.
Args:
args (list): arguments passed from the cli
data_dir (str): path to the folder containing the data
hyper_params (dict): hyper parameters from the config file
Returns:
@@ -51,9 +51,9 @@ def load_data(args, hyper_params): # pragma: no cover
"""
# __TODO__ load the data
train_examples, train_labels = get_data(args.data, 'train')
train_examples, train_labels = get_data(data_dir, 'train')
train_dataset = tf.data.Dataset.from_tensor_slices((train_examples, train_labels))
dev_examples, dev_labels = get_data(args.data, 'dev')
dev_examples, dev_labels = get_data(data_dir, 'dev')
dev_dataset = tf.data.Dataset.from_tensor_slices((dev_examples, dev_labels))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(batch_size=hyper_params['batch_size'])
dev_dataset = dev_dataset.batch(batch_size=hyper_params['batch_size'])
@@ -90,11 +90,11 @@ def __getitem__(self, index):
return input_example, target_example


def load_data(args, hyper_params): # pragma: no cover
def load_data(data_dir, hyper_params): # pragma: no cover
"""Prepare the data into datasets.
Args:
args (list): arguments passed from the cli
data_dir (str): path to the folder containing the data
hyper_params (dict): hyper parameters from the config file
Returns:
@@ -104,9 +104,9 @@ def load_data(args, hyper_params): # pragma: no cover
"""
# __TODO__ load the data
train_input, train_target = get_data(args.data, 'train')
train_input, train_target = get_data(data_dir, 'train')
train_data = MyDataset(train_input, train_target)
dev_input, dev_target = get_data(args.data, 'dev')
dev_input, dev_target = get_data(data_dir, 'dev')
dev_data = MyDataset(dev_input, dev_target)
train_loader = DataLoader(train_data, batch_size=hyper_params['batch_size'], shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=hyper_params['batch_size'], shuffle=False)
48 changes: 36 additions & 12 deletions {{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py
@@ -15,7 +15,9 @@
from {{cookiecutter.project_slug}}.models.model_loader import load_model
from {{cookiecutter.project_slug}}.models.model_loader import load_optimizer
from {{cookiecutter.project_slug}}.models.model_loader import load_loss
from {{cookiecutter.project_slug}}.utils.file_utils import rsync_folder
from {{cookiecutter.project_slug}}.utils.logging_utils import LoggerWriter, log_exp_details
from {{cookiecutter.project_slug}}.utils.reproducibility_utils import set_seed

logger = logging.getLogger(__name__)

@@ -35,6 +37,10 @@ def main():
help='config file with generic hyper-parameters, such as optimizer, '
'batch_size, ... - in yaml format')
parser.add_argument('--data', help='path to data', required=True)
parser.add_argument('--tmp-folder',
help='will use this folder as the working folder - it will copy the input data '
'here, generate results here, and then copy them back to the output '
'folder')
parser.add_argument('--output', help='path to outputs - will store files here', required=True)
parser.add_argument('--disable-progressbar', action='store_true',
help='will disable the progressbar while going over the mini-batch')
@@ -43,7 +49,21 @@
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()

logging.basicConfig(level=logging.INFO)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

if not os.path.exists(args.output):
os.makedirs(args.output)

if args.tmp_folder is not None:
data_folder_name = os.path.basename(os.path.normpath(args.data))
rsync_folder(args.data, args.tmp_folder)
data_dir = os.path.join(args.tmp_folder, data_folder_name)
output_dir = os.path.join(args.tmp_folder, 'output')
if not os.path.exists(output_dir):
os.makedirs(output_dir)
else:
data_dir = args.data
output_dir = args.output

# will log to a file if provided (useful for orion on cluster)
if args.log is not None:
@@ -72,37 +92,41 @@ def main():
mlflow.start_run(run_id=mlflow_run_id)
else:
mlflow.start_run()
run(args, hyper_params)
run(args, data_dir, output_dir, hyper_params)
mlflow.end_run()

if args.tmp_folder is not None:
rsync_folder(output_dir + os.path.sep, args.output)


def run(args, hyper_params):
def run(args, data_dir, output_dir, hyper_params):
"""Setup and run the dataloaders, training loops, etc.
Args:
args (list): arguments passed from the cli
data_dir (str): path to input folder
output_dir (str): path to output folder
hyper_params (dict): hyper parameters from the config file
"""
log_exp_details(os.path.realpath(__file__), args)

if not os.path.exists(args.output):
os.makedirs(args.output)

# __TODO__ change the hparam that are used from the training algorithm
# (and NOT the model - these will be specified in the model itself)
logger.info('List of hyper-parameters:')
check_and_log_hp(
['batch_size', 'optimizer', 'patience', 'architecture', 'max_epoch',
'exp_name'],
['architecture', 'batch_size', 'exp_name', 'max_epoch', 'optimizer', 'patience', 'seed'],
hyper_params)

train_loader, dev_loader = load_data(args, hyper_params)
if hyper_params["seed"] is not None:
set_seed(hyper_params["seed"])

log_exp_details(os.path.realpath(__file__), args)

train_loader, dev_loader = load_data(data_dir, hyper_params)
model = load_model(hyper_params)
optimizer = load_optimizer(hyper_params, model)
loss_fun = load_loss(hyper_params)

train(model, optimizer, loss_fun, train_loader, dev_loader, hyper_params['patience'],
args.output, max_epoch=hyper_params['max_epoch'],
output_dir, max_epoch=hyper_params['max_epoch'],
use_progress_bar=not args.disable_progressbar, start_from_scratch=args.start_from_scratch)


18 changes: 18 additions & 0 deletions {{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/file_utils.py
@@ -0,0 +1,18 @@
import logging
import os
import subprocess

logger = logging.getLogger(__name__)


def rsync_folder(source, target): # pragma: no cover
"""Uses rsync to copy the content of source into target.
:param source: (str) path to the source folder.
:param target: (str) path to the target folder.
"""
if not os.path.exists(target):
os.makedirs(target)

logger.info('rsyncing {} to {}'.format(source, target))
subprocess.check_call(["rsync", "-avzq", source, target])
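
For context, a sketch of the --tmp-folder round trip that main.py drives with this helper; the scratch path below is illustrative:

from {{cookiecutter.project_slug}}.utils.file_utils import rsync_folder

rsync_folder('../data', '/tmp/scratch')  # stage the input data on local scratch disk
# ... training writes its results under /tmp/scratch/output ...
rsync_folder('/tmp/scratch/output/', 'output')  # copy the results back

The trailing separator on the source (output_dir + os.path.sep in main.py) makes rsync copy the folder's contents rather than the folder itself.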
27 changes: 27 additions & 0 deletions {{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/reproducibility_utils.py
@@ -0,0 +1,27 @@
import random

import numpy as np
{%- if cookiecutter.dl_framework == 'pytorch' %}
import torch
{%- endif %}
{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}
import tensorflow as tf
{%- endif %}


def set_seed(seed): # pragma: no cover
"""Set the provided seed in python/numpy/DL framework.
:param seed: (int) the seed
"""
random.seed(seed)
np.random.seed(seed)
{%- if cookiecutter.dl_framework == 'pytorch' %}
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
{%- endif %}
{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %}
tf.random.set_seed(seed)
{%- endif %}
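
A quick sanity check of what set_seed provides, sketched for the pytorch flavour of the template (the tensor draw simply stands in for any seeded random operation):

import torch

from {{cookiecutter.project_slug}}.utils.reproducibility_utils import set_seed

set_seed(1234)
first_draw = torch.rand(3)
set_seed(1234)
second_draw = torch.rand(3)
assert torch.equal(first_draw, second_draw), 'same seed must yield the same draws'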
