From f1398501b2457af0dd0bec1beac62c5cf8352e8e Mon Sep 17 00:00:00 2001
From: Mirko Bronzi
Date: Fri, 28 May 2021 09:56:31 -0400
Subject: [PATCH] Release 2.0 (#33)

* feat: add option to specify a temporary folder for the experiment. (#5)
* added option to rsync input and output data
* added docstring
* logging to stdout now
* fixed script for clusters - now using slurm tmpdir to write temp results
* fixing travis
* added missing docstring
* fixed tensorflow part (method signature change)
* renamed variables
* Seed for reproducibility (#6)
* added option to rsync input and output data
* added docstring
* logging to stdout now
* fixed script for clusters - now using slurm tmpdir to write temp results
* fixing travis
* added missing docstring
* fixed tensorflow part (method signature change)
* added seed for pytorch
* fixed typo
* added comment on how to use seed
* fixed flake8
* added test on reproducibility
* removed pytorch part from tensorflow
* fixed cookiecutter syntax
* added check for tensorflow
* fixed typo in test file
* added command to set the seed in tensorflow
* fixed flake8 error
* fixed typos
* removed duplicate log
* typo in docstring
* better error message in test
* added test to check repro using Orion (#8)
* added test to check repro using Orion
* more log into travis
* more info to debug travis
* running two trials for orion
* added seed to orion
* added orion test to tensorflow part
* better log messages in travis
* Add support for keras and PyTorch Lightning (#12)
* added code for keras - still need to complete all tests
* fixed flake8
* started adding PyTorch Lightning support - note that mlflow and loading/saving model still does not work
* fixed api change
* fixed pytorch early stopping
* fixed flake8
* fixed flake8 for pytorch version
* fixed keras part for flake8
* added code to resume a model - for pytorch lightning
* removed forgotten diff
* fixed start_from_scratch (not loading a model even if present) / now printing the val loss in the logs
* pytorch lightning now correctly logging under the same run
* now pytorch is correctly resuming training and continues to plot in the same mlflow run
* added github actions
* using a different ubuntu image
* printing folder - trying to fix github actions
* telling git who I am..
* removed not useful test
* fixed typo in test folder
* removed travis configuration - using github actions from now on
* correctly handling the saved models in pytorch
* now passing the full hyper-parameter object to train_impl method (for more flexibility).
* added option to ask for gpus in pytorch
* improved error message
* Fixups for the lightning_and_keras PR (#12) (#22)
* Update torch model to pl-lightning model
* Refactor train+model impl w/ optim module
* Refactor data loader w/ data module for plightning
* removing codecov from cookiecutter. (#24)
* moving to github actions (#25)
* removing coverage computation
* moving from travis to github actions.
* setting fake name/email for git.
* removed (not-correct) duplicate for github actions config file.
* fixing tests.
* refactored pytorch models. (#26)

Co-authored-by: Mirko Bronzi

* running CI also on develop.

Co-authored-by: Pierre-Luc St-Charles

* Adding more CI backends. (#27)
* added github actions.
* moved python version to 3.9 - by default.
* added support for azure continuous integration.
* updated mlflow/orion dependencies.
* now correctly restoring models for pytorch. (#28)
* Now running test-coverage locally. (#30)
* running test coverage locally.
* fixed project name.
* correctly allowing mlflow to work in any folder. (#29) * removed duplicate CI. Co-authored-by: Pierre-Luc St-Charles --- .github/workflows/{tests.yml => ci.yml} | 6 +- README.md | 1 - cookiecutter.json | 2 +- tests/end2end_pytorch/run.sh | 2 +- tests/end2end_tensorflow/run.sh | 4 +- .../.azure_pipeline.yml | 27 + .../.github/workflows/tests.yml | 39 ++ {{cookiecutter.project_slug}}/.travis.yml | 4 +- {{cookiecutter.project_slug}}/README.md | 31 +- .../docs/usage/quickstart.md | 4 +- .../examples/local/run.sh | 2 +- {{cookiecutter.project_slug}}/setup.py | 23 +- .../tests/test_train.py | 11 - .../data/data_loader.py | 83 ++- .../{{cookiecutter.project_slug}}/main.py | 76 ++- .../models/model_loader.py | 65 --- .../models/my_model.py | 69 ++- .../models/optim.py | 66 +++ .../{{cookiecutter.project_slug}}/train.py | 489 +++++++++--------- .../utils/hp_utils.py | 9 +- 20 files changed, 637 insertions(+), 376 deletions(-) rename .github/workflows/{tests.yml => ci.yml} (89%) create mode 100644 {{cookiecutter.project_slug}}/.azure_pipeline.yml create mode 100644 {{cookiecutter.project_slug}}/.github/workflows/tests.yml delete mode 100644 {{cookiecutter.project_slug}}/tests/test_train.py create mode 100644 {{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/optim.py diff --git a/.github/workflows/tests.yml b/.github/workflows/ci.yml similarity index 89% rename from .github/workflows/tests.yml rename to .github/workflows/ci.yml index 04ce39b..85110e6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/ci.yml @@ -5,18 +5,20 @@ on: push: branches: - master + - develop pull_request: branches: - master + - develop jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: python-3.7 + - name: python-3.8 uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.8 - name: install-dependencies run: | python -m pip install --upgrade pip diff --git a/README.md b/README.md index a537539..2674d52 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@ A cookiecutter is a generic project template that will instantiate a new project * Pytorch/Tensorflow * Travis CI -* Codecov * Sphinx (documentation) * MLFlow (experiment management) * Orion (hyperparameter optimization) diff --git a/cookiecutter.json b/cookiecutter.json index 332a1f5..3e6909c 100644 --- a/cookiecutter.json +++ b/cookiecutter.json @@ -5,7 +5,7 @@ "project_name": "Wonderful Project", "project_slug": "{{ cookiecutter.project_name.lower().replace(' ', '_').replace('-', '_') }}", "project_short_description": "{{ cookiecutter.project_name }} is wonderful!", - "python_version": "3.7", + "python_version": "3.8", "dl_framework": ["pytorch", "tensorflow_cpu", "tensorflow_gpu"], "pypi_username": "{{ cookiecutter.github_username }}", "version": "0.0.1", diff --git a/tests/end2end_pytorch/run.sh b/tests/end2end_pytorch/run.sh index e8383f7..7b432a7 100755 --- a/tests/end2end_pytorch/run.sh +++ b/tests/end2end_pytorch/run.sh @@ -25,7 +25,7 @@ pip freeze sh config/hooks/pre-commit # run tests -pytest . +pytest --cov=wonderful_project # run the example cd examples/local diff --git a/tests/end2end_tensorflow/run.sh b/tests/end2end_tensorflow/run.sh index ae5fc5e..41c17ab 100755 --- a/tests/end2end_tensorflow/run.sh +++ b/tests/end2end_tensorflow/run.sh @@ -14,8 +14,6 @@ git add -A git commit -m "initial commit" pip install -e . 
--quiet pip install flake8 pytest --quiet -# necessary cause tf dependencies are sometimes not updated -pip install -U setuptools numpy six --quiet # print all dependencies pip freeze @@ -24,7 +22,7 @@ pip freeze sh config/hooks/pre-commit # run tests -pytest . +pytest --cov=wonderful_project # run the examples cd examples/local diff --git a/{{cookiecutter.project_slug}}/.azure_pipeline.yml b/{{cookiecutter.project_slug}}/.azure_pipeline.yml new file mode 100644 index 0000000..f1191b1 --- /dev/null +++ b/{{cookiecutter.project_slug}}/.azure_pipeline.yml @@ -0,0 +1,27 @@ +jobs: +- job: + pool: + vmImage: 'ubuntu-16.04' + strategy: + matrix: + Python: + python.version: '{{ cookiecutter.python_version }}' + + steps: + - task: UsePythonVersion@0 + displayName: 'Use Python $(python.version)' + inputs: + versionSpec: '$(python.version)' + + - script: pip install -e . + displayName: 'Install dependencies' + + - script: sh config/hooks/pre-commit + displayName: 'Running commit hook' + + - script: pytest --cov={{cookiecutter.project_slug}} + displayName: 'Run pytest and display test coverage' + + - script: sh run.sh + workingDirectory: examples/local + displayName: 'Run single toy experiment' diff --git a/{{cookiecutter.project_slug}}/.github/workflows/tests.yml b/{{cookiecutter.project_slug}}/.github/workflows/tests.yml new file mode 100644 index 0000000..7f65188 --- /dev/null +++ b/{{cookiecutter.project_slug}}/.github/workflows/tests.yml @@ -0,0 +1,39 @@ +name: unit-tests +on: + # Trigger the workflow on push or pull request, + # but only for the main/develop branch + push: + branches: + - master + - develop + pull_request: + branches: + - master + - develop +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: python-{{ cookiecutter.python_version }} + uses: actions/setup-python@v2 + with: + python-version: {{ cookiecutter.python_version }} + - name: install-dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + pip install -e . + - name: flake8 + run: | + config/hooks/pre-commit + - name: print env + run: | + env + - name: pytest_and_coverage + run: | + pytest --cov={{cookiecutter.project_slug}} + - name: end2end-toyexp + run: | + cd examples/local + sh run.sh diff --git a/{{cookiecutter.project_slug}}/.travis.yml b/{{cookiecutter.project_slug}}/.travis.yml index d9936a2..8f22eb4 100644 --- a/{{cookiecutter.project_slug}}/.travis.yml +++ b/{{cookiecutter.project_slug}}/.travis.yml @@ -1,6 +1,6 @@ language: python python: - - "3.7" + - "{{ cookiecutter.python_version }}" cache: pip install: # Reducing verbosity is needed because of Travis' limit on log length @@ -9,4 +9,4 @@ script: # run flake8 with exactly the same options as in the commit hook: - config/hooks/pre-commit # run tests and compute the coverage - - pytest + - pytest --cov={{cookiecutter.project_slug}} diff --git a/{{cookiecutter.project_slug}}/README.md b/{{cookiecutter.project_slug}}/README.md index f83a13a..4180275 100644 --- a/{{cookiecutter.project_slug}}/README.md +++ b/{{cookiecutter.project_slug}}/README.md @@ -50,15 +50,40 @@ link your local git to the remote project, which should look like this: git remote add origin git@github.com:{{ cookiecutter.github_username }}/{{ cookiecutter.project_slug }}.git git push -u origin master -### Add Travis -A travis configuration file (`.travis.yml`) is already in your repository (so, no need to -create it). This will run `flake8` and run the tests under `tests`. 
+### Setup Continuous Integration + +Continuous integration will run the following: +- Unit tests under `tests`. +- End-to-end test under `examples/local`. +- `flake8` to check the code syntax. +- Checks on documentation presence and format (using `sphinx`). + +We support the following Continuous Integration providers. +Check the following instructions for more details. + +#### GitHub Actions + +Github actions are already configured in `.github/workflows/tests.yml`. +Github actions are already enabled by default when using Github, so, when +pushing to github, they will be executed automatically for pull requests to +`master` and to `develop`. + +#### Travis + +Travis is already configured in (`.travis.yml`). To enable it server-side, just go to https://travis-ci.com/account/repositories and click ` Manage repositories on GitHub`. Give the permission to run on the git repository you just created. Note, the link for public project may be https://travis-ci.org/account/repositories . +#### Azure + +Azure Continuous Integration is already configured in (`.azure_pipeline.yml`). + +To enable it server-side, go to Azure and select `.azure_pipeline.yml` as the +configuration file for Continuous Integration. + ## Running the code ### Run the tests diff --git a/{{cookiecutter.project_slug}}/docs/usage/quickstart.md b/{{cookiecutter.project_slug}}/docs/usage/quickstart.md index 5b5a0b6..1b78411 100644 --- a/{{cookiecutter.project_slug}}/docs/usage/quickstart.md +++ b/{{cookiecutter.project_slug}}/docs/usage/quickstart.md @@ -14,12 +14,12 @@ Everybody loves Schrodinger's equation, why not put it everywhere? You can also add math directly in your docstrings! For an example, click at the docstrings here: ```eval_rst -:py:meth:`{{cookiecutter.project_slug}}.models.model_loader.load_loss` +:py:meth:`{{cookiecutter.project_slug}}.models.optim.load_loss` ``` You can even reference them directly anywhere for convenience, because clicking is for the lazy: ```eval_rst -.. autoclass:: {{cookiecutter.project_slug}}.models.model_loader.load_loss +.. 
autoclass:: {{cookiecutter.project_slug}}.models.optim.load_loss :show-inheritance: :noindex: ``` diff --git a/{{cookiecutter.project_slug}}/examples/local/run.sh b/{{cookiecutter.project_slug}}/examples/local/run.sh index c210988..5d9e811 100644 --- a/{{cookiecutter.project_slug}}/examples/local/run.sh +++ b/{{cookiecutter.project_slug}}/examples/local/run.sh @@ -1 +1 @@ -main --data ../data --output output --config config.yaml --disable-progressbar +main --data ../data --output output --config config.yaml --start-from-scratch diff --git a/{{cookiecutter.project_slug}}/setup.py b/{{cookiecutter.project_slug}}/setup.py index f97fe73..8afcb72 100644 --- a/{{cookiecutter.project_slug}}/setup.py +++ b/{{cookiecutter.project_slug}}/setup.py @@ -7,15 +7,22 @@ packages=find_packages(include=['{{ cookiecutter.project_slug }}', '{{ cookiecutter.project_slug }}.*']), python_requires='>={{ cookiecutter.python_version }}', install_requires=[ + {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} + 'numpy==1.19.2', + 'scipy==1.4.1', + 'setuptools>=41.0.0', + 'six>=1.15.0', + {%- endif %} 'flake8', 'flake8-docstrings', 'gitpython', 'tqdm', 'jupyter', - 'mlflow==1.10.0', - 'orion>=0.1.8', + 'mlflow==1.15.0', + 'orion>=0.1.14', 'pyyaml>=5.3', 'pytest>=4.6', + 'pytest-cov', 'sphinx', 'sphinx-autoapi', 'sphinx-rtd-theme', @@ -23,19 +30,13 @@ 'sphinxcontrib-katex', 'recommonmark', {%- if cookiecutter.dl_framework == 'pytorch' %} - 'torch'], + 'torch==1.8.1', 'pytorch_lightning==1.2.7'], {%- endif %} {%- if cookiecutter.dl_framework == 'tensorflow_cpu' %} - 'tensorflow==2.2.0', + 'tensorflow==2.4.0'], {%- endif %} {%- if cookiecutter.dl_framework == 'tensorflow_gpu' %} - 'tensorflow-gpu==2.2.0' - {%- endif %} - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - 'scipy==1.4.1', - 'setuptools>=41.0.0', - 'six>=1.12.0', - 'numpy>=1.19.4'], + 'tensorflow-gpu==2.4.0'], {%- endif %} entry_points={ 'console_scripts': [ diff --git a/{{cookiecutter.project_slug}}/tests/test_train.py b/{{cookiecutter.project_slug}}/tests/test_train.py deleted file mode 100644 index 4bd035c..0000000 --- a/{{cookiecutter.project_slug}}/tests/test_train.py +++ /dev/null @@ -1,11 +0,0 @@ -from {{cookiecutter.project_slug}}.train import write_stats, load_stats - - -def test_write_and_load_stats__simple(tmp_path): - best_eval_score = 1.0 - epoch = 1 - remaining_patience = 2 - mlflow_run_id = 'NO_MLFLOW' - write_stats(tmp_path, best_eval_score, epoch, remaining_patience) - result = load_stats(tmp_path) - assert (best_eval_score, epoch, remaining_patience, mlflow_run_id) == result diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/data/data_loader.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/data/data_loader.py index 42916c9..3241fd3 100644 --- a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/data/data_loader.py +++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/data/data_loader.py @@ -1,18 +1,23 @@ import os +import typing -import numpy +import numpy as np {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} import tensorflow as tf BUFFER_SIZE = 100 {%- endif %} {%- if cookiecutter.dl_framework == 'pytorch' %} +import pytorch_lightning as pl from torch.utils.data import Dataset, DataLoader {%- endif %} # __TODO__ change the dataloader to suit your needs... 
-def get_data(data_folder, prefix): # pragma: no cover +def get_data( + data_folder: typing.AnyStr, + prefix: typing.AnyStr +) -> typing.Tuple[np.ndarray, np.ndarray]: # pragma: no cover """Function to load data into memory. Args: @@ -27,12 +32,12 @@ def get_data(data_folder, prefix): # pragma: no cover with open(os.path.join(data_folder, '{}.input'.format(prefix))) as in_stream: for line in in_stream: inputs.append([float(x) for x in line.split()]) - in_data = numpy.array(inputs, dtype=numpy.float32) + in_data = np.array(inputs, dtype=np.float32) targets = [] with open(os.path.join(data_folder, '{}.target'.format(prefix))) as in_stream: for line in in_stream: targets.append(float(line)) - tar_data = numpy.array(targets, dtype=numpy.float32) + tar_data = np.array(targets, dtype=np.float32) return in_data, tar_data {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} @@ -65,7 +70,11 @@ def load_data(data_dir, hyper_params): # pragma: no cover class MyDataset(Dataset): # pragma: no cover """Dataset class for iterating over the data.""" - def __init__(self, input_data, target_data): + def __init__( + self, + input_data: np.ndarray, + target_data: np.ndarray, + ): """Initialize MyDataset. Args: @@ -79,7 +88,10 @@ def __len__(self): """Return the number of data items in MyDataset.""" return len(self.input_data) - def __getitem__(self, index): + def __getitem__( + self, + index: int, + ): """__getitem__. Args: @@ -90,6 +102,48 @@ def __getitem__(self, index): return input_example, target_example +class MyDataModule(pl.LightningDataModule): # pragma: no cover + """Data module class that prepares dataset parsers and instantiates data loaders.""" + + def __init__( + self, + data_dir: typing.AnyStr, + hyper_params: typing.Dict[typing.AnyStr, typing.Any], + ): + """Validates the hyperparameter config dictionary and sets up internal attributes.""" + super().__init__() + self.data_dir = data_dir + self.batch_size = hyper_params['batch_size'] + self.train_data_parser, self.dev_data_parser = None, None + + def prepare_data(self): + """Downloads/extracts/unpacks the data if needed (we don't).""" + pass + + def setup(self, stage=None): + """Parses and splits all samples across the train/valid/test parsers.""" + # here, we will actually assign train/val datasets for use in dataloaders + if stage == 'fit' or stage is None: + train_input, train_target = get_data(self.data_dir, 'train') + self.train_data_parser = MyDataset(train_input, train_target) + dev_input, dev_target = get_data(self.data_dir, 'dev') + self.dev_data_parser = MyDataset(dev_input, dev_target) + if stage == 'test' or stage is None: + raise NotImplementedError # __TODO__: add code to instantiate the test data parser here + + def train_dataloader(self) -> DataLoader: + """Creates the training dataloader using the training data parser.""" + return DataLoader(self.train_data_parser, batch_size=self.batch_size, shuffle=True) + + def val_dataloader(self): + """Creates the validation dataloader using the validation data parser.""" + return DataLoader(self.dev_data_parser, batch_size=self.batch_size, shuffle=False) + + def test_dataloader(self): + """Creates the testing dataloader using the testing data parser.""" + raise NotImplementedError # __TODO__: add code to instantiate the test data loader here + + def load_data(data_dir, hyper_params): # pragma: no cover """Prepare the data into datasets. 
@@ -97,18 +151,9 @@ def load_data(data_dir, hyper_params): # pragma: no cover data_dir (str): path to the folder containing the data hyper_params (dict): hyper parameters from the config file - Retruns: - train_dataset (obj): iterable training dataset object - dev_dataset (obj): iterable validation dataset object - - + Returns: + datamodule (obj): the data module used to prepare/instantiate data loaders. """ - # __TODO__ load the data - train_input, train_target = get_data(data_dir, 'train') - train_data = MyDataset(train_input, train_target) - dev_input, dev_target = get_data(data_dir, 'dev') - dev_data = MyDataset(dev_input, dev_target) - train_loader = DataLoader(train_data, batch_size=hyper_params['batch_size'], shuffle=True) - dev_loader = DataLoader(dev_data, batch_size=hyper_params['batch_size'], shuffle=False) - return train_loader, dev_loader + # __TODO__ if you have different data modules, add whatever code is needed to select them here + return MyDataModule(data_dir, hyper_params) {%- endif %} diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py index 0beb8f0..9f22023 100755 --- a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py +++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/main.py @@ -10,12 +10,24 @@ import yaml from yaml import load +{%- if cookiecutter.dl_framework == 'pytorch' %} +from pytorch_lightning.loggers import MLFlowLogger +{%- endif %} from {{cookiecutter.project_slug}}.data.data_loader import load_data -from {{cookiecutter.project_slug}}.train import train, load_stats, STAT_FILE_NAME +{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} +from {{cookiecutter.project_slug}}.train import train, load_stats +{%- endif %} +from {{cookiecutter.project_slug}}.train import STAT_FILE_NAME +{%- if cookiecutter.dl_framework == 'pytorch' %} +from {{cookiecutter.project_slug}}.train import load_mlflow +from {{cookiecutter.project_slug}}.train import train +{%- endif %} from {{cookiecutter.project_slug}}.utils.hp_utils import check_and_log_hp from {{cookiecutter.project_slug}}.models.model_loader import load_model -from {{cookiecutter.project_slug}}.models.model_loader import load_optimizer -from {{cookiecutter.project_slug}}.models.model_loader import load_loss +{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} +from {{cookiecutter.project_slug}}.models.optim import load_optimizer +from {{cookiecutter.project_slug}}.models.optim import load_loss +{%- endif %} from {{cookiecutter.project_slug}}.utils.file_utils import rsync_folder from {{cookiecutter.project_slug}}.utils.logging_utils import LoggerWriter, log_exp_details from {{cookiecutter.project_slug}}.utils.reproducibility_utils import set_seed @@ -47,6 +59,11 @@ def main(): help='will disable the progressbar while going over the mini-batch') parser.add_argument('--start-from-scratch', action='store_true', help='will not load any existing saved model - even if present') +{%- if cookiecutter.dl_framework == 'pytorch' %} + parser.add_argument('--gpus', default=None, + help='list of GPUs to use. If not specified, runs on CPU.' + 'Example of GPU usage: 1 means run on GPU 1, 0 on GPU 0.') +{%- endif %} parser.add_argument('--debug', action='store_true') args = parser.parse_args() @@ -86,12 +103,33 @@ def main(): hyper_params = {} # to be done as soon as possible otherwise mlflow will not log with the proper exp. 
name - if orion.client.cli.IS_ORION_ON: exp_name = os.getenv('ORION_EXPERIMENT_NAME', 'orion_exp') + {%- if cookiecutter.dl_framework == 'pytorch' %} + tags = {'mlflow.runName': os.getenv('ORION_TRIAL_ID')} + {%- endif %} else: exp_name = hyper_params.get('exp_name', 'exp') + {%- if cookiecutter.dl_framework == 'pytorch' %} + tags = {} + {%- endif %} mlflow.set_experiment(exp_name) + {%- if cookiecutter.dl_framework == 'pytorch' %} + save_dir = os.getenv('MLFLOW_TRACKING_URI', './mlruns') + mlf_logger = MLFlowLogger( + experiment_name=exp_name, + tags=tags, + save_dir=save_dir + ) + + if os.path.exists(os.path.join(args.output, STAT_FILE_NAME)): + mlf_logger._run_id = load_mlflow(args.output) + + mlflow.start_run(run_id=mlf_logger.run_id) + run(args, data_dir, output_dir, hyper_params, mlf_logger) + mlflow.end_run() + {%- endif %} + {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} if os.path.exists(os.path.join(args.output, STAT_FILE_NAME)): _, _, _, mlflow_run_id = load_stats(args.output) @@ -100,19 +138,30 @@ def main(): mlflow.start_run() run(args, data_dir, output_dir, hyper_params) mlflow.end_run() + {%- endif %} if args.tmp_folder is not None: rsync_folder(output_dir + os.path.sep, args.output) +{%- if cookiecutter.dl_framework == 'pytorch' %} + + +def run(args, data_dir, output_dir, hyper_params, mlf_logger): +{%- endif %} +{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} def run(args, data_dir, output_dir, hyper_params): +{%- endif %} """Setup and run the dataloaders, training loops, etc. Args: - args (list): arguments passed from the cli + args (object): arguments passed from the cli data_dir (str): path to input folder output_dir (str): path to output folder hyper_params (dict): hyper parameters from the config file +{%- if cookiecutter.dl_framework == 'pytorch' %} + mlf_logger (obj): MLFlow logger callback. 
+{%- endif %} """ # __TODO__ change the hparam that are used from the training algorithm # (and NOT the model - these will be specified in the model itself) @@ -126,14 +175,25 @@ def run(args, data_dir, output_dir, hyper_params): log_exp_details(os.path.realpath(__file__), args) +{%- if cookiecutter.dl_framework == 'pytorch' %} + datamodule = load_data(data_dir, hyper_params) + model = load_model(hyper_params) + + train(model=model, datamodule=datamodule, output=output_dir, hyper_params=hyper_params, + use_progress_bar=not args.disable_progressbar, start_from_scratch=args.start_from_scratch, + mlf_logger=mlf_logger, gpus=args.gpus) +{%- endif %} +{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} train_loader, dev_loader = load_data(data_dir, hyper_params) model = load_model(hyper_params) optimizer = load_optimizer(hyper_params, model) loss_fun = load_loss(hyper_params) - train(model, optimizer, loss_fun, train_loader, dev_loader, hyper_params['patience'], - output_dir, max_epoch=hyper_params['max_epoch'], - use_progress_bar=not args.disable_progressbar, start_from_scratch=args.start_from_scratch) + train(model=model, optimizer=optimizer, loss_fun=loss_fun, train_loader=train_loader, + dev_loader=dev_loader, output=output_dir, hyper_params=hyper_params, + use_progress_bar=not args.disable_progressbar, + start_from_scratch=args.start_from_scratch) +{%- endif %} if __name__ == '__main__': diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/model_loader.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/model_loader.py index 8320579..2711334 100644 --- a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/model_loader.py +++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/model_loader.py @@ -1,13 +1,5 @@ import logging -{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} -import tensorflow as tf -{%- endif %} -{%- if cookiecutter.dl_framework == 'pytorch' %} -import torch -from torch import optim -{%- endif %} - from {{cookiecutter.project_slug}}.models.my_model import MyModel logger = logging.getLogger(__name__) @@ -33,61 +25,4 @@ def load_model(hyper_params): # pragma: no cover model = model_class(hyper_params) logger.info('model info:\n' + str(model) + '\n') - {%- if cookiecutter.dl_framework == 'pytorch' %} - device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - logger.info('using device {}'.format(device)) - if torch.cuda.is_available(): - logger.info(torch.cuda.get_device_name(0)) - {%- endif %} - return model - - -def load_optimizer(hyper_params, model): # pragma: no cover - """Instantiate the optimizer. - - Args: - hyper_params (dict): hyper parameters from the config file - model (obj): A neural network model object. - - Returns: - optimizer (obj): The optimizer for the given model - """ - optimizer_name = hyper_params['optimizer'] - # __TODO__ fix optimizer list - if optimizer_name == 'adam': - optimizer = {%- if cookiecutter.dl_framework == 'pytorch' %} optim.Adam(model.parameters()){%- else %} tf.keras.optimizers.Adam(){%- endif %} - elif optimizer_name == 'sgd': - optimizer = {%- if cookiecutter.dl_framework == 'pytorch' %} optim.SGD(model.parameters()){%- else %} tf.keras.optimizers.SGD(){%- endif %} - else: - raise ValueError('optimizer {} not supported'.format(optimizer_name)) - return optimizer - - -def load_loss(hyper_params): # pragma: no cover - r"""Instantiate the loss. 
- - You can add some math directly in your docstrings, however don't forget the `r` - to indicate it is being treated as restructured text. For example, an L1-loss can be - defined as: - - .. math:: - \text{loss}(x, y) = \frac{1}{n} \sum_{i} z_{i} - - Args: - hyper_params (dict): hyper parameters from the config file - - Returns: - loss (obj): The loss for the given model - """ - loss_name = hyper_params['loss'] - if loss_name == 'L1': - {%- if cookiecutter.dl_framework == 'pytorch' %} - loss = torch.nn.L1Loss(reduction='sum') - {%- endif %} - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - loss = tf.keras.losses.MeanAbsoluteError() - {%- endif %} - else: - raise ValueError('loss {} not supported'.format(loss_name)) - return loss diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/my_model.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/my_model.py index 6e4880f..4c1fb42 100644 --- a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/my_model.py +++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/my_model.py @@ -1,24 +1,77 @@ import logging +import typing {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} import tensorflow as tf {%- endif %} {%- if cookiecutter.dl_framework == 'pytorch' %} -import torch.nn as nn +import torch +import pytorch_lightning as pl + +from {{cookiecutter.project_slug}}.models.optim import load_loss, load_optimizer {%- endif %} from {{cookiecutter.project_slug}}.utils.hp_utils import check_and_log_hp logger = logging.getLogger(__name__) +{%- if cookiecutter.dl_framework == 'pytorch' %} + +class BaseModel(pl.LightningModule): + """Base class for Pytorch Lightning model - useful to reuse the same *_step methods.""" -class MyModel({%- if cookiecutter.dl_framework == 'pytorch' %}nn.Module{%- else %}tf.keras.Model{%- endif %}): # pragma: no cover + def configure_optimizers(self): + """Returns the combination of optimizer(s) and learning rate scheduler(s) to train with. + + Here, we read all the optimization-related hyperparameters from the config dictionary and + create the required optimizer/scheduler combo. + + This function will be called automatically by the pytorch lightning trainer implementation. + See https://pytorch-lightning.readthedocs.io/en/latest/common/optimizers.html for more info + on the expected returned elements. 
+ """ + # we use the generic loading function from the `model_loader` module, but it could be made + # a direct part of the model (useful if we want layer-dynamic optimization) + return load_optimizer(self.hparams, self) + + def _generic_step( + self, + batch: typing.Any, + batch_idx: int, + ) -> typing.Any: + """Runs the prediction + evaluation step for training/validation/testing.""" + input_data, targets = batch + preds = self(input_data) # calls the forward pass of the model + loss = self.loss_fn(preds, targets) + return loss + + def training_step(self, batch, batch_idx): + """Runs a prediction step for training, returning the loss.""" + loss = self._generic_step(batch, batch_idx) + self.log("train_loss", loss) + self.log("epoch", self.current_epoch) + self.log("step", self.global_step) + return loss # this function is required, as the loss returned here is used for backprop + + def validation_step(self, batch, batch_idx): + """Runs a prediction step for validation, logging the loss.""" + loss = self._generic_step(batch, batch_idx) + self.log("val_loss", loss) + + def test_step(self, batch, batch_idx): + """Runs a prediction step for testing, logging the loss.""" + loss = self._generic_step(batch, batch_idx) + self.log("test_loss", loss) +{%- endif %} + + +class MyModel({%- if cookiecutter.dl_framework == 'pytorch' %}BaseModel{%- else %}tf.keras.Model{%- endif %}): # pragma: no cover """Simple Model Class. Inherits from the given framework's model class. This is a simple MLP model. """ - def __init__(self, hyper_params): + def __init__(self, hyper_params: typing.Dict[typing.AnyStr, typing.Any]): """__init__. Args: @@ -27,8 +80,6 @@ def __init__(self, hyper_params): super(MyModel, self).__init__() check_and_log_hp(['size'], hyper_params) - self.hyper_params = hyper_params - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} self.hyper_params = hyper_params self.dense1 = tf.keras.layers.Dense(hyper_params['size']) @@ -45,8 +96,10 @@ def call(self, inputs): return hidden2 {%- endif %} {%- if cookiecutter.dl_framework == 'pytorch' %} - self.linear1 = nn.Linear(5, hyper_params['size']) - self.linear2 = nn.Linear(hyper_params['size'], 1) + self.save_hyperparameters(hyper_params) # they will become available via model.hparams + self.linear1 = torch.nn.Linear(5, hyper_params['size']) + self.linear2 = torch.nn.Linear(hyper_params['size'], 1) + self.loss_fn = load_loss(hyper_params) # 'load_loss' could be part of the model itself... def forward(self, data): """Forward method of the model. @@ -58,7 +111,7 @@ def forward(self, data): tensor: the output of the model computation. """ - hidden = nn.functional.relu(self.linear1(data)) + hidden = torch.nn.functional.relu(self.linear1(data)) result = self.linear2(hidden) return result.squeeze() {%- endif %} diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/optim.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/optim.py new file mode 100644 index 0000000..2d9644a --- /dev/null +++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/models/optim.py @@ -0,0 +1,66 @@ +import logging + +{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} +import tensorflow as tf +{%- endif %} +{%- if cookiecutter.dl_framework == 'pytorch' %} +import torch +from torch import optim +{%- endif %} + + +logger = logging.getLogger(__name__) + + +def load_optimizer(hyper_params, model): # pragma: no cover + """Instantiate the optimizer. 
+ + Args: + hyper_params (dict): hyper parameters from the config file + model (obj): A neural network model object. + + Returns: + optimizer (obj): The optimizer for the given model + """ + optimizer_name = hyper_params['optimizer'] + # __TODO__ fix optimizer list + if optimizer_name == 'adam': + optimizer = {%- if cookiecutter.dl_framework == 'pytorch' %} optim.Adam(model.parameters()) + {%- else %} tf.keras.optimizers.Adam() + {%- endif %} + elif optimizer_name == 'sgd': + optimizer = {%- if cookiecutter.dl_framework == 'pytorch' %} optim.SGD(model.parameters()) + {%- else %} tf.keras.optimizers.SGD() + {%- endif %} + else: + raise ValueError('optimizer {} not supported'.format(optimizer_name)) + return optimizer + + +def load_loss(hyper_params): # pragma: no cover + r"""Instantiate the loss. + + You can add some math directly in your docstrings, however don't forget the `r` + to indicate it is being treated as restructured text. For example, an L1-loss can be + defined as: + + .. math:: + \text{loss}(x, y) = \frac{1}{n} \sum_{i} z_{i} + + Args: + hyper_params (dict): hyper parameters from the config file + + Returns: + loss (obj): The loss for the given model + """ + loss_name = hyper_params['loss'] + if loss_name == 'L1': + {%- if cookiecutter.dl_framework == 'pytorch' %} + loss = torch.nn.L1Loss(reduction='sum') + {%- endif %} + {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} + loss = tf.keras.losses.MeanAbsoluteError() + {%- endif %} + else: + raise ValueError('loss {} not supported'.format(loss_name)) + return loss diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/train.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/train.py index bc71af5..9589190 100644 --- a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/train.py +++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/train.py @@ -1,22 +1,32 @@ +{%- if cookiecutter.dl_framework == 'pytorch' %} +import datetime +import glob +{%- endif %} import logging import os +{%- if cookiecutter.dl_framework == 'pytorch' %} +import shutil +{%- endif %} import mlflow import orion import yaml {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} +import mlflow.tensorflow as mt import tensorflow as tf +from tensorflow.keras.callbacks import EarlyStopping +from pathlib import Path {%- endif %} -import time {%- if cookiecutter.dl_framework == 'pytorch' %} -import torch +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping {%- endif %} -import tqdm -from mlflow import log_metric from orion.client import report_results from yaml import dump from yaml import load +from {{cookiecutter.project_slug}}.utils.hp_utils import check_and_log_hp + logger = logging.getLogger(__name__) BEST_MODEL_NAME = 'best_model' @@ -24,55 +34,37 @@ STAT_FILE_NAME = 'stats.yaml' -def reload_model(output, model_name, model, optimizer, - start_from_scratch=False): # pragma: no cover - """Reload a model. +def train(**kwargs): # pragma: no cover + """Training loop wrapper. Used to catch exception if Orion is being used.""" + try: + best_dev_metric = train_impl(**kwargs) + except RuntimeError as err: + if orion.client.cli.IS_ORION_ON and 'CUDA out of memory' in str(err): + logger.error(err) + logger.error('model was out of memory - assigning a bad score to tell Orion to avoid' + 'too big model') + best_dev_metric = -999 + else: + raise err - Can be useful for model checkpointing, hyper-parameter optimization, etc. 
+ report_results([dict( + name='dev_metric', + type='objective', + # note the minus - cause orion is always trying to minimize (cit. from the guide) + value=-float(best_dev_metric))]) +{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - Args: - output (str): Output directory. - model_name (str): Name of the saved model. - model (obj): A model object. - optimizer (obj): Optimizer used during training. - start_from_scratch (bool): starts training from scratch even if a saved moel is present. - """ - saved_model = os.path.join(output, model_name) - if start_from_scratch and os.path.exists(saved_model): - logger.info('saved model file "{}" already exists - but NOT loading it ' - '(cause --start_from_scratch)'.format(output)) - return - if os.path.exists(saved_model): - logger.info('saved model file "{}" already exists - loading it'.format(output)) - {%- if cookiecutter.dl_framework == 'pytorch' %} - model.load_state_dict(torch.load(saved_model)) - {%- endif %} - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer) - ckpt_manager_best_model = tf.train.CheckpointManager(ckpt, saved_model, max_to_keep=1) - status = ckpt.restore(ckpt_manager_best_model.latest_checkpoint) - # NOTE: not using assert_consumed because it fails (see - # https://github.com/tensorflow/tensorflow/issues/33150) given that some variables - # are in the saved_model but not in the model. This seems more a bug with tensorflow. - # You can use assert_existing_objects_matched that checks only the variables in the - # model. - # In any case, we use expect_partial here because otherwise the restoring would complain - # when restoring the multitask model for prediction (given that - in that case - we only - # load the model part related to the main task). - # status.assert_consumed() - status.assert_existing_objects_matched() - # status.expect_partial() - {%- endif %} - stats = load_stats(output) - logger.info('model status: {}'.format(stats)) - return stats - if os.path.exists(output): - logger.info('saved model file not found') - return +def load_stats(output): + """Load the latest statistics. - logger.info('output folder not found') - os.makedirs(output) + Args: + output (str): Output directory + """ + with open(os.path.join(output, STAT_FILE_NAME), 'r') as stream: + stats = load(stream, Loader=yaml.FullLoader) + return stats['best_dev_metric'], stats['epoch'], stats['remaining_patience'], \ + stats['mlflow_run_id'] def write_stats(output, best_eval_score, epoch, remaining_patience): @@ -93,54 +85,35 @@ def write_stats(output, best_eval_score, epoch, remaining_patience): dump(to_store, stream) -def load_stats(output): - """Load the latest statistics. - - Args: - output (str): Output directory - """ - with open(os.path.join(output, STAT_FILE_NAME), 'r') as stream: - stats = load(stream, Loader=yaml.FullLoader) - return stats['best_dev_metric'], stats['epoch'], stats['remaining_patience'], \ - stats['mlflow_run_id'] - +def reload_model(output, model_name, start_from_scratch=False): # pragma: no cover + """Reload a model. -def train(model, optimizer, loss_fun, train_loader, dev_loader, patience, output, - max_epoch, use_progress_bar=True, start_from_scratch=False): # pragma: no cover - """Training loop wrapper. Used to catch exception (and to handle them) if Orion is being used. + Can be useful for model checkpointing, hyper-parameter optimization, etc. Args: - model (obj): The neural network model object. 
- optimizer (obj): Optimizer used during training. - loss_fun (obj): Loss function that will be optimized. - train_loader (obj): Dataloader for the training set. - dev_loader (obj): Dataloader for the validation set. - patience (int): max number of epochs without improving on `best_eval_score`. - After this point, the train ends. output (str): Output directory. - max_epoch (int): Max number of epochs to train for. - use_progress_bar (bool): Use tqdm progress bar (can be disabled when logging). - start_from_scratch (bool): Start training from scratch (ignore checkpoints) + model_name (str): Model name to relaod. + start_from_scratch (bool): starts training from scratch even if a saved moel is present. """ - try: - best_dev_metric = train_impl( - model, optimizer, loss_fun, train_loader, dev_loader, patience, output, - max_epoch, use_progress_bar, start_from_scratch) - except RuntimeError as err: - if orion.client.cli.IS_ORION_ON and 'CUDA out of memory' in str(err): - logger.error(err) - logger.error('model was out of memory - assigning a bad score to tell Orion to avoid' - 'too big model') - best_dev_metric = -999 - else: - raise err + saved_model_folder = os.path.join(output, model_name) + if start_from_scratch and os.path.exists(saved_model_folder): + logger.info('saved model file "{}" already exists - but NOT loading it ' + '(cause --start_from_scratch)'.format(output)) + restored = None + elif os.path.exists(saved_model_folder): + logger.info('loading model from {}'.format(saved_model_folder)) + model = tf.keras.models.load_model( + os.path.join(output, LAST_MODEL_NAME) + ) - report_results([dict( - name='dev_metric', - type='objective', - # note the minus - cause orion is always trying to minimize (cit. from the guide) - value=-float(best_dev_metric))]) -{%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} + stats = load_stats(output) + logger.info('model status: {}'.format(stats)) + + restored = model, stats + else: + logger.info('no model found to restore.') + restored = None + return restored def init_model(model, train_loader): # pragma: no cover @@ -153,11 +126,10 @@ def init_model(model, train_loader): # pragma: no cover model_input, model_target = next(iter(train_loader)) _ = model(model_input) model.summary(print_fn=logger.info) -{%- endif %} -def train_impl(model, optimizer, loss_fun, train_loader, dev_loader, patience, output, - max_epoch, use_progress_bar=True, start_from_scratch=False): # pragma: no cover +def train_impl(model, optimizer, loss_fun, train_loader, dev_loader, output, hyper_params, + use_progress_bar=True, start_from_scratch=False): # pragma: no cover """Main training loop implementation. Args: @@ -166,154 +138,203 @@ def train_impl(model, optimizer, loss_fun, train_loader, dev_loader, patience, o loss_fun (obj): Loss function that will be optimized. train_loader (obj): Dataloader for the training set. dev_loader (obj): Dataloader for the validation set. - patience (int): max number of epochs without improving on `best_eval_score`. - After this point, the train ends. output (str): Output directory. - max_epoch (int): Max number of epochs to train for. + hyper_params (dict): Dict containing hyper-parameters. use_progress_bar (bool): Use tqdm progress bar (can be disabled when logging). 
start_from_scratch (bool): Start training from scratch (ignore checkpoints) """ - if use_progress_bar: - pb = tqdm.tqdm - else: - def pb(x, total): - return x - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} + check_and_log_hp(['max_epoch', 'patience'], hyper_params) - init_model(model, train_loader) - ckpt_last = tf.train.Checkpoint(model=model, optimizer=optimizer) - ckpt_manager_last_model = tf.train.CheckpointManager( - ckpt_last, os.path.join(output, LAST_MODEL_NAME), max_to_keep=1) - ckpt_best = tf.train.Checkpoint(model=model, optimizer=optimizer) - ckpt_manager_best_model = tf.train.CheckpointManager( - ckpt_best, os.path.join(output, BEST_MODEL_NAME), max_to_keep=1) - {%- endif %} - - stats = reload_model(output, LAST_MODEL_NAME, model, optimizer, start_from_scratch) - if stats is None: + restored = reload_model(output, LAST_MODEL_NAME, start_from_scratch) + + if restored is None: best_dev_metric = None - remaining_patience = patience + remaining_patience = hyper_params['patience'] start_epoch = 0 + + model.compile( + optimizer=optimizer, + loss=loss_fun, + metrics=[], + ) else: + restored_model, stats = restored best_dev_metric, start_epoch, remaining_patience, _ = stats + model = restored_model + + init_model(model, train_loader) + + es = EarlyStoppingWithModelSave( + monitor='val_loss', min_delta=0, patience=hyper_params['patience'], + verbose=use_progress_bar, mode='min', restore_best_weights=False, baseline=best_dev_metric, + output=output, remaining_patience=remaining_patience + ) + + # set tensorflow/keras logging + mt.autolog(every_n_iter=1) + + history = model.fit(x=train_loader, validation_data=dev_loader, callbacks=[es], + epochs=hyper_params['max_epoch'], verbose=use_progress_bar, + initial_epoch=start_epoch) + + best_val_loss = min(history.history['val_loss']) + return best_val_loss + + +class EarlyStoppingWithModelSave(EarlyStopping): + """Keras callback that extends the early stopping. + + Adds the functionality to save the models in the new TF format. 
(both best and last model) + """ - if remaining_patience <= 0: - logger.warning( - 'remaining patience is zero - not training (and returning best dev score {})'.format( - best_dev_metric)) - return best_dev_metric - if start_epoch >= max_epoch: - logger.warning( - 'start epoch {} > max epoch {} - not training (and returning best dev score ' - '{})'.format(start_epoch, max_epoch, best_dev_metric)) - return best_dev_metric - {%- if cookiecutter.dl_framework == 'pytorch' %} - device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') - model.to(device) - {%- endif %} - - for epoch in range(start_epoch, max_epoch): - - start = time.time() - # train - train_cumulative_loss = 0.0 - examples = 0 - {%- if cookiecutter.dl_framework == 'pytorch' %} - model.train() - train_steps = len(train_loader) - {%- endif %} - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - train_steps = sum(1 for _ in train_loader) - {%- endif %} - for i, data in pb(enumerate(train_loader, 0), total=train_steps): - model_input, model_target = data - # forward + backward + optimize - {%- if cookiecutter.dl_framework == 'pytorch' %} - optimizer.zero_grad() - outputs = model(model_input.to(device)) - model_target = torch.squeeze(model_target.to(device)) - loss = loss_fun(outputs, model_target) - loss.backward() - optimizer.step() - - train_cumulative_loss += loss.item() - {%- endif %} - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - with tf.GradientTape() as tape: - outputs = model(model_input) - loss = loss_fun(model_target, outputs) - gradients = tape.gradient(loss, model.trainable_variables) - optimizer.apply_gradients(zip(gradients, model.trainable_variables)) - - train_cumulative_loss += float(loss.numpy()) - {%- endif %} - examples += model_target.shape[0] - - train_end = time.time() - avg_train_loss = train_cumulative_loss / examples - - # dev - {%- if cookiecutter.dl_framework == 'pytorch' %} - model.eval() - dev_steps = len(dev_loader) - {%- endif %} - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - dev_steps = sum(1 for _ in dev_loader) - {%- endif %} - dev_cumulative_loss = 0.0 - examples = 0 - for i, data in pb(enumerate(dev_loader, 0), total=dev_steps): - model_input, model_target = data - {%- if cookiecutter.dl_framework == 'pytorch' %} - with torch.no_grad(): - outputs = model(model_input.to(device)) - model_target = torch.squeeze(model_target.to(device)) - loss = loss_fun(outputs, model_target) - dev_cumulative_loss += loss.item() - {%- endif %} - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - outputs = model(model_input) - loss = loss_fun(model_target, outputs) - dev_cumulative_loss += float(loss.numpy()) - {%- endif %} - examples += model_target.shape[0] - - avg_dev_loss = dev_cumulative_loss / examples - log_metric("dev_loss", avg_dev_loss, step=epoch) - log_metric("train_loss", avg_train_loss, step=epoch) - - dev_end = time.time() - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - ckpt_manager_last_model.save() - {%- endif %} - {%- if cookiecutter.dl_framework == 'pytorch' %} - torch.save(model.state_dict(), os.path.join(output, LAST_MODEL_NAME)) - {%- endif %} - - if best_dev_metric is None or avg_dev_loss < best_dev_metric: - best_dev_metric = avg_dev_loss - remaining_patience = patience - {%- if cookiecutter.dl_framework in ['tensorflow_cpu', 'tensorflow_gpu'] %} - ckpt_manager_best_model.save() - {%- endif %} - {%- if cookiecutter.dl_framework == 
'pytorch' %} - torch.save(model.state_dict(), os.path.join(output, BEST_MODEL_NAME)) - {%- endif %} + def __init__(self, output, remaining_patience, **kwargs): + """Main constructor - initializes the parent. + + output (str): path to folder where to store the models. + remaining_patience (int): patience left when starting early stopping. + (in general it's equal to patience - but it may be less if train is resumed) + """ + super(EarlyStoppingWithModelSave, self).__init__(**kwargs) + self.output = output + self.wait = self.patience - remaining_patience + + def on_train_begin(self, logs=None): + """See parent class doc.""" + super(EarlyStoppingWithModelSave, self).on_train_begin(logs) + Path(self.output).mkdir(parents=True, exist_ok=True) + + # copy-pasted in order to modify what happens when we improve (see comment below) + def on_epoch_end(self, epoch, logs=None): + """See parent class doc.""" + current = self.get_monitor_value(logs) + if current is None: + return + if self.monitor_op(current - self.min_delta, self.best): + self.best = current + self.wait = 0 + if self.restore_best_weights: + self.best_weights = self.model.get_weights() + + self.model.save(os.path.join(self.output, BEST_MODEL_NAME)) else: - remaining_patience -= 1 - - logger.info( - 'done #epoch {:3} => loss {:5.3f} - dev loss {:3.2f} (will try for {} more epoch) - ' - 'train min. {:4.2f} / dev min. {:4.2f}'.format( - epoch, avg_train_loss, avg_dev_loss, remaining_patience, (train_end - start) / 60, - (dev_end - train_end) / 60)) - - write_stats(output, best_dev_metric, epoch + 1, remaining_patience) - log_metric("best_dev_metric", best_dev_metric) - - if remaining_patience <= 0: - logger.info('done! best dev metric is {}'.format(best_dev_metric)) - break - logger.info('training completed (epoch done {} - max epoch {})'.format(epoch + 1, max_epoch)) - logger.info('Finished Training') - return best_dev_metric + self.wait += 1 + if self.wait >= self.patience: + self.stopped_epoch = epoch + self.model.stop_training = True + if self.restore_best_weights: + if self.verbose > 0: + print('Restoring model weights from the end of the best epoch.') + self.model.set_weights(self.best_weights) + + self.model.save(os.path.join(self.output, LAST_MODEL_NAME)) + write_stats(self.output, self.best, epoch, self.patience - self.wait) +{%- endif %} +{%- if cookiecutter.dl_framework == 'pytorch' %} + + +def load_mlflow(output): + """Load the mlflow run id. + + Args: + output (str): Output directory + """ + with open(os.path.join(output, STAT_FILE_NAME), 'r') as stream: + stats = load(stream, Loader=yaml.FullLoader) + return stats['mlflow_run_id'] + + +def write_mlflow(output): + """Write the mlflow info to resume the training plotting.. + + Args: + output (str): Output directory + """ + mlflow_run = mlflow.active_run() + mlflow_run_id = mlflow_run.info.run_id if mlflow_run is not None else 'NO_MLFLOW' + to_store = {'mlflow_run_id': mlflow_run_id} + with open(os.path.join(output, STAT_FILE_NAME), 'w') as stream: + dump(to_store, stream) + + +def train_impl(model, datamodule, output, hyper_params, + use_progress_bar, start_from_scratch, mlf_logger, gpus): # pragma: no cover + """Main training loop implementation. + + Args: + model (obj): The neural network model object. + datamodule (obj): lightning data module that will instantiate data loaders. + output (str): Output directory. + hyper_params (dict): Dict containing hyper-parameters. + use_progress_bar (bool): Use tqdm progress bar (can be disabled when logging). 
+ start_from_scratch (bool): Start training from scratch (ignore checkpoints) + mlf_logger (obj): MLFlow logger callback. + gpus: number of GPUs to use. + """ + check_and_log_hp(['max_epoch', 'patience'], hyper_params) + write_mlflow(output) + + best_model_path = os.path.join(output, BEST_MODEL_NAME) + best_checkpoint_callback = ModelCheckpoint( + dirpath=best_model_path, + filename='model', + save_top_k=1, + verbose=use_progress_bar, + monitor="val_loss", + mode="auto", + period=1, + ) + + last_model_path = os.path.join(output, LAST_MODEL_NAME) + last_checkpoint_callback = ModelCheckpoint( + dirpath=last_model_path, + filename='model', + verbose=use_progress_bar, + period=1, + ) + + resume_from_checkpoint = handle_previous_models(output, last_model_path, best_model_path, + start_from_scratch) + + early_stopping = EarlyStopping("val_loss", mode="auto", patience=hyper_params['patience'], + verbose=use_progress_bar) + trainer = pl.Trainer( + callbacks=[early_stopping, best_checkpoint_callback, last_checkpoint_callback], + checkpoint_callback=True, + logger=mlf_logger, + max_epochs=hyper_params['max_epoch'], + resume_from_checkpoint=resume_from_checkpoint, + gpus=gpus + ) + + trainer.fit(model, datamodule=datamodule) + best_dev_result = float(early_stopping.best_score.cpu().numpy()) + return best_dev_result + + +def handle_previous_models(output, last_model_path, best_model_path, start_from_scratch): + """Moves the previous models in a new timestamp folder.""" + last_models = glob.glob(last_model_path + os.sep + '*') + best_models = glob.glob(best_model_path + os.sep + '*') + + if len(last_models + best_models) > 0: + now = datetime.datetime.now() + timestamp = now.strftime('%Y%m%d_%H%M%S') + new_folder = output + os.path.sep + timestamp + os.mkdir(new_folder) + shutil.move(last_model_path, new_folder) + shutil.move(best_model_path, new_folder) + logger.info(f'old models found - moving them to {new_folder}') + # need to change the last model pointer to the new location + last_models = glob.glob(new_folder + os.path.sep + LAST_MODEL_NAME + os.sep + '*') + + if start_from_scratch: + logger.info('will not load any pre-existent checkpoint (because of "--start-from-scratch")') + resume_from_checkpoint = None + elif len(last_models) >= 1: + resume_from_checkpoint = sorted(last_models)[-1] + logger.info(f'models found - resuming from {resume_from_checkpoint}') + else: + logger.info('no model found - starting training from scratch') + resume_from_checkpoint = None + return resume_from_checkpoint +{%- endif %} diff --git a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/hp_utils.py b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/hp_utils.py index 442a248..492315b 100644 --- a/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/hp_utils.py +++ b/{{cookiecutter.project_slug}}/{{cookiecutter.project_slug}}/utils/hp_utils.py @@ -31,12 +31,13 @@ def check_hp(names, hps, allow_extra=True): missing.add(name) extra = hps.keys() - names + msgs = [] if len(missing) > 0: - logger.error('please add the missing hyper-parameters: {}'.format(missing)) + msgs.append(f'please add the missing hyper-parameters: {missing}') if len(extra) > 0 and not allow_extra: - logger.error('please remove the extra hyper-parameters: {}'.format(extra)) - if len(missing) > 0 or (len(extra) > 0 and not allow_extra): - raise ValueError('fix according to the error message above') + msgs.append(f'please remove the extra hyper-parameters: {extra}') + if len(msgs) > 0: + raise 
ValueError('\n'.join(msgs)) def log_hp(names, hps): # pragma: no cover
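The hp_utils change above replaces the separate error logs with a single ValueError that lists every problem at once. Below is a minimal, illustrative sketch of the resulting behaviour; it assumes the patched module is importable and uses the example project slug `wonderful_project` from the test scripts (not part of the patch itself).

```python
# Illustrative only - assumes the patched hp_utils rendered with the example
# slug `wonderful_project` used by tests/end2end_pytorch/run.sh.
from wonderful_project.utils.hp_utils import check_hp

hps = {'size': 32, 'lr': 0.01}  # 'patience' is missing, 'lr' is extra
try:
    check_hp(['size', 'patience'], hps, allow_extra=False)
except ValueError as err:
    # Both issues are now reported together in one exception, e.g.:
    #   please add the missing hyper-parameters: {'patience'}
    #   please remove the extra hyper-parameters: {'lr'}
    print(err)
```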