diff --git a/tutorials/tensorflow/mlflow_gcp/README.md b/tutorials/tensorflow/mlflow_gcp/README.md index 283e5a4b..d222d2a6 100644 --- a/tutorials/tensorflow/mlflow_gcp/README.md +++ b/tutorials/tensorflow/mlflow_gcp/README.md @@ -37,8 +37,64 @@ This dataset is provided by a third party. Google provides no representation, warranty, or other guarantees about the validity or any other aspects of this dataset. +### Create a Compute Engine instance + +Create a new Deep Learning Virtual Machine instance + +``` +export IMAGE_FAMILY="tf-latest-cpu" +export ZONE="us-central1-b" +export INSTANCE_NAME="mlflow-server" +gcloud compute instances create $INSTANCE_NAME \ + --zone=$ZONE \ + --image-family=$IMAGE_FAMILY \ + --machine-type=n1-standard-8 \ + --image-project=deeplearning-platform-release \ + --maintenance-policy=TERMINATE \ + --scopes=https://www.googleapis.com/auth/cloud-platform \ + --tags http-server,https-server +``` + +#### Installing MLflow + +Install git, pip and virtual environment + +``` +sudo apt-get install git -y +sudo apt-get install python-pip -y +pip install virtualenv +``` + +Create virtual environment + +``` +virtualenv -p `which python3` mlflow_env +source mlflow_env/bin/activate +``` + +Install MLflow + +``` +pip install mlflow +``` +Verify installation + +``` +pip freeze | grep mlflow +mlflow==1.2.0 +``` + ### **Install dependencies** +In this tutorial we will train a TensorFlow model and use different +parameters. We will use MLflow to track those different parameters and +their metrics. Start by cloning the repo. + +``` +git clone https://github.com/GoogleCloudPlatform/ml-on-gcp.git +cd ml-on-gcp/tutorials/tensorflow/mlflow_gcp/ +``` + Install the python dependencies. 
``` @@ -189,19 +245,19 @@ export JOB_NAME=mlflow_$DATE export REGION=us-central1 export GCS_JOB_DIR=gs://mlflow_gcp/jobs/$JOB_NAME -gcloud ai-platform job sumit training $JOB_NAME \ - --stream-logs \ - --runtime-version 1.14 \ - --package-path trainer \ - --module-name trainer.task \ - --region $REGION \ - -- \ - --train-files $TRAIN_FILE \ - --eval-files $EVAL_FILE \ - --job-dir $GCS_JOB_DIR \ - --train-steps $TRAIN_STEPS \ - --eval-steps $EVAL_STEPS - --mlflow-tracking-uri http://:5000 +gcloud ai-platform jobs submit training $JOB_NAME \ + --stream-logs \ + --runtime-version 1.14 \ + --job-dir $GCS_JOB_DIR \ + --package-path trainer \ + --module-name trainer.task \ + --region $REGION \ + -- \ + --train-files $TRAIN_FILE \ + --eval-files $EVAL_FILE \ + --train-steps $TRAIN_STEPS \ + --eval-steps $EVAL_STEPS \ + --mlflow-tracking-uri http://: ``` diff --git a/tutorials/tensorflow/mlflow_gcp/requirements.txt b/tutorials/tensorflow/mlflow_gcp/requirements.txt index 824c9a41..8d38cfb6 100644 --- a/tutorials/tensorflow/mlflow_gcp/requirements.txt +++ b/tutorials/tensorflow/mlflow_gcp/requirements.txt @@ -1,7 +1,8 @@ -numpy>=1.14 -pandas>=0.22 -six>=1.11 -google-api-python-client -google-cloud-storage -tensorflow>=1.14,<2 -mlflow>1.0,<2 +# The pip syntax below installs the dependencies declared in setup.py, +# so that we do not have to maintain two separate dependency +# lists in setup.py and requirements.txt. +# See https://caremad.io/posts/2013/07/setup-vs-requirement/ + +--index-url https://pypi.python.org/simple/ + +-e . \ No newline at end of file diff --git a/tutorials/tensorflow/mlflow_gcp/setup.py b/tutorials/tensorflow/mlflow_gcp/setup.py new file mode 100644 index 00000000..b0ba20f4 --- /dev/null +++ b/tutorials/tensorflow/mlflow_gcp/setup.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def copy_artifacts(source_path, destination_path):
    """Recursively copy a directory with ``gsutil -m cp -r``.

    The command is executed as an argument vector rather than a formatted
    shell string, so spaces or shell metacharacters in either path are
    passed to ``gsutil`` verbatim instead of being interpreted by a shell.

    :param source_path: Path or URI of the directory to copy from.
    :param destination_path: Path or URI to copy the directory to.
    :return: Exit status of the ``gsutil`` process (0 on success), or 127
        when the ``gsutil`` executable could not be launched.
    """
    logging.info(
        'Moving model directory from {} to {}'.format(source_path,
                                                      destination_path))
    command = ['gsutil', '-m', 'cp', '-r', source_path, destination_path]
    try:
        return_code = subprocess.call(command)
    except OSError as error:
        # Without a shell, a missing executable raises instead of returning
        # a status. Keep the copy best-effort and report the conventional
        # "command not found" status rather than crashing the caller.
        logging.error('Unable to launch gsutil: {}'.format(error))
        return 127
    if return_code != 0:
        # The exit status used to be discarded; surface failed copies.
        logging.error(
            'gsutil copy from {} to {} failed with exit status {}'.format(
                source_path, destination_path, return_code))
    return return_code
class MlflowCallback(tf.keras.callbacks.Callback):
    """Keras callback that logs model details through ``mlflow.log_param``."""

    def on_train_end(self, logs=None):
        """Log the layer count and optimizer class name after training.

        Keras calls this hook once training completes.

        :param logs: Passed in by Keras; not used here.
        """
        trained_model = self.model
        layer_count = len(trained_model.layers)
        mlflow.log_param('num_layers', layer_count)
        optimizer_class = type(trained_model.optimizer)
        mlflow.log_param('optimizer_name', optimizer_class.__name__)
+ tensorboard_path = os.path.join(args.job_dir, run_id, 'tensorboard') tensorboard_callback = tf.keras.callbacks.TensorBoard( - os.path.join(args.job_dir, run_id, 'tensorboard'), + tensorboard_path, histogram_freq=1) history = keras_model.fit( training_dataset, @@ -214,7 +226,8 @@ def train_and_evaluate(args): validation_data=validation_dataset, validation_steps=args.eval_steps, verbose=1, - callbacks=[lr_decay_callback, tensorboard_callback]) + callbacks=[lr_decay_callback, tensorboard_callback, + mlflow_callback]) metrics = history.history logging.info(metrics) keras_model.summary() @@ -238,7 +251,19 @@ def train_and_evaluate(args): model_local_path = os.path.join(args.job_dir, run_id, 'model') tf.keras.experimental.export_saved_model(keras_model, model_local_path) # Define artifacts. - logging.info('Model exported to: ', model_local_path) + logging.info('Model exported to: {}'.format(model_local_path)) + # MLflow workaround since is unable to read GCS path. + if model_local_path.startswith('gs://'): + logging.info('Creating temp folder') + temp = tempfile.mkdtemp() + model_deployment.copy_artifacts(model_local_path, temp) + model_local_path = os.path.join(temp, 'model') + if tensorboard_path.startswith('gs://'): + logging.info('Creating temp folder') + temp = tempfile.mkdtemp() + model_deployment.copy_artifacts(tensorboard_path, temp) + tensorboard_path = temp + mlflow.tensorflow.log_model(tf_saved_model_dir=model_local_path, tf_meta_graph_tags=[tag_constants.SERVING], tf_signature_def_key='serving_default', @@ -247,13 +272,17 @@ def train_and_evaluate(args): pyfunc_model = mlflow.pyfunc.load_model( mlflow.get_artifact_uri('model')) logging.info('Uploading TensorFlow events as a run artifact.') - mlflow.log_artifacts(os.path.join(args.job_dir, run_id, 'tensorboard'), - artifact_path='events') - print("\nLaunch TensorBoard with:\n\ntensorboard --logdir=%s" % - os.path.join(mlflow.get_artifact_uri(), 'events')) + mlflow.log_artifacts(tensorboard_path) + 
logging.info( + 'Launch TensorBoard with:\n\ntensorboard --logdir=%s' % + tensorboard_path) duration = time() - start_time mlflow.log_metric('duration', duration) mlflow.end_run() + if model_local_path.startswith('gs://') and tensorboard_path.startswith( + 'gs://'): + shutil.rmtree(model_local_path) + shutil.rmtree(tensorboard_path) # Deploy to AI Platform. if args.deploy_gcp: @@ -261,15 +290,18 @@ def train_and_evaluate(args): model_helper = model_deployment.AIPlatformModel( project_id=args.project_id) # Copy local model to GCS for deployment. - model_gcs_path = os.path.join('gs://', args.gcs_bucket, run_id, 'model') - model_helper.upload_model(model_local_path, model_gcs_path) + if not model_local_path.startswith('gs://'): + model_gcs_path = os.path.join('gs://', args.gcs_bucket, run_id, + 'model') + model_deployment.copy_artifacts(model_local_path, model_gcs_path) # Create model model_helper.create_model(args.model_name) # Create model version model_helper.deploy_model(model_gcs_path, args.model_name, run_id, args.run_time_version) - print('Model deployment in GCP completed') - print('This model took: ', duration, 'seconds to train and test.') + logging.info('Model deployment in GCP completed') + logging.info( + 'This model took: {} seconds to train and test.'.format(duration)) if __name__ == '__main__': diff --git a/tutorials/tensorflow/mlflow_gcp/trainer/utils.py b/tutorials/tensorflow/mlflow_gcp/trainer/utils.py index 91af24eb..10e23341 100644 --- a/tutorials/tensorflow/mlflow_gcp/trainer/utils.py +++ b/tutorials/tensorflow/mlflow_gcp/trainer/utils.py @@ -205,8 +205,8 @@ def load_data(training_file_path, eval_file_path, *args, **kwargs): """ # TODO Download and clean custom files. - print('Location train file: %s, eval file %s', training_file_path, - eval_file_path) + print('Location train file: {}, eval file {}'.format(training_file_path, + eval_file_path)) training_file_path, eval_file_path = download(DATA_DIR) # This census data uses the value '?' 
for missing entries. We use