Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scripts to train model and submit a job on Azure ML #3

Merged
merged 2 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Exact pins for reproducible local installs (runtime, tests, Azure ML tooling).
numpy==1.23.5
pydicom==2.4.4
scikit-image==0.20.0
tensorflow==2.12.0
tqdm==4.65.0
pytest==7.4.0
click==8.0.4
azure-ai-ml==1.12.1
# Imported directly by scripts/azure/machine_learning/run_job.py
# (DefaultAzureCredential and load_dotenv); previously missing from this file.
azure-identity==1.15.0
python-dotenv==1.0.0
9 changes: 9 additions & 0 deletions scripts/azure/machine_learning/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Image for the Azure ML training environment: a Miniconda base with the
# project's conda environment created at build time.
FROM continuumio/miniconda3

# -y: docker build has no TTY, so conda must not wait for a confirmation prompt.
RUN conda update -n base -c defaults conda -y

# The environment spec shipped next to this Dockerfile is named
# conda_dependencies.yml (not .yaml); the names must match or COPY fails.
COPY conda_dependencies.yml .
# Create the environment, then drop the spec and all package caches in the same
# layer so they do not add to the image size.
RUN conda env create -f conda_dependencies.yml -q && \
    rm conda_dependencies.yml && \
    conda run pip cache purge && \
    conda clean -a -y
17 changes: 17 additions & 0 deletions scripts/azure/machine_learning/conda_dependencies.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Conda environment used inside the Azure ML training image.
name: conda
channels:
  - conda-forge
  - defaults
  - anaconda
dependencies:
  # Pinned: tensorflow 2.12.0 only ships wheels for Python 3.8-3.11, so an
  # unpinned interpreter resolves to a newer Python and the pip step fails.
  - python=3.10
  - pip
  - pip:
      - numpy==1.23.5
      - pydicom==2.4.4
      - scikit-image==0.20.0
      - tensorflow==2.12.0
      - tqdm==4.65.0
      - pytest==7.4.0
      - click==8.0.4
      - azure-ai-ml==1.12.1
4 changes: 4 additions & 0 deletions scripts/azure/machine_learning/docker-context.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Azure ML environment definition built from a local Docker build context.
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: cancer-env
build:
  # `path` is the build-context directory (resolved relative to this file),
  # not the Dockerfile itself; the Dockerfile is named separately.
  path: .
  dockerfile_path: Dockerfile
151 changes: 151 additions & 0 deletions scripts/azure/machine_learning/run_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import os
from datetime import datetime

import click
from dotenv import load_dotenv
from azure.ai.ml import MLClient, command, Input
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import AmlCompute, Model
from azure.ai.ml.constants import AssetTypes

from src.config import MODELS


# Load variables from a .env file into the process environment at import time,
# so the AZURE_* settings read below with os.getenv can be supplied that way.
load_dotenv()


def connect_to_workspace():
    """Build an ``MLClient`` for the workspace described by the environment.

    The subscription, resource group and workspace are taken from
    ``AZURE_SUBSCRIPTION_ID``, ``AZURE_RESOURCE_GROUP`` and ``AZURE_WORKSPACE``;
    authentication uses ``DefaultAzureCredential``.

    Returns:
        The constructed ``MLClient``.
    """
    return MLClient(
        credential=DefaultAzureCredential(),
        subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
        resource_group_name=os.getenv("AZURE_RESOURCE_GROUP"),
        workspace_name=os.getenv("AZURE_WORKSPACE"),
    )


def _optional_int_env(name):
    """Return environment variable ``name`` as an int, or None if unset/blank."""
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return None
    return int(raw)


def get_compute(ml_client):
    """Make sure the compute target named by ``AZURE_COMPUTE_TARGET`` exists.

    The target is looked up with ``ml_client.compute.get``. If that lookup
    raises, a new ``AmlCompute`` is created from ``AZURE_COMPUTE_SIZE``,
    ``AZURE_COMPUTE_MIN_INSTANCES`` and ``AZURE_COMPUTE_MAX_INSTANCES``, and the
    function waits on the poller returned by ``begin_create_or_update``.

    Args:
        ml_client: Client exposing ``compute.get`` and
            ``compute.begin_create_or_update``.

    Raises:
        ValueError: If an instance-count variable is set but is not an integer.
    """
    cpu_compute_target = os.getenv("AZURE_COMPUTE_TARGET")

    try:
        ml_client.compute.get(cpu_compute_target)
    except Exception:
        # NOTE(review): any lookup failure (not only "not found") lands here and
        # triggers a create attempt — kept as in the original; consider narrowing.
        click.echo("Creating a new cpu compute target...")
        compute = AmlCompute(
            name=cpu_compute_target,
            size=os.getenv("AZURE_COMPUTE_SIZE"),
            # Environment values are strings; instance counts must be integers.
            # Parsed only on this path so an existing target never needs them.
            min_instances=_optional_int_env("AZURE_COMPUTE_MIN_INSTANCES"),
            max_instances=_optional_int_env("AZURE_COMPUTE_MAX_INSTANCES"),
        )
        ml_client.compute.begin_create_or_update(compute).result()


def submit_job(ml_client, model, optimizer, loss, metric, epochs, batch_size):
    """Submit a training command job to Azure ML and return the created job.

    Job settings come from the environment: ``AZURE_CODE_PATH`` (code to
    upload), ``AZURE_ENVIRONMENT``, ``AZURE_STORAGE_TYPE`` and
    ``AZURE_STORAGE_PATH`` (input data, with ``train`` and ``test`` below it)
    and ``AZURE_COMPUTE_TARGET``.

    Args:
        ml_client: Client whose ``jobs.create_or_update`` submits the job.
        model: Model name; selects the training module and is passed as input.
        optimizer: Optimizer name forwarded to the training script.
        loss: Loss name forwarded to the training script.
        metric: Metric name forwarded to the training script.
        epochs: Number of epochs forwarded to the training script.
        batch_size: Batch size forwarded to the training script.

    Returns:
        The job object returned by ``ml_client.jobs.create_or_update``.
    """
    # Local import: storage locations are joined with "/" regardless of the OS
    # this submitter runs on (os.path.join would insert "\\" on Windows).
    import posixpath

    code = os.getenv("AZURE_CODE_PATH")
    environment = os.getenv("AZURE_ENVIRONMENT")
    type_ = os.getenv("AZURE_STORAGE_TYPE")
    path = os.getenv("AZURE_STORAGE_PATH")
    compute = os.getenv("AZURE_COMPUTE_TARGET")

    train_path = posixpath.join(path, "train")
    test_path = posixpath.join(path, "test")

    # NOTE(review): the entry module is resolved as train_<model>; confirm a
    # module with that name exists under AZURE_CODE_PATH.
    # Only the first fragment is an f-string: the ${{inputs.*}} placeholders
    # must reach Azure ML with their double braces intact.
    command_job = command(
        code=code,
        command=(
            f"python -m src.scripts.azure.machine_learning.train_{model}"
            " --train ${{inputs.train}} --test ${{inputs.test}}"
            " --epochs ${{inputs.epochs}} --optimizer ${{inputs.optimizer}}"
            " --loss ${{inputs.loss}} --metric ${{inputs.metric}}"
            " --batch_size ${{inputs.batch_size}} --model ${{inputs.model}}"
        ),
        environment=environment,
        inputs={
            "train": Input(
                type=type_,
                path=train_path,
            ),
            "test": Input(
                type=type_,
                path=test_path,
            ),
            "optimizer": optimizer,
            "loss": loss,
            "metric": metric,
            "epochs": epochs,
            "batch_size": batch_size,
            "model": model,
        },
        compute=compute,
        # Timestamp suffix gives each submission a distinct job name.
        name=f"train_{model}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
    )

    return ml_client.jobs.create_or_update(command_job)


def register_model(ml_client, returned_job, run_name, run_description):
    """Register the ``model/`` artifact folder of a job as an MLflow model.

    Args:
        ml_client: Client whose ``models.create_or_update`` performs the
            registration.
        returned_job: Job object; its ``name`` locates the output artifacts.
        run_name: Name given to the registered model.
        run_description: Description stored with the registered model.
    """
    artifact_uri = f"azureml://jobs/{returned_job.name}/outputs/artifacts/paths/model/"
    ml_client.models.create_or_update(
        Model(
            path=artifact_uri,
            name=run_name,
            description=run_description,
            type=AssetTypes.MLFLOW_MODEL,
        )
    )


# CLI entry point: validate the model name, connect to the workspace, make sure
# the compute target exists, submit the training job and print its identifiers.
# (Kept as comments rather than a docstring so the --help output is unchanged.)
@click.command()
@click.option("--model", type=click.Choice(MODELS), help="Model to train")
@click.option(
    "--optimizer",
    type=click.Choice(["adam", "sgd"]),
    default="adam",
    help="Optimizer to use",
)
@click.option(
    "--loss",
    type=click.Choice(["binary_crossentropy", "categorical_crossentropy"]),
    default="binary_crossentropy",
    help="Loss function to use",
)
@click.option(
    "--metric",
    type=click.Choice(["accuracy", "f1"]),
    default="accuracy",
    help="Metrics to use",
)
@click.option("--epochs", type=int, default=10, help="Number of epochs to train for")
@click.option("--batch_size", type=int, default=32, help="Batch size to use")
def run(model, optimizer, loss, metric, epochs, batch_size):
    # click.Choice already rejects unknown names; this also catches the case
    # where --model was omitted and `model` is None.
    if model not in MODELS:
        raise ValueError(f"Model {model} not supported")

    workspace_client = connect_to_workspace()
    get_compute(ml_client=workspace_client)

    job = submit_job(
        ml_client=workspace_client,
        model=model,
        optimizer=optimizer,
        loss=loss,
        metric=metric,
        epochs=epochs,
        batch_size=batch_size,
    )

    # Report how to find the submitted job, one attribute per line.
    click.echo("Job created with:")
    for label, attribute in (("id", "id"), ("name", "name"), ("url", "studio_url")):
        click.echo(f" - {label}: {getattr(job, attribute)}")


if __name__ == "__main__":
    run()  # pylint: disable=no-value-for-parameter
103 changes: 103 additions & 0 deletions scripts/azure/machine_learning/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import os
import logging
from datetime import datetime

import click
import tensorflow as tf

from src.model.builders import (
ConvNeXtBuilder,
DenseNetBuilder,
EfficientNetBuilder,
EfficientNetV2Builder,
InceptionNetBuilder,
InceptionResNetBuilder,
MobileNetBuilder,
ResNetBuilder,
ResNetV2Builder,
VGGBuilder,
XceptionBuilder,
)
from src.model.director import ModelDirector
from src.dataset.dataset_loader import DatasetLoader
from src.config import EARLY_STOPPING_CONFIG, REDUCE_LR_CONFIG, MODELS


# Ask for INFO-level output on the root logger and create this module's logger.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers. src.config is imported above and appears to call
# logging.basicConfig(filename=...) at import time; if so, this call has no
# effect and records go to that log file — confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@click.command()
@click.option(
    "--model", type=click.Choice(MODELS), default="mobilenet", help="Model to train"
)
@click.option(
    "--train", type=click.Path(exists=True), help="Path to the training dataset"
)
@click.option("--test", type=click.Path(exists=True), help="Path to the test dataset")
@click.option(
    "--optimizer",
    type=click.Choice(["adam", "sgd"]),
    default="adam",
    help="Optimizer to use",
)
@click.option(
    "--loss",
    type=click.Choice(["binary_crossentropy", "categorical_crossentropy"]),
    default="binary_crossentropy",
    help="Loss function to use",
)
@click.option(
    "--metric",
    type=click.Choice(["accuracy", "f1"]),
    default="accuracy",
    help="Metrics to use",
)
@click.option("--epochs", type=int, default=10, help="Number of epochs to train for")
@click.option("--batch_size", type=int, default=32, help="Batch size to use")
def run(model, train, test, optimizer, loss, metric, epochs, batch_size):
    """Train the selected model on the training set and evaluate it on the test set."""
    # Parameters (all supplied by click from the options above):
    #   model      - key selecting one of the builder classes below
    #   train/test - existing paths handed to DatasetLoader
    #   optimizer, loss, metric - passed straight to model.compile
    #   epochs     - passed to model.fit
    #   batch_size - accepted so the command line built by run_job.py
    #                ("--batch_size ...") parses; it is only logged here.
    logger.info(f"Started training run at {datetime.now()}")
    logger.info(
        f"Run parameters - optimizer: {optimizer}, loss: {loss}, metrics: {metric}"
    )
    # NOTE(review): batching is not applied in this function — presumably
    # DatasetLoader decides it; wire batch_size through once that is confirmed.
    logger.info(f"Received batch_size: {batch_size} (not applied by this script)")

    builders = {
        "convnext": ConvNeXtBuilder,
        "densenet": DenseNetBuilder,
        "efficientnet": EfficientNetBuilder,
        "efficientnetv2": EfficientNetV2Builder,
        "inceptionnet": InceptionNetBuilder,
        "inceptionresnet": InceptionResNetBuilder,
        "mobilenet": MobileNetBuilder,
        "resnet": ResNetBuilder,
        "resnetv2": ResNetV2Builder,
        "vgg": VGGBuilder,
        "xception": XceptionBuilder,
    }
    builder = builders[model]()
    # Separate name for the built network so the CLI `model` string is not
    # overwritten.
    network = ModelDirector(builder).make()
    logger.info(f"Built model with {str(builder)}")

    train_dataset = DatasetLoader(train).get_dataset()
    test_dataset = DatasetLoader(test).get_dataset()
    logger.info("Loaded train and test datasets")

    # NOTE(review): "f1" is offered as a metric choice; confirm the installed
    # Keras version resolves that string in compile().
    network.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    logger.info("Compiled model")

    ec = tf.keras.callbacks.EarlyStopping(**EARLY_STOPPING_CONFIG)
    lr = tf.keras.callbacks.ReduceLROnPlateau(**REDUCE_LR_CONFIG)

    # NOTE(review): fit() gets no validation data; if the callback configs
    # monitor a val_* quantity the callbacks will never see it — confirm.
    network.fit(train_dataset, epochs=epochs, callbacks=[ec, lr])
    logger.info("Trained model")

    network.evaluate(test_dataset)
    logger.info("Evaluated model")

    logger.info(f"Finished training at {datetime.now()}")


if __name__ == "__main__":
    run()  # pylint: disable=no-value-for-parameter
33 changes: 30 additions & 3 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,37 @@
from datetime import datetime

# Get the current date to create a dynamic log filename
current_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
log_filename = f'lung_cancer_detection_{current_date}.log'
current_date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_filename = f"lung_cancer_detection_{current_date}.log"
logging.basicConfig(
filename=log_filename,
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
format="%(asctime)s - %(levelname)s - %(message)s",
)

# Keyword arguments for the early-stopping callback: watch validation loss and
# stop after 3 epochs without an improvement of at least 0.001.
EARLY_STOPPING_CONFIG = dict(
    monitor="val_loss",
    min_delta=0.001,
    patience=3,
)

# Keyword arguments for the reduce-learning-rate-on-plateau callback: scale the
# learning rate by 0.1 after 2 epochs without a 0.001 validation-loss gain.
REDUCE_LR_CONFIG = dict(
    monitor="val_loss",
    factor=0.1,
    patience=2,
    min_delta=0.001,
)

# Names of the supported model architectures, in alphabetical order; used as
# the allowed values of the --model command-line option.
MODELS = [
    "convnext", "densenet",
    "efficientnet", "efficientnetv2",
    "inceptionnet", "inceptionresnet",
    "mobilenet",
    "resnet", "resnetv2",
    "vgg", "xception",
]
Loading