Skip to content

Commit

Permalink
Merge pull request #4 from owczr/develop
Browse files Browse the repository at this point in the history
Add features for full training
  • Loading branch information
owczr authored Jan 7, 2024
2 parents 7f9d9ba + 840a41a commit b4e48ba
Show file tree
Hide file tree
Showing 11 changed files with 229 additions and 116 deletions.
9 changes: 9 additions & 0 deletions .amlignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
requirements.txt
README.md
LICENSE
tests/
notebooks/
docs/
.pytest_cache/
.github/

17 changes: 11 additions & 6 deletions scripts/azure/machine_learning/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
# Base image shipping with conda preinstalled.
FROM continuumio/miniconda3

# Bring conda and pip up to date before installing any packages.
RUN conda update -n base -c defaults conda
RUN pip install --upgrade pip

# NOTE(review): this span is rendered from a diff hunk (@@ -1,9 +1,14 @@), so
# the pinned pip installs below and the conda_dependencies.yaml flow further
# down likely belong to the old and new revisions respectively — confirm
# against the checked-in Dockerfile before relying on either.
RUN pip install numpy==1.23.5
RUN pip install pydicom==2.4.4
RUN pip install scikit-image==0.20.0
RUN pip install tensorflow==2.12.0
RUN pip install tqdm==4.65.0
RUN pip install pytest==7.4.0
RUN pip install click==8.0.4
RUN pip install azure-ai-ml==1.12.1
RUN pip install tensorflow-addons==0.23.0

# Create the environment from the spec, then drop the spec and all caches to
# keep the image small.
COPY conda_dependencies.yaml .
RUN conda env create -f conda_dependencies.yaml -q && \
rm conda_dependencies.yaml && \
conda run pip cache purge && \
conda clean -a -y
17 changes: 0 additions & 17 deletions scripts/azure/machine_learning/conda_dependencies.yml

This file was deleted.

58 changes: 31 additions & 27 deletions scripts/azure/machine_learning/run_job.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import logging
from datetime import datetime

import click
Expand All @@ -11,6 +12,9 @@
from src.config import MODELS


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()


Expand All @@ -35,17 +39,9 @@ def get_compute(ml_client):
try:
ml_client.compute.get(cpu_compute_target)
except Exception:
click.echo("Creating a new cpu compute target...")
compute = AmlCompute(
name=cpu_compute_target,
size=size,
min_instances=min_instances,
max_instances=max_instances,
)
ml_client.compute.begin_create_or_update(compute).result()

click.echo(f"Compute {cpu_compute_target} not found.")

def submit_job(ml_client, model, optimizer, loss, metric, epochs, batch_size):
def submit_job(ml_client, model, optimizer, loss, epochs, batch_size):
code = os.getenv("AZURE_CODE_PATH")
environment = os.getenv("AZURE_ENVIRONMENT")
type_ = os.getenv("AZURE_STORAGE_TYPE")
Expand All @@ -55,14 +51,16 @@ def submit_job(ml_client, model, optimizer, loss, metric, epochs, batch_size):
train_path = os.path.join(path, "train")
test_path = os.path.join(path, "test")

job_name = f"train_{model}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

command_job = command(
code=code,
command=(
f"python -m src.scripts.azure.machine_learning.train_{model}"
"python -m scripts.azure.machine_learning.train"
" --train ${{inputs.train}} --test ${{inputs.test}}"
" --epochs ${{inputs.epochs}} --optimizer ${{inputs.optimizer}}"
" --loss ${{inputs.loss}} --metric ${{inputs.metric}}"
" --batch_size ${{inputs.batch_size}} --model ${{inputs.model}}"
" --loss ${{inputs.loss}} --batch_size ${{inputs.batch_size}}"
" --model ${{inputs.model}} --job_name ${{inputs.job_name}}"
),
environment=environment,
inputs={
Expand All @@ -76,13 +74,13 @@ def submit_job(ml_client, model, optimizer, loss, metric, epochs, batch_size):
),
"optimizer": optimizer,
"loss": loss,
"metric": metric,
"epochs": epochs,
"batch_size": batch_size,
"model": model,
"job_name": job_name,
},
compute=compute,
name=f"train_{model}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
name=job_name,
)

returned_job = ml_client.jobs.create_or_update(command_job)
Expand Down Expand Up @@ -115,15 +113,9 @@ def register_model(ml_client, returned_job, run_name, run_description):
default="binary_crossentropy",
help="Loss function to use",
)
@click.option(
"--metric",
type=click.Choice(["accuracy", "f1"]),
default="accuracy",
help="Metrics to use",
)
@click.option("--epochs", type=int, default=10, help="Number of epochs to train for")
@click.option("--batch_size", type=int, default=32, help="Batch size to use")
def run(model, optimizer, loss, metric, epochs, batch_size):
def run(model, optimizer, loss, epochs, batch_size):
if model not in MODELS:
raise ValueError(f"Model {model} not supported")

Expand All @@ -136,15 +128,27 @@ def run(model, optimizer, loss, metric, epochs, batch_size):
model=model,
optimizer=optimizer,
loss=loss,
metric=metric,
epochs=epochs,
batch_size=batch_size,
)

click.echo("Job created with:")
click.echo(f" - id: {returned_job.id}")
click.echo(f" - name: {returned_job.name}")
click.echo(f" - url: {returned_job.studio_url}")
click.echo(
f"Job {returned_job.name} created.\n"
f" - id: {returned_job.id}\n"
f" - url: {returned_job.studio_url}\n"
)

logger.info(
f"Created a {model} training job at {datetime.now()}\n"
f" - id: {returned_job.id}\n"
f" - name: {returned_job.name}\n"
f" - url: {returned_job.studio_url}\n\n"
"Training parameters:\n"
f" - optimizer: {optimizer}\n"
f" - loss: {loss}\n"
f" - epochs: {epochs}\n"
f" - batch size: {batch_size}\n"
)


if __name__ == "__main__":
Expand Down
6 changes: 6 additions & 0 deletions scripts/azure/machine_learning/test_mobilenet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Smoke test: submit a short (2-epoch) mobilenet training job through the
# run_job entry point. Shebang added for consistency with test_models.sh and
# train_models.sh so the script also works when executed directly.
python -m scripts.azure.machine_learning.run_job \
    --model mobilenet \
    --optimizer adam \
    --loss binary_crossentropy \
    --epochs 2 \
    --batch_size 64
17 changes: 17 additions & 0 deletions scripts/azure/machine_learning/test_models.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/sh
# Submit a 1-epoch smoke-test training job for every supported architecture
# via the run_job entry point.

MODELS="mobilenet nasnet efficientnet efficientnetv2 densenet inceptionnet xception resnet resnetv2 convnext inceptionresnet vgg"

for arch in $MODELS; do
    python -m scripts.azure.machine_learning.run_job \
        --model "$arch" \
        --optimizer "adam" \
        --loss "binary_crossentropy" \
        --epochs "1" \
        --batch_size "32"
done

108 changes: 61 additions & 47 deletions scripts/azure/machine_learning/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,32 @@
from datetime import datetime

import click
import mlflow
import numpy as np
import tensorflow as tf
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes

from src.model.builders import (
ConvNeXtBuilder,
DenseNetBuilder,
EfficientNetBuilder,
EfficientNetV2Builder,
InceptionNetBuilder,
InceptionResNetBuilder,
MobileNetBuilder,
ResNetBuilder,
ResNetV2Builder,
VGGBuilder,
XceptionBuilder,
)
from src.model.director import ModelDirector
from src.dataset.dataset_loader import DatasetLoader
from src.config import EARLY_STOPPING_CONFIG, REDUCE_LR_CONFIG, MODELS

from src.config import (
RANDOM_SEED,
EARLY_STOPPING_CONFIG,
REDUCE_LR_CONFIG,
MODELS,
BUILDERS,
CALLBACKS,
METRICS,
config_logging
)

config_logging()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger = logging.getLogger("azure")


@click.command()
@click.command(
@click.option(
"--model", type=click.Choice(MODELS), default="mobilenet", help="Model to train"
)
@click.option(
Expand All @@ -47,57 +47,71 @@
default="binary_crossentropy",
help="Loss function to use",
)
@click.option(
"--metric",
type=click.Choice(["accuracy", "f1"]),
default="accuracy",
help="Metrics to use",
)
@click.option("--epochs", type=int, default=10, help="Number of epochs to train for")
def run(model, train, test, optimizer, loss, metric, epochs):
@click.option("--epochs", type=click.INT, default=10, help="Number of epochs to train for")
@click.option("--batch_size", type=click.INT, default=64, help="Batch size for dataset loaders")
@click.option("--job_name", type=click.STRING, help="Azure Machine Learning job name")
def run(model, train, test, optimizer, loss, epochs, batch_size, job_name):
mlflow.set_experiment("lung-cancer-detection")
mlflow_run = mlflow.start_run(run_name=f"train_{model}_{datetime.now().strftime('%Y%m%d%H%M%S')}")

mlflow.log_param("optimizer", optimizer)
mlflow.log_param("loss", loss)
mlflow.log_param("epochs", epochs)
mlflow.log_param("batch_size", batch_size)
mlflow.log_param("random_seed", RANDOM_SEED)

logger.info(f"Started training run at {datetime.now()}")
logger.info(
f"Run parameters - optimizer: {optimizer}, loss: {loss}, metrics: {metric}"
f"Run parameters - optimizer: {optimizer}, loss: {loss}"
)

builder = {
"convnext": ConvNeXtBuilder,
"densenet": DenseNetBuilder,
"efficientnet": EfficientNetBuilder,
"efficientnetv2": EfficientNetV2Builder,
"inceptionnet": InceptionNetBuilder,
"inceptionresnet": InceptionResNetBuilder,
"mobilenet": MobileNetBuilder,
"resnet": ResNetBuilder,
"resnetv2": ResNetV2Builder,
"vgg": VGGBuilder,
"xception": XceptionBuilder,
}[model]()
builder = BUILDERS[model]()

director = ModelDirector(builder)
model = director.make()
logger.info(f"Built model with {str(builder)}")
model_nn = director.make()
logger.info(f"Built model_nn with {str(builder)}")

train_loader = DatasetLoader(train)
test_loader = DatasetLoader(test)

train_loader.set_seed(RANDOM_SEED)
test_loader.set_seed(RANDOM_SEED)

train_dataset = train_loader.get_dataset()
test_dataset = test_loader.get_dataset()
logger.info("Loaded train and test datasets")

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model_nn.compile(optimizer=optimizer, loss=loss, metrics=METRICS)
logger.info("Compiled model")

ec = tf.keras.callbacks.EarlyStopping(**EARLY_STOPPING_CONFIG)
lr = tf.keras.callbacks.ReduceLROnPlateau(**REDUCE_LR_CONFIG)

model.fit(train_dataset, epochs=epochs, callbacks=[ec, lr])
history = model_nn.fit(train_dataset, epochs=epochs, callbacks=CALLBACKS)
logger.info("Trained model")

model.evaluate(test_dataset)
for metric, values in history.history.items():
for step, value in enumerate(values):
mlflow.log_metric(f"{metric}", value, step=step)

results = model_nn.evaluate(test_dataset, return_dict=True)
logger.info("Evaluated model")

for metric, value in results.items():
mlflow.log_metric(f"Final {metric}", value)

logger.info(f"Finished training at {datetime.now()}")

mlflow.tensorflow.save_model(
model=model_nn,
path=os.path.join(job_name, model),
)

mlflow.tensorflow.log_model(
model=model_nn,
registered_model_name=model,
artifact_path=model,
)

mlflow.end_run()


if __name__ == "__main__":
run() # pylint: disable=no-value-for-parameter
17 changes: 17 additions & 0 deletions scripts/azure/machine_learning/train_models.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/sh
# Launch a full (100-epoch) training job for each supported architecture.

# Submit one training job for the architecture given as $1.
submit_training_job() {
    python -m scripts.azure.machine_learning.run_job \
        --model "$1" \
        --optimizer "adam" \
        --loss "binary_crossentropy" \
        --epochs "100" \
        --batch_size "64"
}

for net in mobilenet nasnet efficientnet efficientnetv2 densenet inceptionnet xception resnet resnetv2 convnext inceptionresnet vgg; do
    submit_training_job "$net"
done

Loading

0 comments on commit b4e48ba

Please sign in to comment.