From e96cabb63044397b8e1a85837ada230cf32dfdb0 Mon Sep 17 00:00:00 2001
From: Jakub Owczarek
Date: Thu, 4 Jan 2024 23:21:18 +0100
Subject: [PATCH 1/2] Add scripts to train model and submit a job on Azure ML

---
Review notes (this section is ignored by git am):
- Dockerfile: COPY/conda now reference conda_dependencies.yml, the file this
  patch actually adds (was conda_dependencies.yaml, which does not exist).
- docker-context.yml: build.path is the build-context directory, not the
  Dockerfile itself.
- run_job.py: the job runs scripts.azure.machine_learning.train (there is no
  train_<model> module); this assumes AZURE_CODE_PATH is the repository root.
- train.py: --model is declared with click.option (was click.command) and
  --batch_size is accepted because run_job.py always passes it.
- index hashes and the From line are those of the original commit.

 requirements.txt                              |   2 +
 scripts/azure/machine_learning/Dockerfile     |   9 ++
 .../machine_learning/conda_dependencies.yml   |  17 ++
 .../azure/machine_learning/docker-context.yml |   4 +
 scripts/azure/machine_learning/run_job.py     | 157 ++++++++++++++++++
 scripts/azure/machine_learning/train.py       | 108 ++++++++++++
 src/config.py                                 |  33 +++-
 7 files changed, 327 insertions(+), 3 deletions(-)
 create mode 100644 scripts/azure/machine_learning/Dockerfile
 create mode 100644 scripts/azure/machine_learning/conda_dependencies.yml
 create mode 100644 scripts/azure/machine_learning/docker-context.yml
 create mode 100644 scripts/azure/machine_learning/run_job.py
 create mode 100644 scripts/azure/machine_learning/train.py

diff --git a/requirements.txt b/requirements.txt
index f08c6e6..8de4053 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@ scikit-image==0.20.0
 tensorflow==2.12.0
 tqdm==4.65.0
 pytest==7.4.0
+click==8.0.4
+azure-ai-ml==1.12.1
\ No newline at end of file
diff --git a/scripts/azure/machine_learning/Dockerfile b/scripts/azure/machine_learning/Dockerfile
new file mode 100644
index 0000000..b68f1dd
--- /dev/null
+++ b/scripts/azure/machine_learning/Dockerfile
@@ -0,0 +1,9 @@
+FROM continuumio/miniconda3
+
+RUN conda update -n base -c defaults conda
+
+COPY conda_dependencies.yml .
+RUN conda env create -f conda_dependencies.yml -q && \
+    rm conda_dependencies.yml && \
+    conda run pip cache purge && \
+    conda clean -a -y
diff --git a/scripts/azure/machine_learning/conda_dependencies.yml b/scripts/azure/machine_learning/conda_dependencies.yml
new file mode 100644
index 0000000..07d86f8
--- /dev/null
+++ b/scripts/azure/machine_learning/conda_dependencies.yml
@@ -0,0 +1,17 @@
+name: conda
+channels:
+  - conda-forge
+  - defaults
+  - anaconda
+dependencies:
+  - python
+  - pip
+  - pip:
+    - numpy==1.23.5
+    - pydicom==2.4.4
+    - scikit-image==0.20.0
+    - tensorflow==2.12.0
+    - tqdm==4.65.0
+    - pytest==7.4.0
+    - click==8.0.4
+    - azure-ai-ml==1.12.1
diff --git a/scripts/azure/machine_learning/docker-context.yml b/scripts/azure/machine_learning/docker-context.yml
new file mode 100644
index 0000000..68d29d1
--- /dev/null
+++ b/scripts/azure/machine_learning/docker-context.yml
@@ -0,0 +1,4 @@
+$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
+name: cancer-env
+build:
+  path: .
\ No newline at end of file
diff --git a/scripts/azure/machine_learning/run_job.py b/scripts/azure/machine_learning/run_job.py
new file mode 100644
index 0000000..53636bf
--- /dev/null
+++ b/scripts/azure/machine_learning/run_job.py
@@ -0,0 +1,157 @@
+import os
+from datetime import datetime
+
+import click
+from dotenv import load_dotenv
+from azure.ai.ml import MLClient, command, Input
+from azure.identity import DefaultAzureCredential
+from azure.ai.ml.entities import AmlCompute, Model
+from azure.ai.ml.constants import AssetTypes
+
+from src.config import MODELS
+
+
+load_dotenv()
+
+
+def connect_to_workspace():
+    """Create an MLClient from the AZURE_* workspace environment variables."""
+    subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
+    resource_group = os.getenv("AZURE_RESOURCE_GROUP")
+    workspace = os.getenv("AZURE_WORKSPACE")
+
+    ml_client = MLClient(
+        DefaultAzureCredential(), subscription_id, resource_group, workspace
+    )
+
+    return ml_client
+
+
+def get_compute(ml_client):
+    """Create the AZURE_COMPUTE_TARGET cluster when it cannot be fetched."""
+    cpu_compute_target = os.getenv("AZURE_COMPUTE_TARGET")
+    size = os.getenv("AZURE_COMPUTE_SIZE")
+    min_instances = os.getenv("AZURE_COMPUTE_MIN_INSTANCES")
+    max_instances = os.getenv("AZURE_COMPUTE_MAX_INSTANCES")
+
+    try:
+        ml_client.compute.get(cpu_compute_target)
+    except Exception:
+        click.echo("Creating a new cpu compute target...")
+        compute = AmlCompute(
+            name=cpu_compute_target,
+            size=size,
+            min_instances=min_instances,
+            max_instances=max_instances,
+        )
+        ml_client.compute.begin_create_or_update(compute).result()
+
+
+def submit_job(ml_client, model, optimizer, loss, metric, epochs, batch_size):
+    """Submit the training command job and return the created job."""
+    code = os.getenv("AZURE_CODE_PATH")
+    environment = os.getenv("AZURE_ENVIRONMENT")
+    type_ = os.getenv("AZURE_STORAGE_TYPE")
+    path = os.getenv("AZURE_STORAGE_PATH")
+    compute = os.getenv("AZURE_COMPUTE_TARGET")
+
+    train_path = os.path.join(path, "train")
+    test_path = os.path.join(path, "test")
+
+    command_job = command(
+        code=code,
+        command=(
+            # train.py is one module for every model; --model selects the builder.
+            "python -m scripts.azure.machine_learning.train"
+            " --train ${{inputs.train}} --test ${{inputs.test}}"
+            " --epochs ${{inputs.epochs}} --optimizer ${{inputs.optimizer}}"
+            " --loss ${{inputs.loss}} --metric ${{inputs.metric}}"
+            " --batch_size ${{inputs.batch_size}} --model ${{inputs.model}}"
+        ),
+        environment=environment,
+        inputs={
+            "train": Input(
+                type=type_,
+                path=train_path,
+            ),
+            "test": Input(
+                type=type_,
+                path=test_path,
+            ),
+            "optimizer": optimizer,
+            "loss": loss,
+            "metric": metric,
+            "epochs": epochs,
+            "batch_size": batch_size,
+            "model": model,
+        },
+        compute=compute,
+        name=f"train_{model}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
+    )
+
+    returned_job = ml_client.jobs.create_or_update(command_job)
+
+    return returned_job
+
+
+def register_model(ml_client, returned_job, run_name, run_description):
+    """Register the job's model output folder as an MLflow model asset."""
+    run_model = Model(
+        path=f"azureml://jobs/{returned_job.name}/outputs/artifacts/paths/model/",
+        name=run_name,
+        description=run_description,
+        type=AssetTypes.MLFLOW_MODEL,
+    )
+
+    ml_client.models.create_or_update(run_model)
+
+
+@click.command()
+@click.option("--model", type=click.Choice(MODELS), help="Model to train")
+@click.option(
+    "--optimizer",
+    type=click.Choice(["adam", "sgd"]),
+    default="adam",
+    help="Optimizer to use",
+)
+@click.option(
+    "--loss",
+    type=click.Choice(["binary_crossentropy", "categorical_crossentropy"]),
+    default="binary_crossentropy",
+    help="Loss function to use",
+)
+@click.option(
+    "--metric",
+    type=click.Choice(["accuracy", "f1"]),
+    default="accuracy",
+    help="Metrics to use",
+)
+@click.option("--epochs", type=int, default=10, help="Number of epochs to train for")
+@click.option("--batch_size", type=int, default=32, help="Batch size to use")
+def run(model, optimizer, loss, metric, epochs, batch_size):
+    """Ensure the compute target exists and submit a training job to Azure ML."""
+    if model not in MODELS:
+        raise ValueError(f"Model {model} not supported")
+
+    ml_client = connect_to_workspace()
+
+    get_compute(ml_client=ml_client)
+
+    returned_job = submit_job(
+        ml_client=ml_client,
+        model=model,
+        optimizer=optimizer,
+        loss=loss,
+        metric=metric,
+        epochs=epochs,
+        batch_size=batch_size,
+    )
+
+    click.echo("Job created with:")
+    click.echo(f" - id: {returned_job.id}")
+    click.echo(f" - name: {returned_job.name}")
+    click.echo(f" - url: {returned_job.studio_url}")
+
+
+if __name__ == "__main__":
+    run()  # pylint: disable=no-value-for-parameter
diff --git a/scripts/azure/machine_learning/train.py b/scripts/azure/machine_learning/train.py
new file mode 100644
index 0000000..bcf3f7e
--- /dev/null
+++ b/scripts/azure/machine_learning/train.py
@@ -0,0 +1,108 @@
+import os
+import logging
+from datetime import datetime
+
+import click
+import tensorflow as tf
+
+from src.model.builders import (
+    ConvNeXtBuilder,
+    DenseNetBuilder,
+    EfficientNetBuilder,
+    EfficientNetV2Builder,
+    InceptionNetBuilder,
+    InceptionResNetBuilder,
+    MobileNetBuilder,
+    ResNetBuilder,
+    ResNetV2Builder,
+    VGGBuilder,
+    XceptionBuilder,
+)
+from src.model.director import ModelDirector
+from src.dataset.dataset_loader import DatasetLoader
+from src.config import EARLY_STOPPING_CONFIG, REDUCE_LR_CONFIG, MODELS
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@click.command()
+@click.option(
+    "--model", type=click.Choice(MODELS), default="mobilenet", help="Model to train"
+)
+@click.option(
+    "--train", type=click.Path(exists=True), help="Path to the training dataset"
+)
+@click.option("--test", type=click.Path(exists=True), help="Path to the test dataset")
+@click.option(
+    "--optimizer",
+    type=click.Choice(["adam", "sgd"]),
+    default="adam",
+    help="Optimizer to use",
+)
+@click.option(
+    "--loss",
+    type=click.Choice(["binary_crossentropy", "categorical_crossentropy"]),
+    default="binary_crossentropy",
+    help="Loss function to use",
+)
+@click.option(
+    "--metric",
+    type=click.Choice(["accuracy", "f1"]),
+    default="accuracy",
+    help="Metrics to use",
+)
+@click.option("--epochs", type=int, default=10, help="Number of epochs to train for")
+@click.option("--batch_size", type=int, default=32, help="Batch size to use")
+def run(model, train, test, optimizer, loss, metric, epochs, batch_size):
+    """Build, compile, train and evaluate the selected model."""
+    logger.info(f"Started training run at {datetime.now()}")
+    logger.info(
+        f"Run parameters - optimizer: {optimizer}, loss: {loss}, metrics: {metric}"
+    )
+    # NOTE(review): --batch_size is accepted because run_job.py always passes
+    # it; DatasetLoader's batching API is not visible here, so it is only logged.
+    logger.info(f"Requested batch size: {batch_size}")
+
+    builder = {
+        "convnext": ConvNeXtBuilder,
+        "densenet": DenseNetBuilder,
+        "efficientnet": EfficientNetBuilder,
+        "efficientnetv2": EfficientNetV2Builder,
+        "inceptionnet": InceptionNetBuilder,
+        "inceptionresnet": InceptionResNetBuilder,
+        "mobilenet": MobileNetBuilder,
+        "resnet": ResNetBuilder,
+        "resnetv2": ResNetV2Builder,
+        "vgg": VGGBuilder,
+        "xception": XceptionBuilder,
+    }[model]()
+    director = ModelDirector(builder)
+    model = director.make()
+    logger.info(f"Built model with {str(builder)}")
+
+    train_loader = DatasetLoader(train)
+    test_loader = DatasetLoader(test)
+
+    train_dataset = train_loader.get_dataset()
+    test_dataset = test_loader.get_dataset()
+    logger.info("Loaded train and test datasets")
+
+    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+    logger.info("Compiled model")
+
+    ec = tf.keras.callbacks.EarlyStopping(**EARLY_STOPPING_CONFIG)
+    lr = tf.keras.callbacks.ReduceLROnPlateau(**REDUCE_LR_CONFIG)
+
+    model.fit(train_dataset, epochs=epochs, callbacks=[ec, lr])
+    logger.info("Trained model")
+
+    model.evaluate(test_dataset)
+    logger.info("Evaluated model")
+
+    logger.info(f"Finished training at {datetime.now()}")
+
+
+if __name__ == "__main__":
+    run()  # pylint: disable=no-value-for-parameter
diff --git a/src/config.py b/src/config.py
index 6cb0aed..c24c1dd 100644
--- a/src/config.py
+++ b/src/config.py
@@ -2,10 +2,37 @@ from datetime import datetime
 
 # Get the current date to create a dynamic log filename
-current_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
-log_filename = f'lung_cancer_detection_{current_date}.log'
+current_date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+log_filename = f"lung_cancer_detection_{current_date}.log"
 
 logging.basicConfig(
     filename=log_filename,
     level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
+    format="%(asctime)s - %(levelname)s - %(message)s",
 )
+
+EARLY_STOPPING_CONFIG = {
+    "monitor": "val_loss",
+    "min_delta": 0.001,
+    "patience": 3,
+}
+
+REDUCE_LR_CONFIG = {
+    "monitor": "val_loss",
+    "factor": 0.1,
+    "patience": 2,
+    "min_delta": 0.001,
+}
+
+MODELS = [
+    "convnext",
+    "densenet",
+    "efficientnet",
+    "efficientnetv2",
+    "inceptionnet",
+    "inceptionresnet",
+    "mobilenet",
+    "resnet",
+    "resnetv2",
+    "vgg",
+    "xception",
+]
From 7e99c43a0af02f1cdb95e57ebdea3b05573184d3 Mon Sep 17 00:00:00 2001
From: Jakub Owczarek
Date: Thu, 4 Jan 2024 23:24:15 +0100
Subject: [PATCH 2/2] Uncomment pydicom

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 8de4053..8a2f650 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 numpy==1.23.5
-#pydicom==2.4.4
+pydicom==2.4.4
 scikit-image==0.20.0
 tensorflow==2.12.0
 tqdm==4.65.0