Merge pull request #2 from owczr/develop
Add Azure VM scripts
owczr authored Jan 4, 2024
2 parents dcc2f06 + 5af7ed1 commit 552cfc6
Showing 11 changed files with 177 additions and 1 deletion.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
 numpy==1.23.5
-pydicom==2.4.4
+#pydicom==2.4.4
 scikit-image==0.20.0
 tensorflow==2.12.0
 tqdm==4.65.0
File renamed without changes.
Empty file.
Empty file.
57 changes: 57 additions & 0 deletions scripts/azure/virtual_machine/download_dataset.sh
@@ -0,0 +1,57 @@
#!/bin/bash
# This script downloads the LIDC-IDRI dataset from the TCIA website.

# URL variables
nbia_url="https://cbiit-download.nci.nih.gov/nbia/releases/ForTCIA/NBIADataRetriever_4.4.1/nbia-data-retriever-4.4.1.deb"
images_url="https://wiki.cancerimagingarchive.net/download/attachments/1966254/TCIA_LIDC-IDRI_20200921.tcia?version=1&modificationDate=1600709265077&api=v2"
annotations_url="https://wiki.cancerimagingarchive.net/download/attachments/1966254/LIDC-XML-only.zip?version=1&modificationDate=1530215018015&api=v2"

# Make directory for the dataset
dataset_dir="/mnt/data"
mkdir -p "$dataset_dir"
echo -e "-- Using directory $dataset_dir\n"

# Download the NBIA Data Retriever
nbia_file_path="$dataset_dir/nbia-data-retriever.deb"

echo -e "-- Downloading the NBIA Data Retriever...\n"
wget -O "$nbia_file_path" "$nbia_url"
echo -e "-- Downloaded to: $nbia_file_path\n"

# Download the manifest file
images_file_path="$dataset_dir/images.tcia"

echo -e "-- Downloading the manifest file...\n"
wget -O "$images_file_path" "$images_url"
echo -e "-- Downloaded to: $images_file_path\n"

# Download the annotations file
annotations_file_path="$dataset_dir/annotations.zip"

echo -e "-- Downloading the annotations file...\n"
wget -O "$annotations_file_path" "$annotations_url"
echo -e "-- Downloaded to: $annotations_file_path\n"

# Ensure java is installed
echo -e "-- Checking if java is installed...\n"
if ! command -v java &> /dev/null; then
    echo -e "-- Java is not installed. Installing...\n"
    # -y keeps apt-get from blocking on a confirmation prompt in an unattended run
    sudo -S apt-get install -y default-jre
    echo -e "-- Java installed.\n"
else
    echo -e "-- Java is installed.\n"
fi

# Install the NBIA Data Retriever
echo "-- Installing the NBIA Data Retriever...\n"
sudo -S dpkg -r $nbia_file_path; sudo -S dpkg -i $nbia_file_path
echo -e "-- Installed.\n"

# Download the dataset
output_dir="$dataset_dir/images"
manifest_path="$images_file_path"

echo -e "-- Downloading the dataset...\n"
/opt/nbia-data-retriever/nbia-data-retriever --cli "$manifest_path" -d "$output_dir" -v -f
echo -e "-- Downloaded to: $output_dir\n"

9 changes: 9 additions & 0 deletions scripts/azure/virtual_machine/move_files.sh
@@ -0,0 +1,9 @@
#!/bin/bash

input_dir="/home/jo-engineers-thesis/dataset/images/images/LIDC-IDRI"
output_dir="/mnt/data/images/images/LIDC-IDRI"

#mv "$input_dir"/* "$output_dir"/
for file in "$input_dir"/*; do
    mv "$file" "$output_dir"/
done
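The commented-out single mv expands the glob into one command line, which can fail with "Argument list too long" on a dataset of this size; the loop avoids that because the glob never reaches an external command's argument list. A find-based sketch (assuming GNU find and coreutils) that batches the moves safely:

# Hypothetical alternative: find enumerates entries and batches them under ARG_MAX
find "$input_dir" -mindepth 1 -maxdepth 1 -exec mv -t "$output_dir" {} +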
24 changes: 24 additions & 0 deletions scripts/azure/virtual_machine/process_dataset.py
@@ -0,0 +1,24 @@
import click

from src.preprocessing.dataset_processor import DatasetProcessor


@click.command()
@click.option("-i", "--input_path", type=click.Path(exists=True, file_okay=False, dir_okay=True),
              help="Path to directory containing patient data with DICOM images")
@click.option("-o", "--output_path", type=click.Path(file_okay=False, dir_okay=True, writable=True),
              help="Path to output directory where processed DICOMs will be saved")
@click.option("-t", "--train_size", type=float, default=0.8,
              help="Train size for train/test split")
def run(input_path, output_path, train_size):
    try:
        dp = DatasetProcessor(input_path)
        # NOTE: train_size is parsed but not used here; the split is done
        # separately by train_test_split.py.
        dp.process_and_save(output_path)
        click.echo(f"Processing completed. Data saved to {output_path}")
    except Exception as e:
        click.echo(f"An error occurred: {e}", err=True)


if __name__ == "__main__":
    run()
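Because the script imports from src.preprocessing, it must run from the repository root (or with PYTHONPATH pointing there). A hypothetical invocation using the paths seen in the other scripts; the output directory is a placeholder:

# Hypothetical paths; run from the repository root so src/ resolves
python scripts/azure/virtual_machine/process_dataset.py \
    -i /mnt/data/images/images/LIDC-IDRI \
    -o /mnt/data/processed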
9 changes: 9 additions & 0 deletions scripts/azure/virtual_machine/subset_files.sh
@@ -0,0 +1,9 @@
#!/bin/bash

SOURCE_DIR=$1
DESTINATION_DIR=$2
NUMBER_OF_FILES=$3

echo "Creating a subset of $NUMBER_OF_FILES files from $SOURCE_DIR in $DESTINATION_DIR"
find "$SOURCE_DIR" -type f | shuf -n "$NUMBER_OF_FILES" | xargs -I {} cp {} "$DESTINATION_DIR"
echo "Done!"
23 changes: 23 additions & 0 deletions scripts/azure/virtual_machine/train_test_split.py
@@ -0,0 +1,23 @@
import click

from src.preprocessing.dataset_processor import DatasetProcessor


@click.command()
@click.option("-d", "--dataset_path", type=click.Path(exists=True, file_okay=False, dir_okay=True),
              help="Path to directory containing processed dataset")
@click.option("-t", "--train_size", type=float, default=0.8,
              help="Train size for train/test split")
def run(dataset_path, train_size):
    try:
        dp = DatasetProcessor(dataset_path)
        click.echo(f"Splitting processed dataset at {dataset_path}\nTrain split: {train_size}")
        dp.train_test_split(dataset_path, train_size=train_size)
        click.echo("Splitting completed.")
    except Exception as e:
        click.echo(f"An error occurred: {e}", err=True)


if __name__ == "__main__":
    run()
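As with process_dataset.py, this runs from the repository root; a hypothetical 90/10 split over a placeholder dataset path:

python scripts/azure/virtual_machine/train_test_split.py -d /mnt/data/processed -t 0.9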
36 changes: 36 additions & 0 deletions scripts/azure/virtual_machine/upload_dataset.sh
@@ -0,0 +1,36 @@
#!/bin/bash

LOCAL_DATA_PATH=$1
STORAGE_ACCOUNT_NAME=$2
CONTAINER_NAME=$3
BLOB_DIRECTORY_NAME=$4

upload_directory() {
    local dir_path=$1
    local blob_dir_path=$2
    for filepath in "$dir_path"/*; do
        if [ -d "$filepath" ]; then
            upload_directory "$filepath" "$blob_dir_path/$(basename "$filepath")"
        elif [ -f "$filepath" ]; then
            echo "Uploading $filepath to $blob_dir_path/$(basename "$filepath")"
            az storage blob upload --account-name "$STORAGE_ACCOUNT_NAME" \
                --container-name "$CONTAINER_NAME" --file "$filepath" \
                --name "$blob_dir_path/$(basename "$filepath")"
        fi
    done
}

echo "Started uplading $LOCAL_DATA_PATH to Azure Storage with arguments:"
echo " --account-name $STORAGE_ACCOUNT_NAME"
echo " --container-name $CONTAINER_NAME"

echo "Uploading train directory..."
upload_directory "$LOCAL_DATA_PATH/train" "$BLOB_DIRECTORY_NAME/train"
echo "Done!"

echo "Uploading test directory..."
upload_directory "$LOCAL_DATA_PATH/test" "$BLOB_DIRECTORY_NAME/test"
echo "Done!"

echo "Upload complete!"

18 changes: 18 additions & 0 deletions scripts/azure/virtual_machine/upload_dataset_2.sh
@@ -0,0 +1,18 @@
#!/bin/bash

LOCAL_DATA_PATH=$1
STORAGE_ACCOUNT_NAME=$2
CONTAINER_NAME=$3
SAS_TOKEN=$4

REMOTE_STORAGE_PATH="https://$STORAGE_ACCOUNT_NAME.blob.core.windows.net/$CONTAINER_NAME"

echo "Uploading $LOCAL_DATA_PATH to $REMOTE_STORAGE_PATH"
echo "Uploading train directory..."
azcopy copy "$LOCAL_DATA_PATH/train" "$REMOTE_STORAGE_PATH?$SAS_TOKEN" --recursive=true

echo "Uploading test directory..."
azcopy copy "$LOCAL_DATA_PATH/test" "$REMOTE_STORAGE_PATH?$SAS_TOKEN" --recursive=true

echo "Upload complete!"
