Merge pull request #2 from owczr/develop
Add Azure VM scripts
owczr authored Jan 4, 2024
2 parents dcc2f06 + 5af7ed1 commit 552cfc6
Showing 11 changed files with 177 additions and 1 deletion.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
 numpy==1.23.5
-pydicom==2.4.4
+#pydicom==2.4.4
 scikit-image==0.20.0
 tensorflow==2.12.0
 tqdm==4.65.0
File renamed without changes.
Empty file.
Empty file.
57 changes: 57 additions & 0 deletions scripts/azure/virtual_machine/download_dataset.sh
@@ -0,0 +1,57 @@
#!/bin/bash
# This script downloads the LIDC-IDRI dataset from the TCIA website.

# URL variables
nbia_url="https://cbiit-download.nci.nih.gov/nbia/releases/ForTCIA/NBIADataRetriever_4.4.1/nbia-data-retriever-4.4.1.deb"
images_url="https://wiki.cancerimagingarchive.net/download/attachments/1966254/TCIA_LIDC-IDRI_20200921.tcia?version=1&modificationDate=1600709265077&api=v2"
annotations_url="https://wiki.cancerimagingarchive.net/download/attachments/1966254/LIDC-XML-only.zip?version=1&modificationDate=1530215018015&api=v2"

# Make directory for the dataset
dataset_dir="/mnt/data"
mkdir -p "$dataset_dir"
echo -e "-- Using directory $dataset_dir\n"

# Download the NBIA Data Retriever
nbia_file_path="$dataset_dir/nbia-data-retriever.deb"

echo -e "-- Downloading the NBIA Data Retriever...\n"
wget -O "$nbia_file_path" "$nbia_url"
echo -e "-- Downloaded to: $nbia_file_path\n"

# Download the manifest file
images_file_path="$dataset_dir/images.tcia"

echo -e "-- Downloading the manifest file...\n"
wget -O "$images_file_path" "$images_url"
echo -e "-- Downloaded to: $images_file_path\n"

# Download the annotations file
annotations_file_path="$dataset_dir/annotations.zip"

echo -e "-- Downloading the annotations file...\n"
wget -O "$annotations_file_path" "$annotations_url"
echo -e "-- Downloaded to: $annotations_file_path\n"

# Ensure java is installed
echo -e "-- Checking if java is installed...\n"
if ! command -v java &> /dev/null; then
    echo -e "-- Java is not installed. Installing...\n"
    # -y keeps apt-get from blocking on a confirmation prompt in an unattended run
    sudo -S apt-get install -y default-jre
    echo -e "-- Java installed.\n"
else
    echo -e "-- Java is installed.\n"
fi

# Install the NBIA Data Retriever
echo "-- Installing the NBIA Data Retriever...\n"
sudo -S dpkg -r $nbia_file_path; sudo -S dpkg -i $nbia_file_path
echo -e "-- Installed.\n"

# Download the dataset
output_dir="$dataset_dir/images"
manifest_path="$images_file_path"

echo -e "-- Downloading the dataset...\n"
/opt/nbia-data-retriever/nbia-data-retriever --cli "$manifest_path" -d "$output_dir" -v -f
echo -e "-- Downloaded to: $output_dir\n"

9 changes: 9 additions & 0 deletions scripts/azure/virtual_machine/move_files.sh
@@ -0,0 +1,9 @@
#!/bin/bash

input_dir="/home/jo-engineers-thesis/dataset/images/images/LIDC-IDRI"
output_dir="/mnt/data/images/images/LIDC-IDRI"

#mv "$input_dir"/* "$output_dir"/
for file in "$input_dir"/*; do
    mv "$file" "$output_dir"/
done
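The commented-out single mv expands the glob into one command line, which can fail with "Argument list too long" on a dataset of this size; the loop avoids that because the glob never reaches an external command's argument list. A find-based sketch (assuming GNU find and coreutils) that batches the moves safely:

# Hypothetical alternative: find enumerates entries and batches them under ARG_MAX
find "$input_dir" -mindepth 1 -maxdepth 1 -exec mv -t "$output_dir" {} +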
24 changes: 24 additions & 0 deletions scripts/azure/virtual_machine/process_dataset.py
@@ -0,0 +1,24 @@
import click

from src.preprocessing.dataset_processor import DatasetProcessor


@click.command()
@click.option("-i", "--input_path", type=click.Path(exists=True, file_okay=False, dir_okay=True),
              help="Path to directory containing patient data with DICOM images")
@click.option("-o", "--output_path", type=click.Path(file_okay=False, dir_okay=True, writable=True),
              help="Path to output directory where processed DICOMs will be saved")
@click.option("-t", "--train_size", type=float, default=0.8,
              help="Train size for train/test split")
def run(input_path, output_path, train_size):
    try:
        dp = DatasetProcessor(input_path)
        # NOTE: train_size is parsed but not used here; the split is done
        # separately by train_test_split.py.
        dp.process_and_save(output_path)
        click.echo(f"Processing completed. Data saved to {output_path}")
    except Exception as e:
        click.echo(f"An error occurred: {e}", err=True)


if __name__ == "__main__":
    run()
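Because the script imports from src.preprocessing, it must run from the repository root (or with PYTHONPATH pointing there). A hypothetical invocation using the paths seen in the other scripts; the output directory is a placeholder:

# Hypothetical paths; run from the repository root so src/ resolves
python scripts/azure/virtual_machine/process_dataset.py \
    -i /mnt/data/images/images/LIDC-IDRI \
    -o /mnt/data/processed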
9 changes: 9 additions & 0 deletions scripts/azure/virtual_machine/subset_files.sh
@@ -0,0 +1,9 @@
#!/bin/bash

SOURCE_DIR=$1
DESTINATION_DIR=$2
NUMBER_OF_FILES=$3

echo "Creating a subset of $NUMBER_OF_FILES files from $SOURCE_DIR in $DESTINATION_DIR"
find "$SOURCE_DIR" -type f | shuf -n "$NUMBER_OF_FILES" | xargs -I {} cp {} "$DESTINATION_DIR"
echo "Done!"
23 changes: 23 additions & 0 deletions scripts/azure/virtual_machine/train_test_split.py
@@ -0,0 +1,23 @@
import click

from src.preprocessing.dataset_processor import DatasetProcessor


@click.command()
@click.option("-d", "--dataset_path", type=click.Path(exists=True, file_okay=False, dir_okay=True),
              help="Path to directory containing processed dataset")
@click.option("-t", "--train_size", type=float, default=0.8,
              help="Train size for train/test split")
def run(dataset_path, train_size):
    try:
        dp = DatasetProcessor(dataset_path)
        click.echo(f"Splitting processed dataset at {dataset_path}\nTrain split: {train_size}")
        dp.train_test_split(dataset_path, train_size=train_size)
        click.echo("Splitting completed.")
    except Exception as e:
        click.echo(f"An error occurred: {e}", err=True)


if __name__ == "__main__":
    run()
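As with process_dataset.py, this runs from the repository root; a hypothetical 90/10 split over a placeholder dataset path:

python scripts/azure/virtual_machine/train_test_split.py -d /mnt/data/processed -t 0.9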
36 changes: 36 additions & 0 deletions scripts/azure/virtual_machine/upload_dataset.sh
@@ -0,0 +1,36 @@
#!/bin/bash

LOCAL_DATA_PATH=$1
STORAGE_ACCOUNT_NAME=$2
CONTAINER_NAME=$3
BLOB_DIRECTORY_NAME=$4

upload_directory() {
    local dir_path=$1
    local blob_dir_path=$2
    for filepath in "$dir_path"/*; do
        if [ -d "$filepath" ]; then
            upload_directory "$filepath" "$blob_dir_path/$(basename "$filepath")"
        elif [ -f "$filepath" ]; then
            echo "Uploading $filepath to $blob_dir_path/$(basename "$filepath")"
            az storage blob upload --account-name "$STORAGE_ACCOUNT_NAME" \
                --container-name "$CONTAINER_NAME" --file "$filepath" \
                --name "$blob_dir_path/$(basename "$filepath")"
        fi
    done
}

echo "Started uplading $LOCAL_DATA_PATH to Azure Storage with arguments:"
echo " --account-name $STORAGE_ACCOUNT_NAME"
echo " --container-name $CONTAINER_NAME"

echo "Uploading train directory..."
upload_directory "$LOCAL_DATA_PATH/train" "$BLOB_DIRECTORY_NAME/train"
echo "Done!"

echo "Uploading test directory..."
upload_directory "$LOCAL_DATA_PATH/test" "$BLOB_DIRECTORY_NAME/test"
echo "Done!"

echo "Upload complete!"

18 changes: 18 additions & 0 deletions scripts/azure/virtual_machine/upload_dataset_2.sh
@@ -0,0 +1,18 @@
#!/bin/bash

LOCAL_DATA_PATH=$1
STORAGE_ACCOUNT_NAME=$2
CONTAINER_NAME=$3
SAS_TOKEN=$4

REMOTE_STORAGE_PATH="https://$STORAGE_ACCOUNT_NAME.blob.core.windows.net/$CONTAINER_NAME"

echo "Uploading $LOCAL_DATA_PATH to $REMOTE_STORAGE_PATH"
echo "Uploading train directory..."
azcopy copy "$LOCAL_DATA_PATH/train" "$REMOTE_STORAGE_PATH?$SAS_TOKEN" --recursive=true

echo "Uploading test directory..."
azcopy copy "$LOCAL_DATA_PATH/test" "$REMOTE_STORAGE_PATH?$SAS_TOKEN" --recursive=true

echo "Upload complete!"
