From 260c707bf12a374dbb2d6f80ef42fb4988c4dea2 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 30 Sep 2024 12:53:28 -0700 Subject: [PATCH 01/95] [gpu] clean-up of sources.list and keyring file assertion --- gpu/install_gpu_driver.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 10b1aa061..10821449e 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1126,8 +1126,8 @@ function clean_up_sources_lists() { # # bigtop (primary) # - local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" + local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" @@ -1203,6 +1203,16 @@ function clean_up_sources_lists() { sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list fi + # + # cran-r + # + if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + rm -f /usr/share/keyrings/cran-r.gpg + curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7' | \ + gpg --dearmor -o /usr/share/keyrings/cran-r.gpg + sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list + fi + # # mysql # From c248aaf75e9483dffa010053504b3914768d8f69 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 4 Oct 2024 14:07:28 -0700 Subject: [PATCH 02/95] merge from master * allow main to access dkms certs * remove full upgrade * tested sources.list cleanup function * only unhold systemd on debian12 where the build breaks otherwise --- gpu/install_gpu_driver.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 10821449e..6c7587d90 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1234,8 +1234,6 @@ if is_debian ; then apt-mark unhold systemd libsystemd0 ; fi fi -configure_dkms_certs - main clear_dkms_key From f942492b18ad9858929064272091ffd9bffb3020 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 10 Oct 2024 11:22:36 -0700 Subject: [PATCH 03/95] merged from custom-images/examples/secure-boot/install_gpu_driver.sh --- gpu/install_gpu_driver.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 6c7587d90..10821449e 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1234,6 +1234,8 @@ if is_debian ; then apt-mark unhold systemd libsystemd0 ; fi fi +configure_dkms_certs + main clear_dkms_key From d6a86cb7a5b263d1f92a1b2ed177c2d93b74ab1e Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Thu, 10 Oct 2024 12:32:07 -0700 Subject: [PATCH 04/95] added comments for difficut to understand functions --- gpu/install_gpu_driver.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 10821449e..b8a65109b 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -56,8 +56,10 @@ function remove_old_backports { done } +# Return true if the first argument is equal to or less than the second argument function compare_versions_lte { [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; } +# Return true if the first argument is less than the second argument function compare_versions_lt() { [ "$1" = "$2" ] && return 1 || compare_versions_lte $1 $2 } From 3e8007e3ce15f1ef6127346423bc41135b67e095 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 8 Aug 2024 09:36:00 -0700 Subject: [PATCH 05/95] tested with 24.06 ; using conda for cuda 12 --- rapids/rapids.sh | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 9cc708855..c5496932e 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -63,16 +63,17 @@ function get_metadata_attribute() { /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" } -readonly DEFAULT_DASK_RAPIDS_VERSION="23.12" +readonly DEFAULT_DASK_RAPIDS_VERSION="24.06" readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="22.10.0" +readonly DEFAULT_SPARK_RAPIDS_VERSION="24.06" if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then readonly DEFAULT_CUDA_VERSION="11.8" readonly DEFAULT_XGBOOST_VERSION="2.0.3" readonly SPARK_VERSION="${SPARK_VERSION_ENV}" + readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="" else readonly DEFAULT_CUDA_VERSION="10.1" readonly DEFAULT_XGBOOST_VERSION="1.0.0" @@ -87,14 +88,9 @@ readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RUN_WORKER_ON_MASTER=$(get_metadata_attribute 'dask-cuda-worker-on-master' 'true') # RAPIDS config -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) -if [[ "${CUDA_VERSION%%.*}" == 12 ]]; then - # at the time of writing 20240721 there is no support for the 12.x - # releases of cudatoolkit package in mamba. For the time being, - # we will use a maximum of 11.8 - CUDA_VERSION="11.8" -fi -readonly CUDA_VERSION +readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) +function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } +function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } # SPARK config readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) @@ -124,14 +120,21 @@ function execute_with_retries() { } function install_dask_rapids() { - if is_debian11 || is_debian12 || is_ubuntu20 || is_ubuntu22 ; then - local python_ver="3.10" - else - local python_ver="3.9" + if is_cuda12 ; then + # This task takes a lot of memory and time. 
The 15G available in n1-standard-4 is insufficient + conda config --set channel_priority flexible + conda create -n "rapids-${RAPIDS_VERSION}" -c rapidsai -c conda-forge -c nvidia \ + "rapids=${RAPIDS_VERSION}" python="3.11" "cuda-version=${CUDA_VERSION}" + elif is_cuda11 ; then + if is_debian11 || is_debian12 || is_ubuntu20 || is_ubuntu22 ; then + local python_ver="3.10" + else + local python_ver="3.9" + fi + # Install cudatoolkit, pandas, rapids and cudf + mamba install -m -n 'dask-rapids' -y --no-channel-priority -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + "cudatoolkit=11.0" "pandas<1.5" "rapids" "cudf" "python=${python_ver}" fi - # Install RAPIDS, cudatoolkit - mamba install -m -n 'dask-rapids' -y --no-channel-priority -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ - "cudatoolkit=${CUDA_VERSION}" "pandas<1.5" "rapids=${RAPIDS_VERSION}" "python=${python_ver}" } function install_spark_rapids() { From c3855469299b4dd44debf212bf3ddb09ae2f10b7 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 8 Aug 2024 09:36:00 -0700 Subject: [PATCH 06/95] tested with 24.06 ; using conda for cuda 12 inlined functions and re-ordered definitions using 22.08 max for cuda 11 --- rapids/rapids.sh | 108 +++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 65 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index c5496932e..c70253dc2 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -7,67 +7,32 @@ if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" fi -function os_id() { - grep '^ID=' /etc/os-release | cut -d= -f2 | xargs -} - -function os_version() { - grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs -} - -function os_codename() { - grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs -} - -function is_rocky() { - [[ "$(os_id)" == 'rocky' ]] -} - -function is_ubuntu() { - [[ "$(os_id)" == 'ubuntu' ]] -} - -function is_ubuntu20() { - is_ubuntu && [[ "$(os_version)" == '20.04'* ]] -} - -function is_ubuntu22() { - is_ubuntu && [[ "$(os_version)" == '22.04'* ]] -} - -function is_debian() { - [[ "$(os_id)" == 'debian' ]] -} - -function is_debian11() { - is_debian && [[ "$(os_version)" == '11'* ]] -} - -function is_debian12() { - is_debian && [[ "$(os_version)" == '12'* ]] -} - -function os_vercat() { - if is_ubuntu ; then - os_version | sed -e 's/[^0-9]//g' - elif is_rocky ; then - os_version | sed -e 's/[^0-9].*$//g' - else - os_version - fi -} +function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } +function os_version() { grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; } +function os_codename() { grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; } +function is_rocky() { [[ "$(os_id)" == 'rocky' ]] ; } +function is_rocky8() { is_rocky && [[ "$(os_version)" == '8'* ]] ; } +function is_rocky9() { is_rocky && [[ "$(os_version)" == '9'* ]] ; } +function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } +function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } +function is_ubuntu20() { is_ubuntu && [[ "$(os_version)" == '20.04'* ]] ; } +function is_ubuntu22() { is_ubuntu && [[ "$(os_version)" == '22.04'* ]] ; } +function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } +function is_debian10() { is_debian && [[ "$(os_version)" == '10'* ]] ; } +function is_debian11() { is_debian && [[ "$(os_version)" == '11'* ]] ; } +function is_debian12() { is_debian && [[ "$(os_version)" == '12'* ]] ; } +function os_vercat() { if is_ubuntu 
; then os_version | sed -e 's/[^0-9]//g' + elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' + else os_version ; fi ; } function get_metadata_attribute() { local -r attribute_name=$1 - local -r default_value=$2 + local -r default_value="${2:-}" /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" } -readonly DEFAULT_DASK_RAPIDS_VERSION="24.06" -readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) - readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="24.06" +readonly DEFAULT_SPARK_RAPIDS_VERSION="24.06.0" if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then readonly DEFAULT_CUDA_VERSION="11.8" @@ -81,16 +46,24 @@ else readonly SPARK_VERSION="2.x" fi +# RAPIDS config +readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) +function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } +function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } + +if is_cuda11 ; then DEFAULT_DASK_RAPIDS_VERSION="22.08" +else DEFAULT_DASK_RAPIDS_VERSION="24.06" ; fi + +readonly DEFAULT_DASK_RAPIDS_VERSION +readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) + readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RUN_WORKER_ON_MASTER=$(get_metadata_attribute 'dask-cuda-worker-on-master' 'true') -# RAPIDS config -readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) -function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } -function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } + # SPARK config readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) @@ -121,20 +94,25 @@ function execute_with_retries() { function install_dask_rapids() { if is_cuda12 ; then - # This task takes a lot of memory and time. The 15G available in n1-standard-4 is insufficient - conda config --set channel_priority flexible - conda create -n "rapids-${RAPIDS_VERSION}" -c rapidsai -c conda-forge -c nvidia \ - "rapids=${RAPIDS_VERSION}" python="3.11" "cuda-version=${CUDA_VERSION}" + local pandas_spec='pandas' + local python_ver="3.11" elif is_cuda11 ; then + local pandas_spec='pandas<1.5' if is_debian11 || is_debian12 || is_ubuntu20 || is_ubuntu22 ; then local python_ver="3.10" else local python_ver="3.9" fi - # Install cudatoolkit, pandas, rapids and cudf - mamba install -m -n 'dask-rapids' -y --no-channel-priority -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ - "cudatoolkit=11.0" "pandas<1.5" "rapids" "cudf" "python=${python_ver}" fi + + # Install cuda, pandas, rapids, dask and cudf + mamba="/opt/conda/default/bin/mamba" + "${mamba}" install -m -n 'dask-rapids' -y --no-channel-priority \ + -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + "cuda-version=${CUDA_VERSION}" \ + "${pandas_spec}" \ + "rapids=${RAPIDS_VERSION}" \ + dask cudf python="${python_ver}" } function install_spark_rapids() { From 4bf628aa19df9d2f251c4560a03c62055a392f12 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Thu, 8 Aug 2024 21:52:52 -0700 Subject: [PATCH 07/95] removed os check functions and the use of them --- rapids/rapids.sh | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index c70253dc2..6a29ae277 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -7,24 +7,6 @@ if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" fi -function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } -function os_version() { grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; } -function os_codename() { grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; } -function is_rocky() { [[ "$(os_id)" == 'rocky' ]] ; } -function is_rocky8() { is_rocky && [[ "$(os_version)" == '8'* ]] ; } -function is_rocky9() { is_rocky && [[ "$(os_version)" == '9'* ]] ; } -function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } -function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } -function is_ubuntu20() { is_ubuntu && [[ "$(os_version)" == '20.04'* ]] ; } -function is_ubuntu22() { is_ubuntu && [[ "$(os_version)" == '22.04'* ]] ; } -function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } -function is_debian10() { is_debian && [[ "$(os_version)" == '10'* ]] ; } -function is_debian11() { is_debian && [[ "$(os_version)" == '11'* ]] ; } -function is_debian12() { is_debian && [[ "$(os_version)" == '12'* ]] ; } -function os_vercat() { if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' - elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' - else os_version ; fi ; } - function get_metadata_attribute() { local -r attribute_name=$1 local -r default_value="${2:-}" @@ -98,11 +80,7 @@ function install_dask_rapids() { local python_ver="3.11" elif is_cuda11 ; then local pandas_spec='pandas<1.5' - if is_debian11 || is_debian12 || is_ubuntu20 || is_ubuntu22 ; then - local python_ver="3.10" - else - local python_ver="3.9" - fi + local python_ver="3.9" fi # Install cuda, pandas, rapids, dask and cudf @@ -112,7 +90,7 @@ function install_dask_rapids() { "cuda-version=${CUDA_VERSION}" \ "${pandas_spec}" \ "rapids=${RAPIDS_VERSION}" \ - dask cudf python="${python_ver}" + dask cudf "python=${python_ver}" } function install_spark_rapids() { From e370b80a3e394232ca83e3424145319b97934269 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 8 Aug 2024 22:10:56 -0700 Subject: [PATCH 08/95] capturing runtime of mamba install --- rapids/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 6a29ae277..a37d6949c 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -85,7 +85,7 @@ function install_dask_rapids() { # Install cuda, pandas, rapids, dask and cudf mamba="/opt/conda/default/bin/mamba" - "${mamba}" install -m -n 'dask-rapids' -y --no-channel-priority \ + time "${mamba}" install -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ "cuda-version=${CUDA_VERSION}" \ "${pandas_spec}" \ From cecf837f91e6cb310e7ded9f3cbec58fc35e8625 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Thu, 8 Aug 2024 23:49:05 -0700 Subject: [PATCH 09/95] retry failed mamba with conda --- rapids/rapids.sh | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index a37d6949c..f224a35af 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -14,7 +14,7 @@ function get_metadata_attribute() { } readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="24.06.0" +readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.0" if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then readonly DEFAULT_CUDA_VERSION="11.8" @@ -34,7 +34,7 @@ function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } if is_cuda11 ; then DEFAULT_DASK_RAPIDS_VERSION="22.08" -else DEFAULT_DASK_RAPIDS_VERSION="24.06" ; fi +else DEFAULT_DASK_RAPIDS_VERSION="24.08" ; fi readonly DEFAULT_DASK_RAPIDS_VERSION readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) @@ -78,19 +78,40 @@ function install_dask_rapids() { if is_cuda12 ; then local pandas_spec='pandas' local python_ver="3.11" + local cuda_spec='cuda-version>=12,<=12.5' + local dask_spec='dask' elif is_cuda11 ; then local pandas_spec='pandas<1.5' local python_ver="3.9" + local cuda_spec='cuda-version>=11,<12.0a0' + local dask_spec='dask' fi # Install cuda, pandas, rapids, dask and cudf + local is_installed="0" mamba="/opt/conda/default/bin/mamba" - time "${mamba}" install -m -n 'dask-rapids' -y --no-channel-priority \ - -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ - "cuda-version=${CUDA_VERSION}" \ - "${pandas_spec}" \ - "rapids=${RAPIDS_VERSION}" \ - dask cudf "python=${python_ver}" + conda="/opt/conda/default/bin/conda" + "${conda}" config --set channel_priority flexible + for installer in "${mamba}" "${conda}" ; do + set +e + time "${installer}" install -m -n 'dask-rapids' -y --no-channel-priority \ + -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + "cuda-version=${CUDA_VERSION%%.*}" \ + "${pandas_spec}" \ + "rapids=${RAPIDS_VERSION}" \ + "${dask_spec}" \ + cudf "python=${python_ver}" + if [[ "$?" == "0" ]] ; then + is_installed="1" + continue + fi + set -e + done + if [[ "${is_installed}" == "0" ]]; then + echo "failed to install dask" + return 1 + fi + set -e } function install_spark_rapids() { From 6f91fb1587f3936e1fb26be48ad995434cf6d0b9 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Thu, 8 Aug 2024 23:54:34 -0700 Subject: [PATCH 10/95] increase machine type ; reduce disk size ; test 11.8 (12.4 is default) --- rapids/test_rapids.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 4df22249b..1dc934282 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -70,10 +70,10 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, configuration, self.DASK_INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-4", + machine_type="n1-standard-8", master_accelerator=accelerator, worker_accelerator=accelerator, - boot_disk_size="200GB", + boot_disk_size="100GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: @@ -95,10 +95,10 @@ def test_rapids_spark(self, configuration, machine_suffixes, accelerator): self.INIT_ACTIONS, optional_components=optional_components, metadata=metadata, - machine_type="n1-standard-4", + machine_type="n1-standard-8", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="200GB", + boot_disk_size="100GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: @@ -107,7 +107,7 @@ def test_rapids_spark(self, configuration, machine_suffixes, accelerator): # Only need to do this once self.verify_spark_job() - @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4")) + @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "11.8")) def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version): @@ -121,10 +121,10 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, configuration, self.INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-4", + machine_type="n1-standard-8", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="200GB", + boot_disk_size="100GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: From 48205a84210a66f16959bd9b64aef978dc8f2f83 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 9 Aug 2024 08:28:58 -0700 Subject: [PATCH 11/95] spark does not yet have 24.08.0 --- rapids/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index f224a35af..de2de77b5 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -14,7 +14,7 @@ function get_metadata_attribute() { } readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.0" +readonly DEFAULT_SPARK_RAPIDS_VERSION="24.06.1" if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then readonly DEFAULT_CUDA_VERSION="11.8" From d085df22d6a32d0ea0ee05754c80457bdfd0c759 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 9 Aug 2024 14:13:38 -0700 Subject: [PATCH 12/95] tested with 2.1 and 2.2 --- dask/dask.sh | 1 + rapids/rapids.sh | 34 +++++++++++++++++++--------------- rapids/test_rapids.py | 2 ++ 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/dask/dask.sh b/dask/dask.sh index f492f27f6..48558f154 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -453,6 +453,7 @@ function install_dask() { echo "failed to install dask" return 1 fi + set -e } function main() { diff --git a/rapids/rapids.sh b/rapids/rapids.sh index de2de77b5..afd1905a2 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -17,7 +17,7 @@ readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[ readonly DEFAULT_SPARK_RAPIDS_VERSION="24.06.1" if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then - readonly DEFAULT_CUDA_VERSION="11.8" + readonly DEFAULT_CUDA_VERSION="12.4" readonly DEFAULT_XGBOOST_VERSION="2.0.3" readonly SPARK_VERSION="${SPARK_VERSION_ENV}" readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="" @@ -33,8 +33,8 @@ readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VER function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } -if is_cuda11 ; then DEFAULT_DASK_RAPIDS_VERSION="22.08" -else DEFAULT_DASK_RAPIDS_VERSION="24.08" ; fi +if is_cuda11 ; then DEFAULT_DASK_RAPIDS_VERSION="22.06" +else DEFAULT_DASK_RAPIDS_VERSION="24.06" ; fi readonly DEFAULT_DASK_RAPIDS_VERSION readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) @@ -76,34 +76,38 @@ function execute_with_retries() { function install_dask_rapids() { if is_cuda12 ; then - local pandas_spec='pandas' local python_ver="3.11" - local cuda_spec='cuda-version>=12,<=12.5' - local dask_spec='dask' + local cuda_spec="cuda-version>=12,<=12.5" + local dask_spec="dask" elif is_cuda11 ; then - local pandas_spec='pandas<1.5' local python_ver="3.9" - local cuda_spec='cuda-version>=11,<12.0a0' - local dask_spec='dask' + local cuda_spec="cuda-version>=11,<12.0a0" + local dask_spec="dask" fi - # Install cuda, pandas, rapids, dask and cudf + # Install cuda, rapids, dask local is_installed="0" mamba="/opt/conda/default/bin/mamba" conda="/opt/conda/default/bin/conda" "${conda}" config --set channel_priority flexible + + if [[ -d /opt/conda/miniconda3/envs/dask-rapids ]]; then + local operation="install" + else + local operation="create" + fi + for installer in "${mamba}" "${conda}" ; do set +e - time "${installer}" install -m -n 'dask-rapids' -y --no-channel-priority \ + time "${installer}" "${operation}" -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ - "cuda-version=${CUDA_VERSION%%.*}" \ - "${pandas_spec}" \ + "${cuda_spec}" \ "rapids=${RAPIDS_VERSION}" \ "${dask_spec}" \ - cudf "python=${python_ver}" + "python=${python_ver}" if [[ "$?" == "0" ]] ; then is_installed="1" - continue + break fi set -e done diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 1dc934282..e023a2c1f 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -15,6 +15,8 @@ class RapidsTestCase(DataprocTestCase): ] GPU_P100 = "type=nvidia-tesla-p100" + GPU_A100 = "type=nvidia-tesla-a100,count=2" + GPU_H100 = "type=nvidia-h100-80gb,count=2" GPU_T4 = "type=nvidia-tesla-t4" # Tests for RAPIDS init action From aae3c863125c6793463490940f5d636ef8218935 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 9 Aug 2024 15:57:52 -0700 Subject: [PATCH 13/95] always create environment ; run test scripts with python from envs/dask-rapids/bin --- rapids/rapids.sh | 8 ++------ rapids/test_rapids.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index afd1905a2..244f39358 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -91,15 +91,11 @@ function install_dask_rapids() { conda="/opt/conda/default/bin/conda" "${conda}" config --set channel_priority flexible - if [[ -d /opt/conda/miniconda3/envs/dask-rapids ]]; then - local operation="install" - else - local operation="create" - fi + local operation="create" for installer in "${mamba}" "${conda}" ; do set +e - time "${installer}" "${operation}" -m -n 'dask-rapids' -y --no-channel-priority \ + time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ "${cuda_spec}" \ "rapids=${RAPIDS_VERSION}" \ diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index e023a2c1f..7268cf49b 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -32,7 +32,7 @@ def verify_dask_instance(self, name): os.path.join( os.path.dirname(os.path.abspath(__file__)), self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME), name) - verify_cmd = "/opt/conda/default/bin/python {}".format( + verify_cmd = "/opt/conda/miniconda3/envs/dask-rapids/bin/python {}".format( self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME, name) From eb95860d2e932a7dccd25a9e8fa7d9d28c51aadb Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 9 Aug 2024 17:43:21 -0700 Subject: [PATCH 14/95] skipping dask with yarn runtime tests for now --- dask/test_dask.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dask/test_dask.py b/dask/test_dask.py index a5d4314e4..6d5dfaf9c 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -47,6 +47,10 @@ def test_dask(self, configuration, instances, runtime): if self.getImageVersion() < pkg_resources.parse_version("2.0"): self.skipTest("Not supported in pre-2.0 images") + # https://github.com/dask/dask-yarn/pull/162 + if runtime != "standalone": + self.skipTest("dask-yarn known to fail presently.") + metadata = None if runtime: metadata = "dask-runtime={}".format(runtime) From 9a4d5366bc8a25701fe317d2141628fa8290d4fe Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 9 Aug 2024 17:43:39 -0700 Subject: [PATCH 15/95] added copyright block --- rapids/rapids.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 244f39358..e84779ff5 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -1,5 +1,22 @@ #!/bin/bash +# Copyright 2019,2020,2021,2022,2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This initialization action script will install rapids on a Dataproc +# cluster. 
+ set -euxo pipefail # Detect dataproc image version from its various names From 97dd7ad931571d51ffced492e7bb55b98bb51f52 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 10 Aug 2024 13:00:01 -0700 Subject: [PATCH 16/95] temporary changes to improve test performance --- cloudbuild/presubmit.sh | 1 + integration_tests/dataproc_test_case.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index b09789a52..51ab7023f 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -70,6 +70,7 @@ determine_tests_to_run() { changed_dir="${changed_dir%%/*}/" # Run all tests if common directories modified if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then + continue # remove this and merge all changes to integration_tests/ and cloudbuild/ into prince's branch before squash/merge echo "All tests will be run: '${changed_dir}' was changed" TESTS_TO_RUN=(":DataprocInitActionsTestSuite") return 0 diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index c2e6577b1..a0ae04046 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -17,7 +17,7 @@ FLAGS = flags.FLAGS flags.DEFINE_string('image', None, 'Dataproc image URL') -flags.DEFINE_string('image_version', None, 'Dataproc image version, e.g. 1.4') +flags.DEFINE_string('image_version', None, 'Dataproc image version, e.g. 2.2') flags.DEFINE_boolean('skip_cleanup', False, 'Skip cleanup of test resources') FLAGS(sys.argv) @@ -178,7 +178,9 @@ def createCluster(self, args.append("--zone={}".format(self.cluster_zone)) if not FLAGS.skip_cleanup: - args.append("--max-age=2h") + args.append("--max-age=30m") + + args.append("--max-idle=5m") cmd = "{} dataproc clusters create {} {}".format( "gcloud beta" if beta else "gcloud", self.name, " ".join(args)) @@ -239,7 +241,7 @@ def getClusterName(self): @staticmethod def getImageVersion(): - # Get a numeric version from the version flag: '1.5-debian10' -> '1.5'. + # Get a numeric version from the version flag: '2.2-debian10' -> '2.2'. # Special case a 'preview' image versions and return a large number # instead to make it a higher image version in comparisons version = FLAGS.image_version @@ -248,7 +250,7 @@ def getImageVersion(): @staticmethod def getImageOs(): - # Get OS string from the version flag: '1.5-debian10' -> 'debian'. + # Get OS string from the version flag: '2.2-debian10' -> 'debian'. # If image version specified without OS suffix ('2.0') # then return 'debian' by default version = FLAGS.image_version From 86c76713fa6c825833de633059fbd2ed7526ecd8 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Sat, 10 Aug 2024 13:05:35 -0700 Subject: [PATCH 17/95] increasing machine type, attempting 2024.06 again now that I have fixed the conda mismatch --- rapids/test_rapids.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 7268cf49b..66a49f78c 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -72,7 +72,7 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, configuration, self.DASK_INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-8", + machine_type="n1-standard-16", master_accelerator=accelerator, worker_accelerator=accelerator, boot_disk_size="100GB", @@ -97,7 +97,7 @@ def test_rapids_spark(self, configuration, machine_suffixes, accelerator): self.INIT_ACTIONS, optional_components=optional_components, metadata=metadata, - machine_type="n1-standard-8", + machine_type="n1-standard-16", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, boot_disk_size="100GB", @@ -123,7 +123,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, configuration, self.INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-8", + machine_type="n1-standard-16", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, boot_disk_size="100GB", From 151597f03b953f550f98bd4f795c77d477c2a06e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 10 Aug 2024 16:53:09 -0700 Subject: [PATCH 18/95] refactored code a bit --- dask/test_dask.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dask/test_dask.py b/dask/test_dask.py index 6d5dfaf9c..6f7c0e515 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -47,10 +47,6 @@ def test_dask(self, configuration, instances, runtime): if self.getImageVersion() < pkg_resources.parse_version("2.0"): self.skipTest("Not supported in pre-2.0 images") - # https://github.com/dask/dask-yarn/pull/162 - if runtime != "standalone": - self.skipTest("dask-yarn known to fail presently.") - metadata = None if runtime: metadata = "dask-runtime={}".format(runtime) @@ -72,6 +68,8 @@ def test_dask(self, configuration, instances, runtime): if runtime == "standalone": self.verify_dask_standalone(name, master_hostname) else: + # https://github.com/dask/dask-yarn/pull/162 + self.skipTest("dask-yarn known to fail presently.") self.verify_dask_yarn(name) if __name__ == '__main__': From a1ab571378b8c60b9a7d9d72629135a2ea9bd728 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 10 Aug 2024 18:21:24 -0700 Subject: [PATCH 19/95] how did this get in this change? 
--- bigtable/bigtable.sh | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/bigtable/bigtable.sh b/bigtable/bigtable.sh index 34d914759..7b45a91eb 100755 --- a/bigtable/bigtable.sh +++ b/bigtable/bigtable.sh @@ -42,8 +42,7 @@ readonly BIGTABLE_HBASE_CLIENT_2X_VERSION='2.12.0' readonly BIGTABLE_HBASE_CLIENT_2X_JAR="bigtable-hbase-2.x-hadoop-${BIGTABLE_HBASE_CLIENT_2X_VERSION}.jar" readonly BIGTABLE_HBASE_CLIENT_2X_URL="${BIGTABLE_HBASE_CLIENT_2X_REPO}/${BIGTABLE_HBASE_CLIENT_2X_VERSION}/${BIGTABLE_HBASE_CLIENT_2X_JAR}" -readonly region="$(/usr/share/google/get_metadata_value attributes/dataproc-region)" -readonly SCH_REPO="gs://dataproc-initialization-actions-${region}/jars/bigtable" +readonly SCH_REPO="https://repo.hortonworks.com/content/repositories/releases/com/hortonworks" readonly SHC_VERSION='1.1.1-2.1-s_2.11' readonly SHC_JAR="shc-core-${SHC_VERSION}.jar" readonly SHC_EXAMPLES_JAR="shc-examples-${SHC_VERSION}.jar" @@ -59,24 +58,21 @@ readonly BIGTABLE_PROJECT="$(/usr/share/google/get_metadata_value attributes/big /usr/share/google/get_metadata_value ../project/project-id)" function remove_old_backports { - if is_debian12 ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this - # problem, we will use archive.debian.org for the oldoldstable repo + # problem, we will remove any reference to backports repos older than oldstable # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 - debdists="https://deb.debian.org/debian/dists" - oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); - oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); - stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); - - matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) - - for filename in "${matched_files[@]}"; do - # Fetch from archive.debian.org for ${oldoldstable}-backports - perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports } - {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}" - done + oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}'); + stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}'); + + matched_files=( $(grep -rsil '\-backports' /etc/apt/sources.list*||:) ) + if [[ -n "$matched_files" ]]; then + for filename in "${matched_files[@]}"; do + grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \ + sed -i -e 's/^.*-backports.*$//' "$filename" + done + fi } function retry_command() { @@ -112,10 +108,12 @@ function install_bigtable_client() { function install_shc() { mkdir -p "/usr/lib/spark/external" local out="/usr/lib/spark/external/${SHC_JAR}" - gsutil cp -r "${SHC_URL}" "${out}" + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${SHC_URL}" -O "${out}" ln -s "${out}" "/usr/lib/spark/external/shc-core.jar" local example_out="/usr/lib/spark/examples/jars/${SHC_EXAMPLES_JAR}" - gsutil cp -r "${SHC_EXAMPLES_URL}" "${example_out}" + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${SHC_EXAMPLES_URL}" -O "${example_out}" ln -s "${example_out}" "/usr/lib/spark/examples/jars/shc-examples.jar" } @@ -244,8 +242,7 @@ function install_hbase() { local VARIANT="bin" local 
BASENAME="hbase-${HBASE_VERSION}-${VARIANT}.tar.gz" echo "hbase dist basename: ${BASENAME}" - curl -fsSL -o "/tmp/${BASENAME}" --retry-connrefused --retry 3 --retry-max-time 5 \ - "https://archive.apache.org/dist/hbase/${HBASE_VERSION}/${BASENAME}" || err 'Unable to download tar' + wget -q -nv "https://archive.apache.org/dist/hbase/${HBASE_VERSION}/${BASENAME}" -P /tmp || err 'Unable to download tar' # extract binaries from bundle mkdir -p "/tmp/hbase-${HBASE_VERSION}/" "${HBASE_HOME}" From 62262db38d831a607a37e4e8923766052bcb1da0 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 10 Aug 2024 18:32:22 -0700 Subject: [PATCH 20/95] we are seeing an error in this config file ; investigate --- cloudbuild/run-presubmit-on-k8s.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cloudbuild/run-presubmit-on-k8s.sh b/cloudbuild/run-presubmit-on-k8s.sh index 810213832..5e0739448 100644 --- a/cloudbuild/run-presubmit-on-k8s.sh +++ b/cloudbuild/run-presubmit-on-k8s.sh @@ -12,6 +12,8 @@ gcloud container clusters get-credentials "${CLOUDSDK_CONTAINER_CLUSTER}" LOGS_SINCE_TIME=$(date --iso-8601=seconds) +cat /builder/home/.kube/config + kubectl run "${POD_NAME}" \ --image="${IMAGE}" \ --restart=Never \ From 77f9fa05bdc584cf4510f5d22bc533330cd794d1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 10 Aug 2024 13:00:01 -0700 Subject: [PATCH 21/95] temporary changes to improve test performance --- integration_tests/dataproc_test_case.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index a0ae04046..96a96ecfc 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -126,6 +126,7 @@ def createCluster(self, "mlvm.sh" in i or "rapids.sh" in i or \ "spark-rapids.sh" in i or "horovod.sh" in i: args.append("--no-shielded-secure-boot") + break if optional_components: args.append("--optional-components={}".format( From 8ccbc27be33be43de5ba213d5188f1cc1aefdb39 Mon Sep 17 00:00:00 2001 From: Prince Datta Date: Thu, 8 Aug 2024 20:25:33 +0530 Subject: [PATCH 22/95] Adding disable shielded boot flag and disk type ssd flag to enhance the cluster creation (#1209) * Adding disable shielded boot flag and disk type ssd flag to enhance the cluster creation * Disabling secure boot for all the gpu dependent init action scripts. * Disabling secure boot for all the gpu dependent init action scripts. --- integration_tests/dataproc_test_case.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 96a96ecfc..a0ae04046 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -126,7 +126,6 @@ def createCluster(self, "mlvm.sh" in i or "rapids.sh" in i or \ "spark-rapids.sh" in i or "horovod.sh" in i: args.append("--no-shielded-secure-boot") - break if optional_components: args.append("--optional-components={}".format( From 25f0d96967266b9fadd5bf0c37ac9dbf23fc3dc2 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Mon, 12 Aug 2024 10:16:35 -0700 Subject: [PATCH 23/95] tested on debian11 w/ cuda11 --- rapids/rapids.sh | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index e84779ff5..676329511 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -93,34 +93,43 @@ function execute_with_retries() { function install_dask_rapids() { if is_cuda12 ; then - local python_ver="3.11" + local python_ver="3.10" local cuda_spec="cuda-version>=12,<=12.5" local dask_spec="dask" + local numba_spec="numba" elif is_cuda11 ; then local python_ver="3.9" - local cuda_spec="cuda-version>=11,<12.0a0" + local cuda_spec="cuda-version>=11,<11.6" local dask_spec="dask" + local numba_spec="numba<0.56" fi + + local CONDA_PACKAGES=("${cuda_spec}" + "rapids=${RAPIDS_VERSION}" + "${dask_spec}" + "dask-bigquery" + "dask-ml" + "dask-sql" + "${numba_spec}" + ) + # Install cuda, rapids, dask local is_installed="0" mamba="/opt/conda/default/bin/mamba" conda="/opt/conda/default/bin/conda" - "${conda}" config --set channel_priority flexible - - local operation="create" for installer in "${mamba}" "${conda}" ; do set +e time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ - "${cuda_spec}" \ - "rapids=${RAPIDS_VERSION}" \ - "${dask_spec}" \ + ${CONDA_PACKAGES[*]} \ "python=${python_ver}" if [[ "$?" == "0" ]] ; then is_installed="1" break + else + "${conda}" config --set channel_priority flexible fi set -e done @@ -186,8 +195,8 @@ EOF configure_systemd_dask_service() { echo "Configuring systemd Dask service for RAPIDS..." local -r dask_worker_local_dir="/tmp/dask" - local conda_env_bin - conda_env_bin=$(conda info --base)/bin + local conda_env="/opt/conda/miniconda3/envs/dask-rapids" + local conda_env_bin="${conda_env}/bin" # Replace Dask Launcher file with dask-cuda config systemctl stop ${DASK_SERVICE} From c6991e840e5dc8c15970944255050d1c77f37f2b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 12 Aug 2024 11:36:24 -0700 Subject: [PATCH 24/95] added skein tests for dask-yarn --- dask/test_dask.py | 5 +++++ dask/verify_skein.py | 13 +++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 dask/verify_skein.py diff --git a/dask/test_dask.py b/dask/test_dask.py index 6f7c0e515..100b647d0 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -10,9 +10,13 @@ class DaskTestCase(DataprocTestCase): COMPONENT = 'dask' INIT_ACTIONS = ['dask/dask.sh'] + SKEIN_TEST_SCRIPT = 'verify_skein.py' DASK_YARN_TEST_SCRIPT = 'verify_dask_yarn.py' DASK_STANDALONE_TEST_SCRIPT = 'verify_dask_standalone.py' + def verify_skein(self, name): + self._run_dask_test_script(name, self.SKEIN_TEST_SCRIPT) + def verify_dask_yarn(self, name): self._run_dask_test_script(name, self.DASK_YARN_TEST_SCRIPT) @@ -70,6 +74,7 @@ def test_dask(self, configuration, instances, runtime): else: # https://github.com/dask/dask-yarn/pull/162 self.skipTest("dask-yarn known to fail presently.") + self.verify_skein(name) self.verify_dask_yarn(name) if __name__ == '__main__': diff --git a/dask/verify_skein.py b/dask/verify_skein.py new file mode 100644 index 000000000..5b80edd07 --- /dev/null +++ b/dask/verify_skein.py @@ -0,0 +1,13 @@ +#https://github.com/dask/dask-yarn/issues/101#issuecomment-539529524 +import skein + +spec = skein.ApplicationSpec.from_yaml(""" +name: verify-skein +queue: default + +master: + script: echo "Things worked!" 
+""") + +client = skein.Client() +client.submit(spec) From 52f5fecbaed943db457e328a9f01036322e06afc Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 12 Aug 2024 11:57:20 -0700 Subject: [PATCH 25/95] accidentally using the wrong bigtable.sh in this PR ; checking out master version --- bigtable/bigtable.sh | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/bigtable/bigtable.sh b/bigtable/bigtable.sh index 7b45a91eb..34d914759 100755 --- a/bigtable/bigtable.sh +++ b/bigtable/bigtable.sh @@ -42,7 +42,8 @@ readonly BIGTABLE_HBASE_CLIENT_2X_VERSION='2.12.0' readonly BIGTABLE_HBASE_CLIENT_2X_JAR="bigtable-hbase-2.x-hadoop-${BIGTABLE_HBASE_CLIENT_2X_VERSION}.jar" readonly BIGTABLE_HBASE_CLIENT_2X_URL="${BIGTABLE_HBASE_CLIENT_2X_REPO}/${BIGTABLE_HBASE_CLIENT_2X_VERSION}/${BIGTABLE_HBASE_CLIENT_2X_JAR}" -readonly SCH_REPO="https://repo.hortonworks.com/content/repositories/releases/com/hortonworks" +readonly region="$(/usr/share/google/get_metadata_value attributes/dataproc-region)" +readonly SCH_REPO="gs://dataproc-initialization-actions-${region}/jars/bigtable" readonly SHC_VERSION='1.1.1-2.1-s_2.11' readonly SHC_JAR="shc-core-${SHC_VERSION}.jar" readonly SHC_EXAMPLES_JAR="shc-examples-${SHC_VERSION}.jar" @@ -58,21 +59,24 @@ readonly BIGTABLE_PROJECT="$(/usr/share/google/get_metadata_value attributes/big /usr/share/google/get_metadata_value ../project/project-id)" function remove_old_backports { + if is_debian12 ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this - # problem, we will remove any reference to backports repos older than oldstable + # problem, we will use archive.debian.org for the oldoldstable repo # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 - oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}'); - stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}'); - - matched_files=( $(grep -rsil '\-backports' /etc/apt/sources.list*||:) ) - if [[ -n "$matched_files" ]]; then - for filename in "${matched_files[@]}"; do - grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \ - sed -i -e 's/^.*-backports.*$//' "$filename" - done - fi + debdists="https://deb.debian.org/debian/dists" + oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); + oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + + matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) + + for filename in "${matched_files[@]}"; do + # Fetch from archive.debian.org for ${oldoldstable}-backports + perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports } + {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}" + done } function retry_command() { @@ -108,12 +112,10 @@ function install_bigtable_client() { function install_shc() { mkdir -p "/usr/lib/spark/external" local out="/usr/lib/spark/external/${SHC_JAR}" - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${SHC_URL}" -O "${out}" + gsutil cp -r "${SHC_URL}" "${out}" ln -s "${out}" "/usr/lib/spark/external/shc-core.jar" local example_out="/usr/lib/spark/examples/jars/${SHC_EXAMPLES_JAR}" - wget -nv --timeout=30 --tries=5 
--retry-connrefused \ - "${SHC_EXAMPLES_URL}" -O "${example_out}" + gsutil cp -r "${SHC_EXAMPLES_URL}" "${example_out}" ln -s "${example_out}" "/usr/lib/spark/examples/jars/shc-examples.jar" } @@ -242,7 +244,8 @@ function install_hbase() { local VARIANT="bin" local BASENAME="hbase-${HBASE_VERSION}-${VARIANT}.tar.gz" echo "hbase dist basename: ${BASENAME}" - wget -q -nv "https://archive.apache.org/dist/hbase/${HBASE_VERSION}/${BASENAME}" -P /tmp || err 'Unable to download tar' + curl -fsSL -o "/tmp/${BASENAME}" --retry-connrefused --retry 3 --retry-max-time 5 \ + "https://archive.apache.org/dist/hbase/${HBASE_VERSION}/${BASENAME}" || err 'Unable to download tar' # extract binaries from bundle mkdir -p "/tmp/hbase-${HBASE_VERSION}/" "${HBASE_HOME}" From aad851ac8b5381adf3f4b4a2f98c9e4d79af68bc Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 12 Aug 2024 12:26:21 -0700 Subject: [PATCH 26/95] using correct conda env for dask-yarn environment --- rapids/rapids.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 676329511..f007a1fd9 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -192,10 +192,10 @@ EOF fi } +readonly conda_env="/opt/conda/miniconda3/envs/dask-rapids" configure_systemd_dask_service() { echo "Configuring systemd Dask service for RAPIDS..." local -r dask_worker_local_dir="/tmp/dask" - local conda_env="/opt/conda/miniconda3/envs/dask-rapids" local conda_env_bin="${conda_env}/bin" # Replace Dask Launcher file with dask-cuda config @@ -228,7 +228,6 @@ EOF function configure_dask_yarn() { local base - base=$(conda info --base) # Replace config file on cluster. cat <"${DASK_YARN_CONFIG_FILE}" @@ -238,7 +237,7 @@ function configure_dask_yarn() { # https://yarn.dask.org/en/latest/configuration.html#default-configuration yarn: - environment: python://${base}/bin/python + environment: environment://${conda_env} worker: count: 2 From e20aa9a0012940d4b3ed8dd0449eadd0ecfa1f7b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 12 Aug 2024 12:36:40 -0700 Subject: [PATCH 27/95] added skein test for dask --- dask/test_skein.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 dask/test_skein.py diff --git a/dask/test_skein.py b/dask/test_skein.py new file mode 100644 index 000000000..2846fafb8 --- /dev/null +++ b/dask/test_skein.py @@ -0,0 +1,13 @@ +#https://github.com/dask/dask-yarn/issues/101#issuecomment-539529524 +import skein + +spec = skein.ApplicationSpec.from_yaml(""" +name: debug-skein +queue: root + +master: + script: echo "Things worked!" +""") + +client = skein.Client() +client.submit(spec) From fd9449b247fbb801fd6c540e85dba1fb7789390d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 12 Aug 2024 14:37:16 -0700 Subject: [PATCH 28/95] that was the wrong filename --- dask/test_skein.py | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 dask/test_skein.py diff --git a/dask/test_skein.py b/dask/test_skein.py deleted file mode 100644 index 2846fafb8..000000000 --- a/dask/test_skein.py +++ /dev/null @@ -1,13 +0,0 @@ -#https://github.com/dask/dask-yarn/issues/101#issuecomment-539529524 -import skein - -spec = skein.ApplicationSpec.from_yaml(""" -name: debug-skein -queue: root - -master: - script: echo "Things worked!" -""") - -client = skein.Client() -client.submit(spec) From c69d951e6e6077ba7547db7212cf879f468c02af Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Mon, 12 Aug 2024 17:21:50 -0700 Subject: [PATCH 29/95] perform the skein tests before skipping the dask ones --- dask/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask/test_dask.py b/dask/test_dask.py index 100b647d0..6298fa99b 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -72,9 +72,9 @@ def test_dask(self, configuration, instances, runtime): if runtime == "standalone": self.verify_dask_standalone(name, master_hostname) else: + self.verify_skein(name) # https://github.com/dask/dask-yarn/pull/162 self.skipTest("dask-yarn known to fail presently.") - self.verify_skein(name) self.verify_dask_yarn(name) if __name__ == '__main__': From 5b23ddb308f6cc045a37d20adbefbf7f84f6b37e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 12 Aug 2024 17:22:06 -0700 Subject: [PATCH 30/95] whitespace changes --- rapids/rapids.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index f007a1fd9..5d7c991fc 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -62,8 +62,6 @@ readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-maste readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RUN_WORKER_ON_MASTER=$(get_metadata_attribute 'dask-cuda-worker-on-master' 'true') - - # SPARK config readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) @@ -104,7 +102,6 @@ function install_dask_rapids() { local numba_spec="numba<0.56" fi - local CONDA_PACKAGES=("${cuda_spec}" "rapids=${RAPIDS_VERSION}" "${dask_spec}" From 536aef9a967de8e6dbff789b18d06c43fab07e44 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 12 Aug 2024 17:30:19 -0700 Subject: [PATCH 31/95] removing the excessive logging --- cloudbuild/run-presubmit-on-k8s.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/cloudbuild/run-presubmit-on-k8s.sh b/cloudbuild/run-presubmit-on-k8s.sh index 5e0739448..810213832 100644 --- a/cloudbuild/run-presubmit-on-k8s.sh +++ b/cloudbuild/run-presubmit-on-k8s.sh @@ -12,8 +12,6 @@ gcloud container clusters get-credentials "${CLOUDSDK_CONTAINER_CLUSTER}" LOGS_SINCE_TIME=$(date --iso-8601=seconds) -cat /builder/home/.kube/config - kubectl run "${POD_NAME}" \ --image="${IMAGE}" \ --restart=Never \ From b476bae0706cf975aea5f3d9feea3458df68785c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 12:40:02 -0700 Subject: [PATCH 32/95] taking master hostname from argv ; added array test --- dask/dask.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask/dask.sh b/dask/dask.sh index 48558f154..c411b2a96 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -103,6 +103,7 @@ LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" echo "dask worker starting, logging to \${LOGFILE}" ${DASK_CONDA_ENV}/bin/dask worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 EOF + ) chmod 750 "${DASK_WORKER_LAUNCHER}" @@ -396,6 +397,7 @@ EOF } function install_dask() { + local DASK_VERSION='2024.6' if is_cuda12 ; then local python_spec="python=3.10" local cuda_spec="cuda-version>=12,<=12.5" @@ -453,7 +455,6 @@ function install_dask() { echo "failed to install dask" return 1 fi - set -e } function main() { From f7aed9274ab2c9074af8e6c618172e1cbc713e8f Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Tue, 13 Aug 2024 15:34:44 -0700 Subject: [PATCH 33/95] defining two separate services to ease debugging --- dask/dask.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dask/dask.sh b/dask/dask.sh index c411b2a96..5569574b4 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -95,6 +95,7 @@ function install_systemd_dask_worker() { mkdir -p "${dask_worker_local_dir}" + local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh" cat <"${DASK_WORKER_LAUNCHER}" From c9d41f4391d858e5aa5075cacd876324c936b4f7 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 17:43:30 -0700 Subject: [PATCH 34/95] dask service tests are passing --- dask/dask.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dask/dask.sh b/dask/dask.sh index 5569574b4..8f18dbdea 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -478,6 +478,8 @@ function main() { systemctl start "${DASK_WORKER_SERVICE}" systemctl status "${DASK_WORKER_SERVICE}" fi + systemctl start "${DASK_SCHEDULER_SERVICE}" + systemctl status "${DASK_SCHEDULER_SERVICE}" configure_knox_for_dask From b6273c88fb7d24cbd007d8f8165c3ab3bb7c582c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 17:51:32 -0700 Subject: [PATCH 35/95] refactored yarn tests to its own py file ; updated rapids.sh to separate services into their own units --- rapids/rapids.sh | 82 +++++++++++++++++++------------ rapids/verify_rapids_dask.py | 19 ------- rapids/verify_rapids_dask_yarn.py | 19 +++++++ 3 files changed, 70 insertions(+), 50 deletions(-) create mode 100644 rapids/verify_rapids_dask_yarn.py diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 5d7c991fc..44cac0ea8 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -59,7 +59,7 @@ readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) -readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') +readonly RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RUN_WORKER_ON_MASTER=$(get_metadata_attribute 'dask-cuda-worker-on-master' 'true') # SPARK config @@ -71,8 +71,11 @@ readonly XGBOOST_GPU_SUB_VERSION=$(get_metadata_attribute 'spark-gpu-sub-version readonly SCALA_VER="2.12" # Dask config +readonly DASK_RUNTIME="$(/usr/share/google/get_metadata_value attributes/dask-runtime || echo 'standalone')" readonly DASK_LAUNCHER=/usr/local/bin/dask-launcher.sh readonly DASK_SERVICE=dask-cluster +readonly DASK_WORKER_SERVICE=dask-worker +readonly DASK_SCHEDULER_SERVICE=dask-scheduler readonly DASK_YARN_CONFIG_FILE=/etc/dask/config.yaml # Dataproc configurations @@ -190,42 +193,59 @@ EOF } readonly conda_env="/opt/conda/miniconda3/envs/dask-rapids" -configure_systemd_dask_service() { - echo "Configuring systemd Dask service for RAPIDS..." - local -r dask_worker_local_dir="/tmp/dask" - local conda_env_bin="${conda_env}/bin" +enable_worker_service="0" +function install_systemd_dask_worker() { + echo "Installing systemd Dask Worker service..." 
+ local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}" - # Replace Dask Launcher file with dask-cuda config - systemctl stop ${DASK_SERVICE} + mkdir -p "${dask_worker_local_dir}" - if [[ "${ROLE}" == "Master" ]]; then - cat <"${DASK_LAUNCHER}" + local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh" + + cat <"${DASK_WORKER_LAUNCHER}" #!/bin/bash -if [[ "${RUN_WORKER_ON_MASTER}" == true ]]; then - nvidia-smi -c DEFAULT - echo "dask-cuda-worker starting, logging to /var/log/dask-cuda-worker.log." - ${conda_env_bin}/dask-cuda-worker ${MASTER}:8786 --local-directory=${dask_worker_local_dir} --memory-limit=auto > /var/log/dask-cuda-worker.log 2>&1 & -fi -echo "dask-scheduler starting, logging to /var/log/dask-scheduler.log." -${conda_env_bin}/dask-scheduler > /var/log/dask-scheduler.log 2>&1 +LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" +ERRLOG="/var/log/${DASK_WORKER_SERVICE}-error.log" +nvidia-smi -c DEFAULT +echo "dask-cuda-worker starting, logging to \${LOGFILE} and \${ERRLOG}" +${conda_env}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>> "\${ERRLOG}" EOF - else - nvidia-smi -c DEFAULT - cat <"${DASK_LAUNCHER}" -#!/bin/bash -${conda_env_bin}/dask-cuda-worker ${MASTER}:8786 --local-directory=${dask_worker_local_dir} --memory-limit=auto > /var/log/dask-cuda-worker.log 2>&1 + + chmod 750 "${DASK_WORKER_LAUNCHER}" + + local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service" + cat <"${dask_service_file}" +[Unit] +Description=Dask Worker Service +[Service] +Type=simple +Restart=on-failure +ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}' +[Install] +WantedBy=multi-user.target EOF - fi - chmod 750 "${DASK_LAUNCHER}" + chmod a+r "${dask_service_file}" systemctl daemon-reload - echo "Restarting Dask cluster..." - systemctl start "${DASK_SERVICE}" + + # Enable the service + if [[ "${ROLE}" != "Master" ]]; then + enable_worker_service="1" + else + # Enable service on single-node cluster (no workers) + local worker_count="$(/usr/share/google/get_metadata_value attributes/dataproc-worker-count)" + if [[ "${worker_count}" == "0" || "${RUN_WORKER_ON_MASTER}" == "true" ]]; then + enable_worker_service="1" + fi + fi + + if [[ "${enable_worker_service}" == "1" ]]; then + systemctl enable "${DASK_WORKER_SERVICE}" + systemctl restart "${DASK_WORKER_SERVICE}" + fi } function configure_dask_yarn() { - local base - # Replace config file on cluster. cat <"${DASK_YARN_CONFIG_FILE}" # Config file for Dask Yarn. @@ -244,15 +264,15 @@ EOF } function main() { - if [[ "${RUNTIME}" == "DASK" ]]; then + if [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then # Install RAPIDS install_dask_rapids # In "standalone" mode, Dask relies on a shell script to launch. # In "yarn" mode, it relies a config.yaml file. 
- if [[ -f "${DASK_LAUNCHER}" ]]; then - configure_systemd_dask_service - elif [[ -f "${DASK_YARN_CONFIG_FILE}" ]]; then + if [[ "${DASK_RUNTIME}" == "standalone" ]]; then + install_systemd_dask_worker + elif [[ "${DASK_RUNTIME}" == "yarn" ]]; then configure_dask_yarn fi echo "RAPIDS installed with Dask runtime" diff --git a/rapids/verify_rapids_dask.py b/rapids/verify_rapids_dask.py index f8c888478..10f662215 100644 --- a/rapids/verify_rapids_dask.py +++ b/rapids/verify_rapids_dask.py @@ -2,11 +2,9 @@ import dask_cudf import xgboost -from dask.distributed import Client import dask.array as da import numpy as np - def test_rapids(): # confirm RAPIDS and xgboost are available df = cudf.DataFrame() @@ -19,21 +17,4 @@ def test_rapids(): ds = dask_cudf.from_cudf(df['c'], npartitions=2) ds.compute() - -def test_dask_yarn(): - try: - from dask_yarn import YarnCluster - except: - return - - # Validate dask_yarn configuration - cluster = YarnCluster() - client = Client(cluster) - - cluster.scale(4) - x = da.sum(np.ones(5)) - x.compute() - - test_rapids() -test_dask_yarn() diff --git a/rapids/verify_rapids_dask_yarn.py b/rapids/verify_rapids_dask_yarn.py new file mode 100644 index 000000000..9c6850cc4 --- /dev/null +++ b/rapids/verify_rapids_dask_yarn.py @@ -0,0 +1,19 @@ +from dask.distributed import Client +import dask.array as da +import numpy as np + +def test_dask_yarn(): + try: + from dask_yarn import YarnCluster + except: + return + + # Validate dask_yarn configuration + cluster = YarnCluster() + client = Client(cluster) + + cluster.scale(4) + x = da.sum(np.ones(5)) + x.compute() + +test_dask_yarn() # known to fail for recent relases of rapids From 8d18024040302f9e7ea11045be87ebe78369f4ca Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 20:45:50 -0700 Subject: [PATCH 36/95] tested with debian and rocky --- dask/dask.sh | 2 -- rapids/rapids.sh | 10 +++++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/dask/dask.sh b/dask/dask.sh index 8f18dbdea..5569574b4 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -478,8 +478,6 @@ function main() { systemctl start "${DASK_WORKER_SERVICE}" systemctl status "${DASK_WORKER_SERVICE}" fi - systemctl start "${DASK_SCHEDULER_SERVICE}" - systemctl status "${DASK_SCHEDULER_SERVICE}" configure_knox_for_dask diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 44cac0ea8..a8b354c74 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -92,6 +92,7 @@ function execute_with_retries() { return 1 } +readonly conda_env="/opt/conda/miniconda3/envs/dask-rapids" function install_dask_rapids() { if is_cuda12 ; then local python_ver="3.10" @@ -121,7 +122,8 @@ function install_dask_rapids() { for installer in "${mamba}" "${conda}" ; do set +e - time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ + test -d "${conda_env}" || \ + time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ ${CONDA_PACKAGES[*]} \ "python=${python_ver}" @@ -192,7 +194,6 @@ EOF fi } -readonly conda_env="/opt/conda/miniconda3/envs/dask-rapids" enable_worker_service="0" function install_systemd_dask_worker() { echo "Installing systemd Dask Worker service..." 
@@ -205,10 +206,9 @@ function install_systemd_dask_worker() { cat <"${DASK_WORKER_LAUNCHER}" #!/bin/bash LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" -ERRLOG="/var/log/${DASK_WORKER_SERVICE}-error.log" nvidia-smi -c DEFAULT -echo "dask-cuda-worker starting, logging to \${LOGFILE} and \${ERRLOG}" -${conda_env}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>> "\${ERRLOG}" +echo "dask-cuda-worker starting, logging to \${LOGFILE}" +${conda_env}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_WORKER_LAUNCHER}" From f88df7baf4ba9453d7116f185a33bf05abef8e57 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 21:18:34 -0700 Subject: [PATCH 37/95] added skein test --- dask/test_skein.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 dask/test_skein.py diff --git a/dask/test_skein.py b/dask/test_skein.py new file mode 100644 index 000000000..2846fafb8 --- /dev/null +++ b/dask/test_skein.py @@ -0,0 +1,13 @@ +#https://github.com/dask/dask-yarn/issues/101#issuecomment-539529524 +import skein + +spec = skein.ApplicationSpec.from_yaml(""" +name: debug-skein +queue: root + +master: + script: echo "Things worked!" +""") + +client = skein.Client() +client.submit(spec) From d71470f3ac081db2f81efba582e0d33e712deb96 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 21:20:31 -0700 Subject: [PATCH 38/95] reduced operations slightly when setting master hostname --- dask/test_dask.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask/test_dask.py b/dask/test_dask.py index 6298fa99b..915ec0326 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -61,6 +61,7 @@ def test_dask(self, configuration, instances, runtime): metadata=metadata, timeout_in_minutes=20) + master_hostname = self.getClusterName if configuration == 'HA': master_hostname = self.getClusterName() + '-m-0' else: From aa68bc867bb20156376e9104fe2160a96d4f857d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 21:48:50 -0700 Subject: [PATCH 39/95] python operators. amirite? --- dask/test_dask.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask/test_dask.py b/dask/test_dask.py index 915ec0326..6298fa99b 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -61,7 +61,6 @@ def test_dask(self, configuration, instances, runtime): metadata=metadata, timeout_in_minutes=20) - master_hostname = self.getClusterName if configuration == 'HA': master_hostname = self.getClusterName() + '-m-0' else: From facb14bcb91b78df23ebbd25a2fb6364bf50170a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 22:37:55 -0700 Subject: [PATCH 40/95] status fails ; list-units | grep works --- rapids/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index a8b354c74..92fb66c7b 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -288,7 +288,7 @@ function main() { if [[ "${ROLE}" == "Master" ]]; then systemctl restart hadoop-yarn-resourcemanager.service # Restart NodeManager on Master as well if this is a single-node-cluster. - if systemctl status hadoop-yarn-nodemanager; then + if systemctl list-units | grep hadoop-yarn-nodemanager; then systemctl restart hadoop-yarn-nodemanager.service fi else From 8559fdd3f3ac7553200347efe82c595666920a89 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Tue, 13 Aug 2024 22:44:36 -0700 Subject: [PATCH 41/95] explicitly including cudf --- rapids/rapids.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 92fb66c7b..ad62dc3cf 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -112,6 +112,7 @@ function install_dask_rapids() { "dask-bigquery" "dask-ml" "dask-sql" + "cudf" "${numba_spec}" ) From c3ea72333d5cc0bfb5401f818061af92a6dbf017 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 13 Aug 2024 22:53:15 -0700 Subject: [PATCH 42/95] corrected variable name --- rapids/rapids.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index ad62dc3cf..8dde8f167 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -277,12 +277,12 @@ function main() { configure_dask_yarn fi echo "RAPIDS installed with Dask runtime" - elif [[ "${RUNTIME}" == "SPARK" ]]; then + elif [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then install_spark_rapids configure_spark echo "RAPIDS initialized with Spark runtime" else - echo "Unsupported RAPIDS Runtime: ${RUNTIME}" + echo "Unsupported RAPIDS Runtime: ${RAPIDS_RUNTIME}" exit 1 fi From 6a14ff19323ed06005f6611f2580745ba69783af Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 14 Aug 2024 14:30:59 -0700 Subject: [PATCH 43/95] working with cuda12 + yarn as dask runtime specifying a recent dask for rapids with cuda12 specifying yarn yaml environment using path to python applied fixes to gpu driver installer from gpu-20240813 --- dask/dask.sh | 1 - dask/test_dask.py | 7 ------- dask/verify_skein.py | 13 ------------- rapids/rapids.sh | 23 ++++++++++++----------- 4 files changed, 12 insertions(+), 32 deletions(-) delete mode 100644 dask/verify_skein.py diff --git a/dask/dask.sh b/dask/dask.sh index 5569574b4..e33d8e60b 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -398,7 +398,6 @@ EOF } function install_dask() { - local DASK_VERSION='2024.6' if is_cuda12 ; then local python_spec="python=3.10" local cuda_spec="cuda-version>=12,<=12.5" diff --git a/dask/test_dask.py b/dask/test_dask.py index 6298fa99b..a5d4314e4 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -10,13 +10,9 @@ class DaskTestCase(DataprocTestCase): COMPONENT = 'dask' INIT_ACTIONS = ['dask/dask.sh'] - SKEIN_TEST_SCRIPT = 'verify_skein.py' DASK_YARN_TEST_SCRIPT = 'verify_dask_yarn.py' DASK_STANDALONE_TEST_SCRIPT = 'verify_dask_standalone.py' - def verify_skein(self, name): - self._run_dask_test_script(name, self.SKEIN_TEST_SCRIPT) - def verify_dask_yarn(self, name): self._run_dask_test_script(name, self.DASK_YARN_TEST_SCRIPT) @@ -72,9 +68,6 @@ def test_dask(self, configuration, instances, runtime): if runtime == "standalone": self.verify_dask_standalone(name, master_hostname) else: - self.verify_skein(name) - # https://github.com/dask/dask-yarn/pull/162 - self.skipTest("dask-yarn known to fail presently.") self.verify_dask_yarn(name) if __name__ == '__main__': diff --git a/dask/verify_skein.py b/dask/verify_skein.py deleted file mode 100644 index 5b80edd07..000000000 --- a/dask/verify_skein.py +++ /dev/null @@ -1,13 +0,0 @@ -#https://github.com/dask/dask-yarn/issues/101#issuecomment-539529524 -import skein - -spec = skein.ApplicationSpec.from_yaml(""" -name: verify-skein -queue: default - -master: - script: echo "Things worked!" 
-""") - -client = skein.Client() -client.submit(spec) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 8dde8f167..980a5aeb4 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -97,7 +97,7 @@ function install_dask_rapids() { if is_cuda12 ; then local python_ver="3.10" local cuda_spec="cuda-version>=12,<=12.5" - local dask_spec="dask" + local dask_spec="dask>=2024.5" local numba_spec="numba" elif is_cuda11 ; then local python_ver="3.9" @@ -106,15 +106,16 @@ function install_dask_rapids() { local numba_spec="numba<0.56" fi - local CONDA_PACKAGES=("${cuda_spec}" - "rapids=${RAPIDS_VERSION}" - "${dask_spec}" - "dask-bigquery" - "dask-ml" - "dask-sql" - "cudf" - "${numba_spec}" - ) + local CONDA_PACKAGES=( + "${cuda_spec}" + "rapids=${RAPIDS_VERSION}" + "${dask_spec}" + "dask-bigquery" + "dask-ml" + "dask-sql" + "cudf" + "${numba_spec}" + ) # Install cuda, rapids, dask local is_installed="0" @@ -255,7 +256,7 @@ function configure_dask_yarn() { # https://yarn.dask.org/en/latest/configuration.html#default-configuration yarn: - environment: environment://${conda_env} + environment: python://${conda_env}/bin/python worker: count: 2 From 8e93293082782cf78b08f43ed246137c54837ce8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 14 Aug 2024 14:51:46 -0700 Subject: [PATCH 44/95] removed pinning for numba as per jakirkham --- rapids/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 980a5aeb4..4818827c8 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -103,7 +103,7 @@ function install_dask_rapids() { local python_ver="3.9" local cuda_spec="cuda-version>=11,<11.6" local dask_spec="dask" - local numba_spec="numba<0.56" + local numba_spec="numba" fi local CONDA_PACKAGES=( From 1b82dc1b5b5dbaf3e050646ff9e48a45e9633ad3 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 14 Aug 2024 15:00:10 -0700 Subject: [PATCH 45/95] easing the version constraints some --- rapids/rapids.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 4818827c8..1a7f87f22 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -95,12 +95,12 @@ function execute_with_retries() { readonly conda_env="/opt/conda/miniconda3/envs/dask-rapids" function install_dask_rapids() { if is_cuda12 ; then - local python_ver="3.10" - local cuda_spec="cuda-version>=12,<=12.5" + local python_ver="python>=3.10" + local cuda_spec="cuda-version>=12,<12.6" local dask_spec="dask>=2024.5" local numba_spec="numba" elif is_cuda11 ; then - local python_ver="3.9" + local python_spec="python>=3.9" local cuda_spec="cuda-version>=11,<11.6" local dask_spec="dask" local numba_spec="numba" @@ -128,7 +128,7 @@ function install_dask_rapids() { time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ ${CONDA_PACKAGES[*]} \ - "python=${python_ver}" + "${python_spec}" if [[ "$?" == "0" ]] ; then is_installed="1" break From 7d65472111461585c33b20914cd4afa5bf302e9e Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Wed, 14 Aug 2024 17:30:51 -0700 Subject: [PATCH 46/95] fully changing the variable name --- rapids/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 1a7f87f22..4e974c94f 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -95,7 +95,7 @@ function execute_with_retries() { readonly conda_env="/opt/conda/miniconda3/envs/dask-rapids" function install_dask_rapids() { if is_cuda12 ; then - local python_ver="python>=3.10" + local python_spec="python>=3.10" local cuda_spec="cuda-version>=12,<12.6" local dask_spec="dask>=2024.5" local numba_spec="numba" From 7cdf483ee23ea3b626b54044dffb04a63997ff5a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 15 Aug 2024 13:25:14 -0700 Subject: [PATCH 47/95] removing test_skein.py --- dask/test_skein.py | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 dask/test_skein.py diff --git a/dask/test_skein.py b/dask/test_skein.py deleted file mode 100644 index 2846fafb8..000000000 --- a/dask/test_skein.py +++ /dev/null @@ -1,13 +0,0 @@ -#https://github.com/dask/dask-yarn/issues/101#issuecomment-539529524 -import skein - -spec = skein.ApplicationSpec.from_yaml(""" -name: debug-skein -queue: root - -master: - script: echo "Things worked!" -""") - -client = skein.Client() -client.submit(spec) From ca74b49180f42531f3fd350aee7f6e159972e7f5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 15 Aug 2024 13:25:46 -0700 Subject: [PATCH 48/95] removed extra lines from rebase --- dask/dask.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/dask/dask.sh b/dask/dask.sh index e33d8e60b..f492f27f6 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -95,7 +95,6 @@ function install_systemd_dask_worker() { mkdir -p "${dask_worker_local_dir}" - local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh" cat <"${DASK_WORKER_LAUNCHER}" @@ -104,7 +103,6 @@ LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" echo "dask worker starting, logging to \${LOGFILE}" ${DASK_CONDA_ENV}/bin/dask worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 EOF - ) chmod 750 "${DASK_WORKER_LAUNCHER}" From 2e7979f1c0765261e746a8f7b5cbf6d01a357c7c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 15 Aug 2024 13:27:38 -0700 Subject: [PATCH 49/95] reducing line count --- rapids/rapids.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 4e974c94f..f9c964344 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -83,10 +83,9 @@ readonly SPARK_CONF_DIR='/etc/spark/conf' function execute_with_retries() { local -r cmd=$1 - for ((i = 0; i < 10; i++)); do + for i in {0..9} ; do if eval "$cmd"; then - return 0 - fi + return 0 ; fi sleep 5 done return 1 From de965face3ce8841b7a407b62356267b08655905 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Thu, 15 Aug 2024 16:56:59 -0700 Subject: [PATCH 50/95] relaxed cuda version to 11.8 --- rapids/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index f9c964344..debd813a5 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -100,7 +100,7 @@ function install_dask_rapids() { local numba_spec="numba" elif is_cuda11 ; then local python_spec="python>=3.9" - local cuda_spec="cuda-version>=11,<11.6" + local cuda_spec="cuda-version>=11,<=11.8" local dask_spec="dask" local numba_spec="numba" fi From d01e3499d17bad523b3790ac515dfcafc8c7c506 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 15 Aug 2024 22:38:22 -0700 Subject: [PATCH 51/95] disabling rocky9 tests for now --- rapids/rapids.sh | 13 +++++++------ rapids/test_rapids.py | 11 +++++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index debd813a5..042f8cd08 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -82,12 +82,13 @@ readonly DASK_YARN_CONFIG_FILE=/etc/dask/config.yaml readonly SPARK_CONF_DIR='/etc/spark/conf' function execute_with_retries() { - local -r cmd=$1 + local -r cmd="$*" for i in {0..9} ; do if eval "$cmd"; then return 0 ; fi sleep 5 done + echo "Cmd '${cmd}' failed." return 1 } @@ -149,20 +150,20 @@ function install_spark_rapids() { local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' if [[ "${SPARK_VERSION}" == "3"* ]]; then - wget -nv --timeout=30 --tries=5 --retry-connrefused \ + execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${dmlc_repo_url}/xgboost4j-spark-gpu_${SCALA_VER}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${SCALA_VER}-${XGBOOST_VERSION}.jar" \ -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ + execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${dmlc_repo_url}/xgboost4j-gpu_${SCALA_VER}/${XGBOOST_VERSION}/xgboost4j-gpu_${SCALA_VER}-${XGBOOST_VERSION}.jar" \ -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ + execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${nvidia_repo_url}/rapids-4-spark_${SCALA_VER}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${SCALA_VER}-${SPARK_RAPIDS_VERSION}.jar" \ -P /usr/lib/spark/jars/ else - wget -nv --timeout=30 --tries=5 --retry-connrefused \ + execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${rapids_repo_url}/xgboost4j-spark_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j-spark_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ + execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${rapids_repo_url}/xgboost4j_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ -P /usr/lib/spark/jars/ fi diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 66a49f78c..04672c7dd 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -48,6 +48,7 @@ def verify_spark_job(self): self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME), instance_name) self.assert_instance_command( instance_name, """echo :quit | spark-shell \ + --conf spark.rapids.sql.incompatibleOps.enabled=true \ --conf spark.executor.resource.gpu.amount=1 \ --conf spark.task.resource.gpu.amount=1 \ --conf spark.dynamicAllocation.enabled=false -i {}""".format( @@ -107,7 
+108,10 @@ def test_rapids_spark(self, configuration, machine_suffixes, accelerator): self.verify_spark_instance("{}-{}".format(self.getClusterName(), machine_suffix)) # Only need to do this once - self.verify_spark_job() + if self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1"): + print("not supported on rocky 2.2") + else: + self.verify_spark_job() @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "11.8")) def test_non_default_cuda_versions(self, configuration, machine_suffixes, @@ -133,7 +137,10 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, self.verify_spark_instance("{}-{}".format(self.getClusterName(), machine_suffix)) # Only need to do this once - self.verify_spark_job() + if self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1"): + print("not supported on rocky 2.2") + else: + self.verify_spark_job() if __name__ == "__main__": From 6aa28a30ed24820b7e2d7dc614311176fb1d2d15 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 16 Aug 2024 08:36:53 -0700 Subject: [PATCH 52/95] skipping the whole test on rocky9 for now --- rapids/test_rapids.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 04672c7dd..7d8e55f69 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -87,6 +87,9 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, ("STANDARD", ["w-0"], GPU_T4)) def test_rapids_spark(self, configuration, machine_suffixes, accelerator): + if self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1"): + self.skipTest("Test is known to fail on 2.2-rocky9") + if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in pre 2.0 images") optional_components = None @@ -108,10 +111,7 @@ def test_rapids_spark(self, configuration, machine_suffixes, accelerator): self.verify_spark_instance("{}-{}".format(self.getClusterName(), machine_suffix)) # Only need to do this once - if self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1"): - print("not supported on rocky 2.2") - else: - self.verify_spark_job() + self.verify_spark_job() @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "11.8")) def test_non_default_cuda_versions(self, configuration, machine_suffixes, From 467ce89862d9bff82279321e47d11091605e81de Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 16 Aug 2024 10:33:13 -0700 Subject: [PATCH 53/95] trying 24.08 --- rapids/rapids.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 042f8cd08..30c57b775 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -50,10 +50,7 @@ readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VER function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } -if is_cuda11 ; then DEFAULT_DASK_RAPIDS_VERSION="22.06" -else DEFAULT_DASK_RAPIDS_VERSION="24.06" ; fi - -readonly DEFAULT_DASK_RAPIDS_VERSION +readonly DEFAULT_DASK_RAPIDS_VERSION="24.08" readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) From 33b8d5ecc5c807ea54678992612fb8dab351eb7f Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 16 Aug 2024 14:15:20 -0700 Subject: [PATCH 54/95] increase max cluster age for rocky9 ; using CUDA_VERSION=11.8 for non-spark rapids runtime (this should be changed) --- integration_tests/dataproc_test_case.py | 8 ++++---- rapids/rapids.sh | 9 +++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index a0ae04046..149c4d5fa 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -122,9 +122,9 @@ def createCluster(self, args.append("--public-ip-address") for i in init_actions: - if "install_gpu_driver.sh" in i or \ - "mlvm.sh" in i or "rapids.sh" in i or \ - "spark-rapids.sh" in i or "horovod.sh" in i: + if "install_gpu_driver.sh" in i or "horovod.sh" in i or \ + "dask-rapids.sh" in i or "mlvm.sh" in i or \ + "spark-rapids.sh" in i: args.append("--no-shielded-secure-boot") if optional_components: @@ -178,7 +178,7 @@ def createCluster(self, args.append("--zone={}".format(self.cluster_zone)) if not FLAGS.skip_cleanup: - args.append("--max-age=30m") + args.append("--max-age=60m") args.append("--max-idle=5m") diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 30c57b775..35180ada3 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -46,7 +46,13 @@ else fi # RAPIDS config -readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) +readonly RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') +if [[ "${RAPIDS_RUNTIME}" != "SPARK" ]]; then # to match install_gpu_driver.sh ; they should both probably be removed + DEFAULT_CUDA_VERSION='11.8' +fi +CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) + +readonly CUDA_VERSION function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } @@ -56,7 +62,6 @@ readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) -readonly RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RUN_WORKER_ON_MASTER=$(get_metadata_attribute 'dask-cuda-worker-on-master' 'true') # SPARK config From 2c1c6a0003fd13b956896e2c38333d9968e0d1f5 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 16 Aug 2024 14:52:05 -0700 Subject: [PATCH 55/95] increase timeout for init actions as well as max-age from previous commit --- rapids/test_rapids.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 7d8e55f69..86ac9e16d 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -77,7 +77,7 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, master_accelerator=accelerator, worker_accelerator=accelerator, boot_disk_size="100GB", - timeout_in_minutes=30) + timeout_in_minutes=60) for machine_suffix in machine_suffixes: self.verify_dask_instance("{}-{}".format(self.getClusterName(), @@ -105,7 +105,7 @@ def test_rapids_spark(self, configuration, machine_suffixes, accelerator): master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, boot_disk_size="100GB", - timeout_in_minutes=30) + timeout_in_minutes=60) for machine_suffix in machine_suffixes: self.verify_spark_instance("{}-{}".format(self.getClusterName(), @@ -131,7 +131,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, boot_disk_size="100GB", - timeout_in_minutes=30) + timeout_in_minutes=60) for machine_suffix in machine_suffixes: self.verify_spark_instance("{}-{}".format(self.getClusterName(), From f4b6dda881f66330f62bb49d077dc0a353f8bf5c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 16 Aug 2024 16:45:24 -0700 Subject: [PATCH 56/95] reverted attempt to change a r/o variable --- rapids/rapids.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 35180ada3..c24ab7326 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -34,12 +34,12 @@ readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[ readonly DEFAULT_SPARK_RAPIDS_VERSION="24.06.1" if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then - readonly DEFAULT_CUDA_VERSION="12.4" + DEFAULT_CUDA_VERSION="12.4" readonly DEFAULT_XGBOOST_VERSION="2.0.3" readonly SPARK_VERSION="${SPARK_VERSION_ENV}" readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="" else - readonly DEFAULT_CUDA_VERSION="10.1" + DEFAULT_CUDA_VERSION="10.1" readonly DEFAULT_XGBOOST_VERSION="1.0.0" readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="Beta5" readonly SPARK_VERSION="2.x" @@ -50,6 +50,7 @@ readonly RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') if [[ "${RAPIDS_RUNTIME}" != "SPARK" ]]; then # to match install_gpu_driver.sh ; they should both probably be removed DEFAULT_CUDA_VERSION='11.8' fi +readonly DEFAULT_CUDA_VERSION CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) readonly CUDA_VERSION From d72bb0614c30e785a7ad735d99dbdd79b9334dc0 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 16 Aug 2024 17:35:10 -0700 Subject: [PATCH 57/95] trying with 24.08 --- rapids/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index c24ab7326..b0830e431 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -31,7 +31,7 @@ function get_metadata_attribute() { } readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="24.06.1" +readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.0" if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then DEFAULT_CUDA_VERSION="12.4" From e22cb459cca7a2dc1846f1b70b78a48a8d46bc39 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 16 Aug 2024 22:57:24 -0700 Subject: [PATCH 58/95] removing spark from the rapids tests --- rapids/rapids.sh | 16 ++++----- rapids/test_rapids.py | 80 ------------------------------------------- 2 files changed, 8 insertions(+), 88 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index b0830e431..6bade611d 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -31,8 +31,6 @@ function get_metadata_attribute() { } readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.0" - if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then DEFAULT_CUDA_VERSION="12.4" readonly DEFAULT_XGBOOST_VERSION="2.0.3" @@ -66,6 +64,7 @@ readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-maste readonly RUN_WORKER_ON_MASTER=$(get_metadata_attribute 'dask-cuda-worker-on-master' 'true') # SPARK config +readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.0" readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) readonly XGBOOST_GPU_SUB_VERSION=$(get_metadata_attribute 'spark-gpu-sub-version' ${DEFAULT_XGBOOST_GPU_SUB_VERSION}) @@ -98,9 +97,9 @@ function execute_with_retries() { readonly conda_env="/opt/conda/miniconda3/envs/dask-rapids" function install_dask_rapids() { if is_cuda12 ; then - local python_spec="python>=3.10" - local cuda_spec="cuda-version>=12,<12.6" - local dask_spec="dask>=2024.5" + local python_spec="python>=3.11" + local cuda_spec="cuda-version>=12,<13" + local dask_spec="dask>=2024.8" local numba_spec="numba" elif is_cuda11 ; then local python_spec="python>=3.9" @@ -177,9 +176,10 @@ function configure_spark() { cat >>${SPARK_CONF_DIR}/spark-defaults.conf < pkg_resources.parse_version("2.1"): - self.skipTest("Test is known to fail on 2.2-rocky9") - - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") - optional_components = None - - metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK") - - self.createCluster( - configuration, - self.INIT_ACTIONS, - optional_components=optional_components, - metadata=metadata, - machine_type="n1-standard-16", - master_accelerator=accelerator if configuration == "SINGLE" else None, - worker_accelerator=accelerator, - boot_disk_size="100GB", - timeout_in_minutes=60) - - for machine_suffix in machine_suffixes: - self.verify_spark_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - # Only need to do this once - self.verify_spark_job() - - @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "11.8")) - def test_non_default_cuda_versions(self, 
configuration, machine_suffixes, - accelerator, cuda_version): - - if self.getImageVersion() < pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") - - metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" - ",cuda-version={}".format(cuda_version)) - - self.createCluster( - configuration, - self.INIT_ACTIONS, - metadata=metadata, - machine_type="n1-standard-16", - master_accelerator=accelerator if configuration == "SINGLE" else None, - worker_accelerator=accelerator, - boot_disk_size="100GB", - timeout_in_minutes=60) - - for machine_suffix in machine_suffixes: - self.verify_spark_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - # Only need to do this once - if self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1"): - print("not supported on rocky 2.2") - else: - self.verify_spark_job() - - if __name__ == "__main__": absltest.main() From 973c81b7bce151bb21521cdb0dcbac9507731ae8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Sep 2024 15:14:48 -0700 Subject: [PATCH 59/95] 2.2.20 is known to work --- cloudbuild/cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild/cloudbuild.yaml b/cloudbuild/cloudbuild.yaml index b06df98ea..b590f4d9f 100644 --- a/cloudbuild/cloudbuild.yaml +++ b/cloudbuild/cloudbuild.yaml @@ -80,7 +80,7 @@ steps: id: 'dataproc-2.2-debian12-tests' waitFor: ['gcr-push'] entrypoint: 'bash' - args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '2.2-debian12'] + args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '2.2.20-debian12'] env: - 'COMMIT_SHA=$COMMIT_SHA' - 'CLOUDSDK_COMPUTE_REGION=us-central1' From 9963dfbf19dc5ecf8dc137274c83720c8359b891 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Sep 2024 15:21:43 -0700 Subject: [PATCH 60/95] using new fangled key management path --- cloudbuild/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index e191e8cf5..ae1f7789e 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -9,8 +9,9 @@ COPY --chown=ia-tests:ia-tests . /init-actions # Install Bazel: # https://docs.bazel.build/versions/master/install-ubuntu.html -RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list -RUN curl https://bazel.build/bazel-release.pub.gpg | apt-key add - +ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg +RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list +RUN curl https://bazel.build/bazel-release.pub.gpg | gpg --dearmor -o "${bazel_kr_path}" RUN apt-get update && apt-get install -y openjdk-8-jdk python3-setuptools bazel USER ia-tests From 5bbb8fc788258d3e3911724e889e5cd5fefcdf3a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Sep 2024 15:35:02 -0700 Subject: [PATCH 61/95] explicitly specifying path to curl ; also installing curl --- cloudbuild/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index ae1f7789e..16874e8fd 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -11,7 +11,8 @@ COPY --chown=ia-tests:ia-tests . 
/init-actions # https://docs.bazel.build/versions/master/install-ubuntu.html ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list -RUN curl https://bazel.build/bazel-release.pub.gpg | gpg --dearmor -o "${bazel_kr_path}" +RUN apt-get install -y -qq curl && apt-get clean +RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | gpg --dearmor -o "${bazel_kr_path}" RUN apt-get update && apt-get install -y openjdk-8-jdk python3-setuptools bazel USER ia-tests From ee13c9ab5fafde6f07c578562a3f94f63d6467b9 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Sep 2024 15:38:12 -0700 Subject: [PATCH 62/95] perform update before install --- cloudbuild/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index 16874e8fd..0fad0e9ff 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -11,7 +11,7 @@ COPY --chown=ia-tests:ia-tests . /init-actions # https://docs.bazel.build/versions/master/install-ubuntu.html ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list -RUN apt-get install -y -qq curl && apt-get clean +RUN apt-get update ; apt-get install -y -qq curl && apt-get clean RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | gpg --dearmor -o "${bazel_kr_path}" RUN apt-get update && apt-get install -y openjdk-8-jdk python3-setuptools bazel From c28bb4b911c85176734a4f009d40b23c24b2cdde Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 10 Oct 2024 19:01:23 -0700 Subject: [PATCH 63/95] modified to run as a custom-images script --- dask/dask.sh | 61 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/dask/dask.sh b/dask/dask.sh index f492f27f6..946608d9e 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -27,10 +27,49 @@ function os_codename() { grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } +function print_metadata_value() { + local readonly tmpfile=$(mktemp) + http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ + -s -o ${tmpfile} 2>/dev/null) + local readonly return_code=$? + # If the command completed successfully, print the metadata value to stdout. + if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then + cat ${tmpfile} + fi + rm -f ${tmpfile} + return ${return_code} +} + +function print_metadata_value_if_exists() { + local return_code=1 + local readonly url=$1 + print_metadata_value ${url} + return_code=$? + return ${return_code} +} + +function get_metadata_value() { + set +x + local readonly varname=$1 + local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 + # Print the instance metadata value. + print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + return_code=$? + # If the instance doesn't have the value, try the project. + if [[ ${return_code} != 0 ]]; then + print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + return_code=$? 
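    # For example, "attributes/dask-runtime" is looked up under
    # .../instance/attributes/dask-runtime first and, if that lookup fails,
    # under .../project/attributes/dask-runtime; the caller in
    # get_metadata_attribute supplies the final default value.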
+ fi + set -x + return ${return_code} +} + function get_metadata_attribute() { - local -r attribute_name=$1 + set +x + local -r attribute_name="$1" local -r default_value="${2:-}" - /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" + get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" + set -x } readonly DEFAULT_CUDA_VERSION="12.4" @@ -38,7 +77,7 @@ readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VER function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } -readonly DASK_RUNTIME="$(/usr/share/google/get_metadata_value attributes/dask-runtime || echo 'standalone')" +readonly DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')" # Dask 'standalone' config readonly DASK_SERVICE=dask-cluster @@ -87,8 +126,8 @@ EOF } enable_worker_service="0" -ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" -MASTER="$(/usr/share/google/get_metadata_value attributes/dataproc-master)" +ROLE="$(get_metadata_attribute dataproc-role)" +MASTER="$(get_metadata_attribute dataproc-master)" function install_systemd_dask_worker() { echo "Installing systemd Dask Worker service..." local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}" @@ -125,9 +164,9 @@ EOF if [[ "${ROLE}" != "Master" ]]; then enable_worker_service="1" else - local RUN_WORKER_ON_MASTER="$(/usr/share/google/get_metadata_value attributes/dask-worker-on-master || echo 'true')" + local RUN_WORKER_ON_MASTER="$(get_metadata_attribute dask-worker-on-master || echo 'true')" # Enable service on single-node cluster (no workers) - local worker_count="$(/usr/share/google/get_metadata_value attributes/dataproc-worker-count)" + local worker_count="$(get_metadata_attribute dataproc-worker-count)" if [[ "${worker_count}" == "0" ]]; then RUN_WORKER_ON_MASTER='true'; fi if [[ "${RUN_WORKER_ON_MASTER}" == "true" ]]; then @@ -431,8 +470,8 @@ function install_dask() { # Install dask local is_installed="0" - mamba="/opt/conda/default/bin/mamba" - conda="/opt/conda/default/bin/conda" + mamba="/opt/conda/miniconda3/bin/mamba" + conda="/opt/conda/miniconda3/bin/conda" set +e for installer in "${mamba}" "${conda}" ; do @@ -478,7 +517,7 @@ function main() { configure_knox_for_dask - local DASK_CLOUD_LOGGING="$(/usr/share/google/get_metadata_value attributes/dask-cloud-logging || echo 'false')" + local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then configure_fluentd_for_dask fi @@ -492,3 +531,5 @@ function main() { main + +df -h From 531a472ab9dfd59e54d0f5f1e1d48a6f22848ada Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 11 Oct 2024 14:22:08 -0700 Subject: [PATCH 64/95] remove delta from master for gpu/ --- gpu/install_gpu_driver.sh | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index b8a65109b..10b1aa061 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -56,10 +56,8 @@ function remove_old_backports { done } -# Return true if the first argument is equal to or less than the second argument function compare_versions_lte { [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; } -# Return true if the first argument is less than the second argument function compare_versions_lt() { [ "$1" = "$2" ] && return 1 || compare_versions_lte $1 $2 } @@ -1128,8 +1126,8 @@ function clean_up_sources_lists() { # # bigtop (primary) # - local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" + if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" @@ -1205,16 +1203,6 @@ function clean_up_sources_lists() { sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list fi - # - # cran-r - # - if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then - rm -f /usr/share/keyrings/cran-r.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7' | \ - gpg --dearmor -o /usr/share/keyrings/cran-r.gpg - sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list - fi - # # mysql # From 062f087452d14b58594df1caec79ba240ec51209 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 11 Oct 2024 14:32:27 -0700 Subject: [PATCH 65/95] recently tested to have worked with n1-standard-4 and 54GB --- rapids/test_rapids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index b54200481..e92822574 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -53,10 +53,10 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, configuration, self.DASK_INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-16", + machine_type="n1-standard-4", master_accelerator=accelerator, worker_accelerator=accelerator, - boot_disk_size="100GB", + boot_disk_size="60GB", timeout_in_minutes=60) for machine_suffix in machine_suffixes: From 050f8c4dd824647054ea8f64198e21eadb2652b3 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 11 Oct 2024 14:36:24 -0700 Subject: [PATCH 66/95] reduce log noise from Dockerfile --- cloudbuild/Dockerfile | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index 0fad0e9ff..7491cf7d4 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -1,4 +1,4 @@ -# This Dockerfile spins up a container where presubmit tests are run. +# This Dockerfile builds the container from which presubmit tests are run # Cloud Build orchestrates this process. FROM gcr.io/cloud-builders/gcloud @@ -10,9 +10,15 @@ COPY --chown=ia-tests:ia-tests . 
/init-actions # Install Bazel: # https://docs.bazel.build/versions/master/install-ubuntu.html ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg -RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list -RUN apt-get update ; apt-get install -y -qq curl && apt-get clean -RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | gpg --dearmor -o "${bazel_kr_path}" -RUN apt-get update && apt-get install -y openjdk-8-jdk python3-setuptools bazel +RUN apt-get install -y -qq curl && \ + apt-get clean +RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | \ + gpg --dearmor -o "${bazel_kr_path}" +RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis.com/bazel-apt stable jdk1.8" | \ + dd of=/etc/apt/sources.list.d/bazel.list status=none && \ + apt-get update -qq +RUN apt-get autoremove -y -qq && \ + apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel && \ + apt-get clean USER ia-tests From aa4afb92c18c9b11d8e7f884f29074cceec5a898 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 11 Oct 2024 14:40:25 -0700 Subject: [PATCH 67/95] removing delta from dask on master --- dask/dask.sh | 61 +++++++++------------------------------------------- 1 file changed, 10 insertions(+), 51 deletions(-) diff --git a/dask/dask.sh b/dask/dask.sh index 946608d9e..f492f27f6 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -27,49 +27,10 @@ function os_codename() { grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } -function print_metadata_value() { - local readonly tmpfile=$(mktemp) - http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ - -s -o ${tmpfile} 2>/dev/null) - local readonly return_code=$? - # If the command completed successfully, print the metadata value to stdout. - if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then - cat ${tmpfile} - fi - rm -f ${tmpfile} - return ${return_code} -} - -function print_metadata_value_if_exists() { - local return_code=1 - local readonly url=$1 - print_metadata_value ${url} - return_code=$? - return ${return_code} -} - -function get_metadata_value() { - set +x - local readonly varname=$1 - local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 - # Print the instance metadata value. - print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} - return_code=$? - # If the instance doesn't have the value, try the project. - if [[ ${return_code} != 0 ]]; then - print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} - return_code=$? 
- fi - set -x - return ${return_code} -} - function get_metadata_attribute() { - set +x - local -r attribute_name="$1" + local -r attribute_name=$1 local -r default_value="${2:-}" - get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" - set -x + /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" } readonly DEFAULT_CUDA_VERSION="12.4" @@ -77,7 +38,7 @@ readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VER function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } -readonly DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')" +readonly DASK_RUNTIME="$(/usr/share/google/get_metadata_value attributes/dask-runtime || echo 'standalone')" # Dask 'standalone' config readonly DASK_SERVICE=dask-cluster @@ -126,8 +87,8 @@ EOF } enable_worker_service="0" -ROLE="$(get_metadata_attribute dataproc-role)" -MASTER="$(get_metadata_attribute dataproc-master)" +ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" +MASTER="$(/usr/share/google/get_metadata_value attributes/dataproc-master)" function install_systemd_dask_worker() { echo "Installing systemd Dask Worker service..." local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}" @@ -164,9 +125,9 @@ EOF if [[ "${ROLE}" != "Master" ]]; then enable_worker_service="1" else - local RUN_WORKER_ON_MASTER="$(get_metadata_attribute dask-worker-on-master || echo 'true')" + local RUN_WORKER_ON_MASTER="$(/usr/share/google/get_metadata_value attributes/dask-worker-on-master || echo 'true')" # Enable service on single-node cluster (no workers) - local worker_count="$(get_metadata_attribute dataproc-worker-count)" + local worker_count="$(/usr/share/google/get_metadata_value attributes/dataproc-worker-count)" if [[ "${worker_count}" == "0" ]]; then RUN_WORKER_ON_MASTER='true'; fi if [[ "${RUN_WORKER_ON_MASTER}" == "true" ]]; then @@ -470,8 +431,8 @@ function install_dask() { # Install dask local is_installed="0" - mamba="/opt/conda/miniconda3/bin/mamba" - conda="/opt/conda/miniconda3/bin/conda" + mamba="/opt/conda/default/bin/mamba" + conda="/opt/conda/default/bin/conda" set +e for installer in "${mamba}" "${conda}" ; do @@ -517,7 +478,7 @@ function main() { configure_knox_for_dask - local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" + local DASK_CLOUD_LOGGING="$(/usr/share/google/get_metadata_value attributes/dask-cloud-logging || echo 'false')" if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then configure_fluentd_for_dask fi @@ -531,5 +492,3 @@ function main() { main - -df -h From c75d120f446600befaf6fd0029eb03bf40f42d4d Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 11 Oct 2024 16:31:18 -0700 Subject: [PATCH 68/95] update verify_dask_instance test to use systemd unit defined in dask and rapids init actions --- dask/dask.sh | 4 ++++ rapids/test_rapids.py | 8 +++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/dask/dask.sh b/dask/dask.sh index f492f27f6..8dbe0cf4b 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -465,6 +465,10 @@ function main() { # Create Dask service install_systemd_dask_service + # Create empty dask config file when running standalone + mkdir -p /etc/dask + touch /etc/dask/config.yaml + if [[ "$(hostname -s)" == "${MASTER}" ]]; then systemctl start "${DASK_SCHEDULER_SERVICE}" systemctl status "${DASK_SCHEDULER_SERVICE}" diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index e92822574..6d37f0b29 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -9,8 +9,7 @@ class RapidsTestCase(DataprocTestCase): COMPONENT = "rapids" - INIT_ACTIONS = ["gpu/install_gpu_driver.sh", "rapids/rapids.sh"] - DASK_INIT_ACTIONS = [ + INIT_ACTIONS = [ "gpu/install_gpu_driver.sh", "dask/dask.sh", "rapids/rapids.sh" ] @@ -24,8 +23,7 @@ class RapidsTestCase(DataprocTestCase): def verify_dask_instance(self, name): self.assert_instance_command( - name, "pgrep -f dask-cuda-worker || " - "grep 'class: \"dask_cuda.CUDAWorker\"' /etc/dask/config.yaml") + name, '[[ "$(systemctl show dask-worker -p SubState --value)" == "running" ]]') self.upload_test_file( os.path.join( @@ -51,7 +49,7 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, self.createCluster( configuration, - self.DASK_INIT_ACTIONS, + self.INIT_ACTIONS, metadata=metadata, machine_type="n1-standard-4", master_accelerator=accelerator, From 85ac0acb192a306055cb0d448c997f45d235cde0 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 14 Oct 2024 10:57:09 -0700 Subject: [PATCH 69/95] removing quotes from systemctl command --- rapids/test_rapids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 6d37f0b29..267575c16 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -23,7 +23,7 @@ class RapidsTestCase(DataprocTestCase): def verify_dask_instance(self, name): self.assert_instance_command( - name, '[[ "$(systemctl show dask-worker -p SubState --value)" == "running" ]]') + name, '[[ $(systemctl show dask-worker -p SubState --value) == running ]]') self.upload_test_file( os.path.join( From 3314334443474c00bb48c44b3a111108919ba36c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 14 Oct 2024 13:50:59 -0700 Subject: [PATCH 70/95] protecting from empty string state --- rapids/test_rapids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 267575c16..7f2cea08e 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -23,7 +23,7 @@ class RapidsTestCase(DataprocTestCase): def verify_dask_instance(self, name): self.assert_instance_command( - name, '[[ $(systemctl show dask-worker -p SubState --value) == running ]]') + name, '[[ X$(systemctl show dask-worker -p SubState --value)X == XrunningX ]]') self.upload_test_file( os.path.join( From c158a55599f6025814d774935958bbe665f8eddb Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Mon, 14 Oct 2024 14:15:56 -0700 Subject: [PATCH 71/95] replacing removed dask-runtime=yarn instance test --- rapids/test_rapids.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 7f2cea08e..48878973a 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -23,7 +23,8 @@ class RapidsTestCase(DataprocTestCase): def verify_dask_instance(self, name): self.assert_instance_command( - name, '[[ X$(systemctl show dask-worker -p SubState --value)X == XrunningX ]]') + name, "[[ \"X$(systemctl show dask-worker -p SubState --value)X\" == \"XrunningX\" ]] || " + "grep 'class: \"dask_cuda.CUDAWorker\"' /etc/dask/config.yaml") self.upload_test_file( os.path.join( From 3eda60d5ea89291da16fd308237d341f0684510e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 23 Oct 2024 23:29:14 -0700 Subject: [PATCH 72/95] [dask-rapids] merge from custom-images rapids/BUILD * removed dependence on verify_xgboost_spark.scala - this belongs in [spark-rapids] * removed dependence on dask rapids/rapids.sh * added utility functions * reverted dask_spec="dask>=2024.5" * using realpath to /opt/conda/miniconda3/bin/mamba instead of default symlink * remove conda environment [dask] if installed * asserting existence of directory depended on by the script when run as custom-images script * created exit_handler and prepare_to_install functions to set up and clean up rapids/test_rapids.py * refactored to make use of systemd unit defined in rapids.sh * added retry to ssh * removed condition to keep tests from running on 2.0 images --- rapids/BUILD | 2 - rapids/rapids.sh | 103 ++++++++++++++++++++++++++++++++++++++++-- rapids/test_rapids.py | 40 ++++++++++------ 3 files changed, 127 insertions(+), 18 deletions(-) diff --git a/rapids/BUILD b/rapids/BUILD index c4db3e191..c5e2d3569 100644 --- a/rapids/BUILD +++ b/rapids/BUILD @@ -8,8 +8,6 @@ py_test( srcs = ["test_rapids.py"], data = [ "rapids.sh", - "verify_xgboost_spark.scala", - "//dask:dask.sh", "//gpu:install_gpu_driver.sh", ], local = True, diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 6bade611d..ba6a1f1ba 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -19,6 +19,11 @@ set -euxo pipefail +function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } +function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } +function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } +function is_debuntu() { is_debian || is_ubuntu ; } + # Detect dataproc image version from its various names if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" @@ -99,7 +104,7 @@ function install_dask_rapids() { if is_cuda12 ; then local python_spec="python>=3.11" local cuda_spec="cuda-version>=12,<13" - local dask_spec="dask>=2024.8" + local dask_spec="dask>=2024.5" local numba_spec="numba" elif is_cuda11 ; then local python_spec="python>=3.9" @@ -121,8 +126,10 @@ function install_dask_rapids() { # Install cuda, rapids, dask local is_installed="0" - mamba="/opt/conda/default/bin/mamba" - conda="/opt/conda/default/bin/conda" + mamba="/opt/conda/miniconda3/bin/mamba" + conda="/opt/conda/miniconda3/bin/conda" + + "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]" for installer in "${mamba}" "${conda}" ; do set +e @@ -252,6 +259,7 @@ EOF function configure_dask_yarn() { # Replace config file on cluster. 
+ mkdir -p "$(dirname "${DASK_YARN_CONFIG_FILE}")" cat <"${DASK_YARN_CONFIG_FILE}" # Config file for Dask Yarn. # @@ -301,4 +309,93 @@ function main() { fi } +function exit_handler() { + # Free conda cache + /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1 + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # remove the tmpfs conda pkgs_dirs + if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do + if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then + rm -rf ${shmdir}/* + umount -f ${shmdir} + fi + done + + # Clean up OS package cache ; re-hold systemd package + if is_debuntu ; then + apt-get -y -qq clean + apt-get -y -qq autoremove + else + dnf clean all + fi + + # print disk usage statistics + if is_debuntu ; then + # Rocky doesn't have sort -h and fails when the argument is passed + du --max-depth 3 -hx / | sort -h | tail -10 + fi + + # Process disk usage logs from installation period + rm /tmp/keep-running-df + sleep 6s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem Size Used Avail Use% Mounted on +#/dev/vda2 6.8G 2.5G 4.0G 39% / + df -h + perl -e '$max=( sort + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } )[-1]; +print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log + + echo "exit_handler has completed" + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + set +e + dd if=/dev/zero of=/zero ; sync ; rm -f /zero + set -e + fi +} + +trap exit_handler EXIT + +function prepare_to_install(){ + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + # Write to a ramdisk instead of churning the persistent disk + if [[ ${free_mem} -ge 5250000 ]]; then + mkdir -p /mnt/shm + mount -t tmpfs tmpfs /mnt/shm + + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs /mnt/shm + mount -t tmpfs tmpfs /mnt/shm + + # Download pip packages to tmpfs + pip config set global.cache-dir /mnt/shm || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs /var/cache/dnf + fi + fi + + # Monitor disk usage in a screen session + apt-get install -y -qq screen + rm -f /tmp/disk-usage.log + touch /tmp/keep-running-df + screen -d -m -US keep-running-df \ + bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df -h / | tee -a /tmp/disk-usage.log ; sleep 5s ; done' +} + +prepare_to_install + main diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 48878973a..05a87a4da 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -1,4 +1,5 @@ import os +import time import pkg_resources from absl.testing import absltest @@ -6,11 +7,10 @@ from integration_tests.dataproc_test_case import DataprocTestCase - class RapidsTestCase(DataprocTestCase): COMPONENT = "rapids" INIT_ACTIONS = [ - "gpu/install_gpu_driver.sh", "dask/dask.sh", "rapids/rapids.sh" + "gpu/install_gpu_driver.sh", "rapids/rapids.sh" ] GPU_P100 = "type=nvidia-tesla-p100" @@ -21,15 +21,24 @@ class RapidsTestCase(DataprocTestCase): # Tests for RAPIDS init action DASK_RAPIDS_TEST_SCRIPT_FILE_NAME = "verify_rapids_dask.py" - def verify_dask_instance(self, name): + def verify_dask_worker_service(self, name): + # Retry the first ssh to ensure it has enough time to propagate SSH 
keys + for try_number in range(0, 3): + try: + self.assert_instance_command( + name, "[[ X$(systemctl show dask-worker -p SubState --value)X == XrunningX ]]") + break + except: + time.sleep(2**try_number) + + def verify_dask_config(self, name): self.assert_instance_command( - name, "[[ \"X$(systemctl show dask-worker -p SubState --value)X\" == \"XrunningX\" ]] || " - "grep 'class: \"dask_cuda.CUDAWorker\"' /etc/dask/config.yaml") + name, "grep 'class: \"dask_cuda.CUDAWorker\"' /etc/dask/config.yaml") - self.upload_test_file( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME), name) + def run_dask_script(self, name): + test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), + self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME) + self.upload_test_file(test_filename, name) verify_cmd = "/opt/conda/miniconda3/envs/dask-rapids/bin/python {}".format( self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) @@ -41,9 +50,6 @@ def verify_dask_instance(self, name): def test_rapids_dask(self, configuration, machine_suffixes, accelerator, dask_runtime): - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre 2.0 images") - metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=DASK" if dask_runtime: metadata += ",dask-runtime={}".format(dask_runtime) @@ -59,8 +65,16 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, timeout_in_minutes=60) for machine_suffix in machine_suffixes: - self.verify_dask_instance("{}-{}".format(self.getClusterName(), + if dask_runtime == 'standalone' or dask_runtime == None: + self.verify_dask_worker_service("{}-{}".format(self.getClusterName(), + machine_suffix)) + elif dask_runtime == 'yarn': + self.verify_dask_config("{}-{}".format(self.getClusterName(), machine_suffix)) + self.run_dask_script("{}-{}".format(self.getClusterName(), + machine_suffix)) + + if __name__ == "__main__": absltest.main() From dbfa4c0965999017627fb5b6c51d1f629aeec00e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 24 Oct 2024 00:03:03 -0700 Subject: [PATCH 73/95] revert to master --- dask/dask.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dask/dask.sh b/dask/dask.sh index 8dbe0cf4b..f492f27f6 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -465,10 +465,6 @@ function main() { # Create Dask service install_systemd_dask_service - # Create empty dask config file when running standalone - mkdir -p /etc/dask - touch /etc/dask/config.yaml - if [[ "$(hostname -s)" == "${MASTER}" ]]; then systemctl start "${DASK_SCHEDULER_SERVICE}" systemctl status "${DASK_SCHEDULER_SERVICE}" From 1c9c7fedcce27b991dbad651c9ce247c02e761d1 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Thu, 24 Oct 2024 13:53:57 -0700 Subject: [PATCH 74/95] refactored to match dask ; removed all spark code paths (see spark-rapids) --- rapids/rapids.sh | 660 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 460 insertions(+), 200 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index ba6a1f1ba..aa5000a64 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -21,72 +21,56 @@ set -euxo pipefail function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } +function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } function is_debuntu() { is_debian || is_ubuntu ; } -# Detect dataproc image version from its various names -if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" -fi - -function get_metadata_attribute() { - local -r attribute_name=$1 - local -r default_value="${2:-}" - /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +function print_metadata_value() { + local readonly tmpfile=$(mktemp) + http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ + -s -o ${tmpfile} 2>/dev/null) + local readonly return_code=$? + # If the command completed successfully, print the metadata value to stdout. + if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then + cat ${tmpfile} + fi + rm -f ${tmpfile} + return ${return_code} } -readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -if [[ "${SPARK_VERSION_ENV%%.*}" == "3" ]]; then - DEFAULT_CUDA_VERSION="12.4" - readonly DEFAULT_XGBOOST_VERSION="2.0.3" - readonly SPARK_VERSION="${SPARK_VERSION_ENV}" - readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="" -else - DEFAULT_CUDA_VERSION="10.1" - readonly DEFAULT_XGBOOST_VERSION="1.0.0" - readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="Beta5" - readonly SPARK_VERSION="2.x" -fi - -# RAPIDS config -readonly RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') -if [[ "${RAPIDS_RUNTIME}" != "SPARK" ]]; then # to match install_gpu_driver.sh ; they should both probably be removed - DEFAULT_CUDA_VERSION='11.8' -fi -readonly DEFAULT_CUDA_VERSION -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) - -readonly CUDA_VERSION -function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } -function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } - -readonly DEFAULT_DASK_RAPIDS_VERSION="24.08" -readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) - -readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) -readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) - -readonly RUN_WORKER_ON_MASTER=$(get_metadata_attribute 'dask-cuda-worker-on-master' 'true') - -# SPARK config -readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.0" -readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) -readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) -readonly XGBOOST_GPU_SUB_VERSION=$(get_metadata_attribute 'spark-gpu-sub-version' ${DEFAULT_XGBOOST_GPU_SUB_VERSION}) +function print_metadata_value_if_exists() { + local return_code=1 + local readonly url=$1 + print_metadata_value ${url} + return_code=$? 
+ return ${return_code} +} -# Scala config -readonly SCALA_VER="2.12" +function get_metadata_value() { + set +x + local readonly varname=$1 + local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 + # Print the instance metadata value. + print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + return_code=$? + # If the instance doesn't have the value, try the project. + if [[ ${return_code} != 0 ]]; then + print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + return_code=$? + fi + set -x + return ${return_code} +} -# Dask config -readonly DASK_RUNTIME="$(/usr/share/google/get_metadata_value attributes/dask-runtime || echo 'standalone')" -readonly DASK_LAUNCHER=/usr/local/bin/dask-launcher.sh -readonly DASK_SERVICE=dask-cluster -readonly DASK_WORKER_SERVICE=dask-worker -readonly DASK_SCHEDULER_SERVICE=dask-scheduler -readonly DASK_YARN_CONFIG_FILE=/etc/dask/config.yaml +function get_metadata_attribute() ( + set +x + local -r attribute_name="$1" + local -r default_value="${2:-}" + get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +) -# Dataproc configurations -readonly SPARK_CONF_DIR='/etc/spark/conf' +function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } +function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } function execute_with_retries() { local -r cmd="$*" @@ -99,114 +83,30 @@ function execute_with_retries() { return 1 } -readonly conda_env="/opt/conda/miniconda3/envs/dask-rapids" -function install_dask_rapids() { - if is_cuda12 ; then - local python_spec="python>=3.11" - local cuda_spec="cuda-version>=12,<13" - local dask_spec="dask>=2024.5" - local numba_spec="numba" - elif is_cuda11 ; then - local python_spec="python>=3.9" - local cuda_spec="cuda-version>=11,<=11.8" - local dask_spec="dask" - local numba_spec="numba" - fi - - local CONDA_PACKAGES=( - "${cuda_spec}" - "rapids=${RAPIDS_VERSION}" - "${dask_spec}" - "dask-bigquery" - "dask-ml" - "dask-sql" - "cudf" - "${numba_spec}" - ) - - # Install cuda, rapids, dask - local is_installed="0" - mamba="/opt/conda/miniconda3/bin/mamba" - conda="/opt/conda/miniconda3/bin/conda" - - "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]" - - for installer in "${mamba}" "${conda}" ; do - set +e - test -d "${conda_env}" || \ - time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ - -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ - ${CONDA_PACKAGES[*]} \ - "${python_spec}" - if [[ "$?" == "0" ]] ; then - is_installed="1" - break - else - "${conda}" config --set channel_priority flexible - fi - set -e - done - if [[ "${is_installed}" == "0" ]]; then - echo "failed to install dask" - return 1 - fi - set -e -} +function configure_dask_yarn() { + readonly DASK_YARN_CONFIG_DIR=/etc/dask/ + readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml + # Minimal custom configuration is required for this + # setup. Please see https://yarn.dask.org/en/latest/quickstart.html#usage + # for information on tuning Dask-Yarn environments. 
+ mkdir -p "${DASK_YARN_CONFIG_DIR}" -function install_spark_rapids() { - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - - if [[ "${SPARK_VERSION}" == "3"* ]]; then - execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_${SCALA_VER}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${SCALA_VER}-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_${SCALA_VER}/${XGBOOST_VERSION}/xgboost4j-gpu_${SCALA_VER}-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_${SCALA_VER}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${SCALA_VER}-${SPARK_RAPIDS_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - else - execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${rapids_repo_url}/xgboost4j-spark_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j-spark_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - execute_with_retries wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${rapids_repo_url}/xgboost4j_${SPARK_VERSION}/${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}/xgboost4j_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - fi -} + cat <"${DASK_YARN_CONFIG_FILE}" +# Config file for Dask Yarn. +# +# These values are joined on top of the default config, found at +# https://yarn.dask.org/en/latest/configuration.html#default-configuration -function configure_spark() { - if [[ "${SPARK_VERSION}" == "3"* ]]; then - cat >>${SPARK_CONF_DIR}/spark-defaults.conf <>${SPARK_CONF_DIR}/spark-defaults.conf <> "\${LOGFILE}" 2>&1 +${DASK_CONDA_ENV}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_WORKER_LAUNCHER}" @@ -244,8 +144,9 @@ EOF if [[ "${ROLE}" != "Master" ]]; then enable_worker_service="1" else + local RUN_WORKER_ON_MASTER=$(get_metadata_attribute dask-cuda-worker-on-master 'true') # Enable service on single-node cluster (no workers) - local worker_count="$(/usr/share/google/get_metadata_value attributes/dataproc-worker-count)" + local worker_count="$(get_metadata_attribute dataproc-worker-count)" if [[ "${worker_count}" == "0" || "${RUN_WORKER_ON_MASTER}" == "true" ]]; then enable_worker_service="1" fi @@ -257,47 +158,371 @@ EOF fi } -function configure_dask_yarn() { - # Replace config file on cluster. - mkdir -p "$(dirname "${DASK_YARN_CONFIG_FILE}")" - cat <"${DASK_YARN_CONFIG_FILE}" -# Config file for Dask Yarn. -# -# These values are joined on top of the default config, found at -# https://yarn.dask.org/en/latest/configuration.html#default-configuration +function install_systemd_dask_scheduler() { + # only run scheduler on primary master + if [[ "$(hostname -s)" != "${MASTER}" ]]; then return ; fi + echo "Installing systemd Dask Scheduler service..." 
+ local -r dask_scheduler_local_dir="/tmp/${DASK_SCHEDULER_SERVICE}" -yarn: - environment: python://${conda_env}/bin/python + mkdir -p "${dask_scheduler_local_dir}" - worker: - count: 2 - gpus: 1 - class: "dask_cuda.CUDAWorker" + local DASK_SCHEDULER_LAUNCHER="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh" + + cat <"${DASK_SCHEDULER_LAUNCHER}" +#!/bin/bash +LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log" +echo "dask scheduler starting, logging to \${LOGFILE}" +${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1 +EOF + + chmod 750 "${DASK_SCHEDULER_LAUNCHER}" + + local -r dask_service_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service" + cat <"${dask_service_file}" +[Unit] +Description=Dask Scheduler Service +[Service] +Type=simple +Restart=on-failure +ExecStart=/bin/bash -c 'exec ${DASK_SCHEDULER_LAUNCHER}' +[Install] +WantedBy=multi-user.target EOF + chmod a+r "${dask_service_file}" + + systemctl daemon-reload + + # Enable the service + systemctl enable "${DASK_SCHEDULER_SERVICE}" +} + +function install_systemd_dask_service() { + install_systemd_dask_scheduler + install_systemd_dask_worker +} + +function restart_knox() { + systemctl stop knox + rm -rf "${KNOX_HOME}/data/deployments/*" + systemctl start knox +} + +function configure_knox_for_dask() { + if [[ ! -d "${KNOX_HOME}" ]]; then + echo "Skip configuring Knox rules for Dask" + return 0 + fi + + local DASK_UI_PORT=8787 + if [[ -f /etc/knox/conf/topologies/default.xml ]]; then + sed -i \ + "/<\/topology>/i DASK<\/role>http://localhost:${DASK_UI_PORT}<\/url><\/service> DASKWS<\/role>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \ + /etc/knox/conf/topologies/default.xml + fi + + mkdir -p "${KNOX_DASK_DIR}" + + cat >"${KNOX_DASK_DIR}/service.xml" <<'EOF' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOF + + cat >"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOF + + mkdir -p "${KNOX_DASKWS_DIR}" + + cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF' + + + + + + + + + + + + + + + + + + + +EOF + + cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF' + + + + + + + +EOF + + chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}" + + # Do not restart knox during pre-init script run + if [[ -n "${ROLE}" ]]; then + restart_knox + fi +} + +function configure_fluentd_for_dask() { + if [[ "$(hostname -s)" == "${MASTER}" ]]; then + cat >/etc/google-fluentd/config.d/dataproc-dask.conf < + @type tail + path /var/log/dask-scheduler.log + pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos + read_from_head true + tag google.dataproc.dask-scheduler + + @type none + + + + + @type record_transformer + + filename dask-scheduler.log + + +EOF + fi + + if [[ "${enable_worker_service}" == "1" ]]; then + cat >>/etc/google-fluentd/config.d/dataproc-dask.conf < + @type tail + path /var/log/dask-worker.log + pos_file /var/tmp/fluentd.dataproc.dask.worker.pos + read_from_head true + tag google.dataproc.dask-worker + + @type none + + + + + @type record_transformer + + filename dask-worker.log + + +EOF + fi + + systemctl restart google-fluentd +} + +function install_dask_rapids() { + if is_cuda12 ; then + local python_spec="python>=3.11" + local cuda_spec="cuda-version>=12,<13" + local dask_spec="dask>=2024.5" + local numba_spec="numba" + elif is_cuda11 ; then + local python_spec="python>=3.9" + local cuda_spec="cuda-version>=11,<12.0a0" + local dask_spec="dask" + local numba_spec="numba" + fi + + 
CONDA_PACKAGES=() + if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then + # Pin `distributed` and `dask` package versions to old release + # because `dask-yarn` 0.9 uses skein in a way which + # is not compatible with `distributed` package 2022.2 and newer: + # https://github.com/dask/dask-yarn/issues/155 + + dask_spec="dask<2022.2" + python_spec="python>=3.7,<3.8.0a0" + if is_ubuntu18 ; then + # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic + CONDA_PACKAGES+=("fiona<1.8.22") + fi + CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2") + fi + + CONDA_PACKAGES+=( + "${cuda_spec}" + "rapids=${RAPIDS_VERSION}" + "${dask_spec}" + "dask-bigquery" + "dask-ml" + "dask-sql" + "cudf" + "${numba_spec}" + ) + + # Install cuda, rapids, dask + local is_installed="0" + mamba="/opt/conda/miniconda3/bin/mamba" + conda="/opt/conda/miniconda3/bin/conda" + + "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]" + + ( set +e + for installer in "${mamba}" "${conda}" ; do + test -d "${DASK_CONDA_ENV}" || \ + time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ + -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + ${CONDA_PACKAGES[*]} \ + "${python_spec}" + local retval=$? + sync + if [[ "$retval" == "0" ]] ; then + is_installed="1" + break + fi + "${conda}" config --set channel_priority flexible + done + if [[ "${is_installed}" == "0" ]]; then + echo "failed to install dask" + return 1 + fi + ) } function main() { - if [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then - # Install RAPIDS - install_dask_rapids - - # In "standalone" mode, Dask relies on a shell script to launch. - # In "yarn" mode, it relies a config.yaml file. - if [[ "${DASK_RUNTIME}" == "standalone" ]]; then - install_systemd_dask_worker - elif [[ "${DASK_RUNTIME}" == "yarn" ]]; then - configure_dask_yarn + # Install Dask with RAPIDS + install_dask_rapids + + # In "standalone" mode, Dask relies on a systemd unit to launch. + # In "yarn" mode, it relies a config.yaml file. + if [[ "${DASK_RUNTIME}" == "yarn" ]]; then + # Create Dask YARN config file + configure_dask_yarn + elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then + # Create Dask service + install_systemd_dask_service + + if [[ "$(hostname -s)" == "${MASTER}" ]]; then + systemctl start "${DASK_SCHEDULER_SERVICE}" + systemctl status "${DASK_SCHEDULER_SERVICE}" + fi + + echo "Starting Dask 'standalone' cluster..." + if [[ "${enable_worker_service}" == "1" ]]; then + systemctl start "${DASK_WORKER_SERVICE}" + systemctl status "${DASK_WORKER_SERVICE}" + fi + + configure_knox_for_dask + + local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" + if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then + configure_fluentd_for_dask fi - echo "RAPIDS installed with Dask runtime" - elif [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then - install_spark_rapids - configure_spark - echo "RAPIDS initialized with Spark runtime" else - echo "Unsupported RAPIDS Runtime: ${RAPIDS_RUNTIME}" + echo "Unsupported Dask Runtime: ${DASK_RUNTIME}" exit 1 fi + echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized." if [[ "${ROLE}" == "Master" ]]; then systemctl restart hadoop-yarn-resourcemanager.service # Restart NodeManager on Master as well if this is a single-node-cluster. 
@@ -309,7 +534,10 @@ function main() { fi } -function exit_handler() { +function exit_handler() ( + set +e + echo "Exit handler invoked" + # Free conda cache /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1 @@ -342,13 +570,13 @@ function exit_handler() { fi # Process disk usage logs from installation period - rm /tmp/keep-running-df + rm -f /tmp/keep-running-df sleep 6s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): #Filesystem Size Used Avail Use% Mounted on #/dev/vda2 6.8G 2.5G 4.0G 39% / - df -h + df --si perl -e '$max=( sort map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } )[-1]; @@ -358,15 +586,43 @@ print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - set +e dd if=/dev/zero of=/zero ; sync ; rm -f /zero - set -e fi -} + + return 0 +) trap exit_handler EXIT function prepare_to_install(){ + readonly DEFAULT_CUDA_VERSION="12.4" + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) + readonly CUDA_VERSION + + readonly ROLE=$(get_metadata_attribute dataproc-role) + readonly MASTER=$(get_metadata_attribute dataproc-master) + + # RAPIDS config + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') + readonly RAPIDS_RUNTIME + + readonly DEFAULT_DASK_RAPIDS_VERSION="24.08" + readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) + + # Dask config + DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')" + readonly DASK_RUNTIME + readonly DASK_SERVICE=dask-cluster + readonly DASK_WORKER_SERVICE=dask-worker + readonly DASK_SCHEDULER_SERVICE=dask-scheduler + readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/dask-rapids" + + # Knox config + readonly KNOX_HOME=/usr/lib/knox + readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0" + readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0" + enable_worker_service="0" + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk if [[ ${free_mem} -ge 5250000 ]]; then @@ -389,11 +645,15 @@ function prepare_to_install(){ fi # Monitor disk usage in a screen session - apt-get install -y -qq screen + if is_debuntu ; then + apt-get install -y -qq screen + elif is_rocky ; then + dnf -y -q install screen + fi rm -f /tmp/disk-usage.log touch /tmp/keep-running-df screen -d -m -US keep-running-df \ - bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df -h / | tee -a /tmp/disk-usage.log ; sleep 5s ; done' + bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df --si / | tee -a /tmp/disk-usage.log ; sleep 5s ; done' } prepare_to_install From 1c7a31dab26eaf2bae0f7be8817bdd04b629b12e Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 25 Oct 2024 03:15:01 -0700 Subject: [PATCH 75/95] added some testing helpers and documentation --- rapids/Dockerfile | 40 +++++++++++++++++++ rapids/bazel.screenrc | 17 ++++++++ rapids/env.json.sample | 7 ++++ rapids/manual-test-runner.sh | 75 ++++++++++++++++++++++++++++++++++++ rapids/rapids.sh | 5 +-- rapids/run-bazel-tests.sh | 23 +++++++++++ rapids/test_rapids.py | 14 +++---- 7 files changed, 169 insertions(+), 12 deletions(-) create mode 100644 rapids/Dockerfile create mode 100644 rapids/bazel.screenrc create mode 100644 rapids/env.json.sample create mode 100644 rapids/manual-test-runner.sh create mode 100644 rapids/run-bazel-tests.sh diff --git a/rapids/Dockerfile b/rapids/Dockerfile new file mode 100644 index 000000000..53d91731b --- /dev/null +++ b/rapids/Dockerfile @@ -0,0 +1,40 @@ +# This Dockerfile builds the container from which rapids tests are run +# This process needs to be executed manually from a git clone +# +# See manual-test-runner.sh for instructions + +FROM gcr.io/cloud-builders/gcloud + +RUN useradd -m -d /home/ia-tests -s /bin/bash ia-tests + +RUN apt-get -qq update \ + && apt-get -y -qq install \ + apt-transport-https apt-utils \ + ca-certificates libmime-base64-perl gnupg \ + curl jq less screen + +# Install bazel signing key, repo and package +ENV bazel_kr_path=/usr/share/keyrings/bazel-release.pub.gpg +ENV bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" + +RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \ + | gpg --dearmor -o "${bazel_kr_path}" \ + && echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" \ + | dd of=/etc/apt/sources.list.d/bazel.list status=none \ + && apt-get update -qq + +RUN apt-get autoremove -y -qq && \ + apt-get install -y -qq default-jdk python3-setuptools bazel && \ + apt-get clean + + +# Install here any utilities you find useful when troubleshooting +RUN apt-get -y -qq install emacs-nox vim uuid-runtime && apt-get clean + +WORKDIR /init-actions + +USER ia-tests +COPY --chown=ia-tests:ia-tests . 
${WORKDIR} + +ENTRYPOINT ["/bin/bash"] +#CMD ["/bin/bash"] diff --git a/rapids/bazel.screenrc b/rapids/bazel.screenrc new file mode 100644 index 000000000..d8240424e --- /dev/null +++ b/rapids/bazel.screenrc @@ -0,0 +1,17 @@ +# +# For debugging, uncomment the following line +# + +screen -L -t monitor 0 /bin/bash + +screen -L -t 2.0-debian10 1 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-debian10 ; exec /bin/bash' +#screen -L -t 2.0-rocky9 2 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-rocky9 ; exec /bin/bash' +#screen -L -t 2.0-ubuntu18 3 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-ubuntu18 ; exec /bin/bash' + +#screen -L -t 2.1-debian10 4 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-debian10 ; exec /bin/bash' +#screen -L -t 2.1-rocky9 5 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-rocky9 ; exec /bin/bash' +#screen -L -t 2.1-ubuntu18 6 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-ubuntu18 ; exec /bin/bash' + +#screen -L -t 2.2-debian10 7 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-debian10 ; exec /bin/bash' +#screen -L -t 2.2-rocky9 8 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-rocky9 ; exec /bin/bash' +#screen -L -t 2.2-ubuntu18 9 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-ubuntu18 ; exec /bin/bash' diff --git a/rapids/env.json.sample b/rapids/env.json.sample new file mode 100644 index 000000000..00d9fd65c --- /dev/null +++ b/rapids/env.json.sample @@ -0,0 +1,7 @@ +{ + "PROJECT_ID":"example-yyyy-nn", + "PURPOSE":"cuda-pre-init", + "BUCKET":"my-bucket-name", + "IMAGE_VERSION":"2.2-debian12", + "ZONE":"us-west4-ñ" +} diff --git a/rapids/manual-test-runner.sh b/rapids/manual-test-runner.sh new file mode 100644 index 000000000..dcc78af81 --- /dev/null +++ b/rapids/manual-test-runner.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# This script sets up the gcloud environment and launches tests in a screen session +# +# To run the script, the following will bootstrap +# +# git clone git@github.com:GoogleCloudDataproc/initialization-actions +# cd initialization-actions +# docker build -f rapids/Dockerfile -t rapids-init-actions-runner:latest . && time docker run -it rapids-init-actions-runner:latest rapids/manual-test-runner.sh +# docker build -f rapids/Dockerfile -t rapids-init-actions-runner:latest . +# time docker run -it rapids-init-actions-runner:latest rapids/manual-test-runner.sh +# +# The bazel run(s) happen in separate screen windows. +# To see a list of screen windows, press ^a " +# Num Name +# +# 0 monitor +# 1 2.0-debian10 +# 2 sh + + +readonly timestamp="$(date +%F-%H-%M)" +export BUILD_ID="$(uuidgen)" + +tmp_dir="/tmp/${BUILD_ID}" +log_dir="${tmp_dir}/logs" +mkdir -p "${log_dir}" + +IMAGE_VERSION="$1" +if [[ -z "${IMAGE_VERSION}" ]] ; then + IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" ; fi ; export IMAGE_VERSION +export PROJECT_ID="$(jq -r .PROJECT_ID env.json)" +export REGION="$(jq -r .REGION env.json)" +export BUCKET="$(jq -r .BUCKET env.json)" + +gcs_log_dir="gs://${BUCKET}/${BUILD_ID}/logs" + +function exit_handler() { + RED='\\e[0;31m' + GREEN='\\e[0;32m' + NC='\\e[0m' + echo 'Cleaning up before exiting.' + + # TODO: list clusters which match our BUILD_ID and clean them up + # TODO: remove any test related resources in the project + + echo 'Uploading local logs to GCS bucket.' 
+ gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/" + + if [[ -f "${tmp_dir}/tests_success" ]]; then + echo -e "${GREEN}Workflow succeeded, check logs at ${log_dir}/ or ${gcs_log_dir}/${NC}" + exit 0 + else + echo -e "${RED}Workflow failed, check logs at ${log_dir}/ or ${gcs_log_dir}/${NC}" + exit 1 + fi +} + +trap exit_handler EXIT + +# screen session name +session_name="manual-rapids-tests" + +gcloud config set project ${PROJECT_ID} +gcloud config set dataproc/region ${REGION} +gcloud auth login +gcloud config set compute/region ${REGION} + +export INTERNAL_IP_SSH="true" + +# Run tests in screen session so we can monitor the container in another window +screen -US "${session_name}" -c rapids/bazel.screenrc + + + diff --git a/rapids/rapids.sh b/rapids/rapids.sh index aa5000a64..5f0e8d711 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -496,7 +496,7 @@ function main() { if [[ "${DASK_RUNTIME}" == "yarn" ]]; then # Create Dask YARN config file configure_dask_yarn - elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then + else # Create Dask service install_systemd_dask_service @@ -517,9 +517,6 @@ function main() { if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then configure_fluentd_for_dask fi - else - echo "Unsupported Dask Runtime: ${DASK_RUNTIME}" - exit 1 fi echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized." diff --git a/rapids/run-bazel-tests.sh b/rapids/run-bazel-tests.sh new file mode 100644 index 000000000..4c2ca20a6 --- /dev/null +++ b/rapids/run-bazel-tests.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Run from root directory of initialization-actions checkout + +IMAGE="rapids-actions-image:$BUILD_ID" +max_parallel_tests=10 + +IMAGE_VERSION="$1" +if [[ -z "${IMAGE_VERSION}" ]] ; then + IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" ; fi ; export IMAGE_VERSION + +#declare -a TESTS_TO_RUN=('dask:test_dask' 'rapids:test_rapids') +#declare -a TESTS_TO_RUN=('dask:test_dask') +declare -a TESTS_TO_RUN=('rapids:test_rapids') + +time bazel test \ + --jobs="${max_parallel_tests}" \ + --local_test_jobs="${max_parallel_tests}" \ + --flaky_test_attempts=3 \ + --action_env="INTERNAL_IP_SSH=true" \ + --test_output="errors" \ + --test_arg="--image_version=${IMAGE_VERSION}" \ + "${TESTS_TO_RUN[@]}" diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 05a87a4da..2cab46e06 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -58,22 +58,20 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, configuration, self.INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-4", + machine_type="n1-standard-8", master_accelerator=accelerator, worker_accelerator=accelerator, - boot_disk_size="60GB", + boot_disk_size="50GB", timeout_in_minutes=60) for machine_suffix in machine_suffixes: + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) if dask_runtime == 'standalone' or dask_runtime == None: - self.verify_dask_worker_service("{}-{}".format(self.getClusterName(), - machine_suffix)) + self.verify_dask_worker_service(machine_name) elif dask_runtime == 'yarn': - self.verify_dask_config("{}-{}".format(self.getClusterName(), - machine_suffix)) + self.verify_dask_config(machine_name) - self.run_dask_script("{}-{}".format(self.getClusterName(), - machine_suffix)) + self.run_dask_script(machine_name) if __name__ == "__main__": From caf930760b3776d95f649f7066efc5c774dfbe5f Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 25 Oct 2024 03:53:10 -0700 Subject: [PATCH 76/95] dask-yarn tests do not work ; disabling until new release of dask-yarn is produced --- rapids/test_rapids.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 2cab46e06..6922c9353 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -44,9 +44,11 @@ def run_dask_script(self, name): self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME, name) - @parameterized.parameters(("STANDARD", ["m", "w-0"], GPU_T4, None), - ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"), - ("STANDARD", ["m"], GPU_T4, "standalone")) + # If a new version of dask-yarn is released, add this test back in. + # ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"), + # ("STANDARD", ["m", "w-0"], GPU_T4, None), + + @parameterized.parameters(("STANDARD", ["m"], GPU_T4, "standalone")) def test_rapids_dask(self, configuration, machine_suffixes, accelerator, dask_runtime): From 7fdda0cc662f040b540a3fb330a44a73686aa407 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 04:41:13 -0700 Subject: [PATCH 77/95] increase max idle time ; print the command to be run --- integration_tests/dataproc_test_case.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 149c4d5fa..936718498 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -180,11 +180,13 @@ def createCluster(self, if not FLAGS.skip_cleanup: args.append("--max-age=60m") - args.append("--max-idle=5m") + args.append("--max-idle=25m") cmd = "{} dataproc clusters create {} {}".format( "gcloud beta" if beta else "gcloud", self.name, " ".join(args)) + print("Running command: [{}]".format(cmd)) + _, stdout, _ = self.assert_command( cmd, timeout_in_minutes=timeout_in_minutes or DEFAULT_TIMEOUT) config = json.loads(stdout).get("config", {}) From dd12f026f2577d7210fb3f25724292b2350a12c8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 05:35:27 -0700 Subject: [PATCH 78/95] cleaned up comment positioning and content --- rapids/manual-test-runner.sh | 2 ++ rapids/test_rapids.py | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/rapids/manual-test-runner.sh b/rapids/manual-test-runner.sh index dcc78af81..8feeb2593 100644 --- a/rapids/manual-test-runner.sh +++ b/rapids/manual-test-runner.sh @@ -6,6 +6,8 @@ # # git clone git@github.com:GoogleCloudDataproc/initialization-actions # cd initialization-actions +# cp rapids/env.json.sample env.json +# vi env.json # docker build -f rapids/Dockerfile -t rapids-init-actions-runner:latest . && time docker run -it rapids-init-actions-runner:latest rapids/manual-test-runner.sh # docker build -f rapids/Dockerfile -t rapids-init-actions-runner:latest . # time docker run -it rapids-init-actions-runner:latest rapids/manual-test-runner.sh diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 6922c9353..46e23fe99 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -44,11 +44,13 @@ def run_dask_script(self, name): self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME, name) - # If a new version of dask-yarn is released, add this test back in. 
- # ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"), - # ("STANDARD", ["m", "w-0"], GPU_T4, None), - @parameterized.parameters(("STANDARD", ["m"], GPU_T4, "standalone")) + @parameterized.parameters( +# If a new version of dask-yarn is released, add this test back in. +# ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"), +# ("STANDARD", ["m"], GPU_T4, None), + ("STANDARD", ["m", "w-0"], GPU_T4, "standalone") + ) def test_rapids_dask(self, configuration, machine_suffixes, accelerator, dask_runtime): From 5cd395136535074b68f0bcbf170a41df3958c5ca Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 12:08:16 -0700 Subject: [PATCH 79/95] using ram disk for temp files if we have it --- rapids/rapids.sh | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 5f0e8d711..e762ff419 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -431,6 +431,7 @@ function install_dask_rapids() { local numba_spec="numba" fi + rapids_spec="rapids=${RAPIDS_VERSION}" CONDA_PACKAGES=() if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then # Pin `distributed` and `dask` package versions to old release @@ -440,6 +441,7 @@ function install_dask_rapids() { dask_spec="dask<2022.2" python_spec="python>=3.7,<3.8.0a0" + rapids_spec="rapids<=24.05" if is_ubuntu18 ; then # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic CONDA_PACKAGES+=("fiona<1.8.22") @@ -449,7 +451,7 @@ function install_dask_rapids() { CONDA_PACKAGES+=( "${cuda_spec}" - "rapids=${RAPIDS_VERSION}" + "${rapids_spec}" "${dask_spec}" "dask-bigquery" "dask-ml" @@ -471,7 +473,8 @@ function install_dask_rapids() { time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ ${CONDA_PACKAGES[*]} \ - "${python_spec}" + "${python_spec}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } local retval=$? 
sync if [[ "$retval" == "0" ]] ; then @@ -567,7 +570,7 @@ function exit_handler() ( fi # Process disk usage logs from installation period - rm -f /tmp/keep-running-df + rm -f "${tmpdir}/keep-running-df" sleep 6s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): @@ -577,7 +580,7 @@ function exit_handler() ( perl -e '$max=( sort map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } )[-1]; -print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log +print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log" echo "exit_handler has completed" @@ -589,8 +592,6 @@ print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log return 0 ) -trap exit_handler EXIT - function prepare_to_install(){ readonly DEFAULT_CUDA_VERSION="12.4" CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) @@ -623,6 +624,7 @@ function prepare_to_install(){ free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk if [[ ${free_mem} -ge 5250000 ]]; then + tmpdir=/mnt/shm mkdir -p /mnt/shm mount -t tmpfs tmpfs /mnt/shm @@ -639,7 +641,11 @@ function prepare_to_install(){ else mount -t tmpfs tmpfs /var/cache/dnf fi + else + tmpdir=/tmp fi + install_log="${tmpdir}/install.log" + trap exit_handler EXIT # Monitor disk usage in a screen session if is_debuntu ; then @@ -647,10 +653,10 @@ function prepare_to_install(){ elif is_rocky ; then dnf -y -q install screen fi - rm -f /tmp/disk-usage.log - touch /tmp/keep-running-df + rm -f "${tmpdir}/disk-usage.log" + touch "${tmpdir}/keep-running-df" screen -d -m -US keep-running-df \ - bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df --si / | tee -a /tmp/disk-usage.log ; sleep 5s ; done' + bash -c 'while [[ -f ${tmpdir}/keep-running-df ]] ; do df --si / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done' } prepare_to_install From 3519fe0bafdf19c5239cfc99818aa59de5f537b5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 13:12:24 -0700 Subject: [PATCH 80/95] double quotes will allow temp directory variable to be expanded correctly --- rapids/rapids.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index e762ff419..8f7950e3e 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -576,7 +576,7 @@ function exit_handler() ( # Log file contains logs like the following (minus the preceeding #): #Filesystem Size Used Avail Use% Mounted on #/dev/vda2 6.8G 2.5G 4.0G 39% / - df --si + df -h | tee -a "${tmpdir}/disk-usage.log" perl -e '$max=( sort map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } )[-1]; @@ -653,10 +653,10 @@ function prepare_to_install(){ elif is_rocky ; then dnf -y -q install screen fi - rm -f "${tmpdir}/disk-usage.log" + df -h | tee "${tmpdir}/disk-usage.log" touch "${tmpdir}/keep-running-df" screen -d -m -US keep-running-df \ - bash -c 'while [[ -f ${tmpdir}/keep-running-df ]] ; do df --si / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done' + bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done" } prepare_to_install From 12e253d7b473b01829ccccd1849af8acd5708d0b Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 25 Oct 2024 13:16:58 -0700 Subject: [PATCH 81/95] using else instead of is_rocky --- rapids/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 8f7950e3e..df282f9af 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -650,7 +650,7 @@ function prepare_to_install(){ # Monitor disk usage in a screen session if is_debuntu ; then apt-get install -y -qq screen - elif is_rocky ; then + else dnf -y -q install screen fi df -h | tee "${tmpdir}/disk-usage.log" From e8a44fe6e91338c7ee2b017f5f385aecd4d84de8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 13:18:15 -0700 Subject: [PATCH 82/95] corrected release version names --- rapids/bazel.screenrc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/rapids/bazel.screenrc b/rapids/bazel.screenrc index d8240424e..05d4cb788 100644 --- a/rapids/bazel.screenrc +++ b/rapids/bazel.screenrc @@ -2,16 +2,16 @@ # For debugging, uncomment the following line # -screen -L -t monitor 0 /bin/bash +# screen -L -t monitor 0 /bin/bash screen -L -t 2.0-debian10 1 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-debian10 ; exec /bin/bash' -#screen -L -t 2.0-rocky9 2 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-rocky9 ; exec /bin/bash' +#screen -L -t 2.0-rocky8 2 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-rocky8 ; exec /bin/bash' #screen -L -t 2.0-ubuntu18 3 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.0-ubuntu18 ; exec /bin/bash' -#screen -L -t 2.1-debian10 4 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-debian10 ; exec /bin/bash' -#screen -L -t 2.1-rocky9 5 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-rocky9 ; exec /bin/bash' -#screen -L -t 2.1-ubuntu18 6 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-ubuntu18 ; exec /bin/bash' +#screen -L -t 2.1-debian11 4 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-debian11 ; exec /bin/bash' +#screen -L -t 2.1-rocky8 5 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-rocky8 ; exec /bin/bash' +#screen -L -t 2.1-ubuntu20 6 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.1-ubuntu20 ; exec /bin/bash' -#screen -L -t 2.2-debian10 7 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-debian10 ; exec /bin/bash' +#screen -L -t 2.2-debian12 7 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-debian12 ; exec /bin/bash' #screen -L -t 2.2-rocky9 8 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-rocky9 ; exec /bin/bash' -#screen -L -t 2.2-ubuntu18 9 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-ubuntu18 ; exec /bin/bash' +#screen -L -t 2.2-ubuntu22 9 sh -c '/bin/bash -x rapids/run-bazel-tests.sh 2.2-ubuntu22 ; exec /bin/bash' From caab9bedb616e3706fed555c78e78ae3a81a9d16 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 25 Oct 2024 13:56:08 -0700 Subject: [PATCH 83/95] revert to mainline --- cloudbuild/cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild/cloudbuild.yaml b/cloudbuild/cloudbuild.yaml index b590f4d9f..b06df98ea 100644 --- a/cloudbuild/cloudbuild.yaml +++ b/cloudbuild/cloudbuild.yaml @@ -80,7 +80,7 @@ steps: id: 'dataproc-2.2-debian12-tests' waitFor: ['gcr-push'] entrypoint: 'bash' - args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '2.2.20-debian12'] + args: ['cloudbuild/run-presubmit-on-k8s.sh', 'gcr.io/$PROJECT_ID/init-actions-image:$BUILD_ID', '$BUILD_ID', '2.2-debian12'] env: - 'COMMIT_SHA=$COMMIT_SHA' - 'CLOUDSDK_COMPUTE_REGION=us-central1' From 6d900bfcbb1d6817240180a9d6f1c195e1f78617 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 13:58:06 -0700 Subject: [PATCH 84/95] simplify and modernize this comment --- cloudbuild/presubmit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index 51ab7023f..be06ea5a6 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -70,7 +70,7 @@ determine_tests_to_run() { changed_dir="${changed_dir%%/*}/" # Run all tests if common directories modified if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then - continue # remove this and merge all changes to integration_tests/ and cloudbuild/ into prince's branch before squash/merge + continue # remove this before squash/merge echo "All tests will be run: '${changed_dir}' was changed" TESTS_TO_RUN=(":DataprocInitActionsTestSuite") return 0 From 13cb7236b5cf7f280c3f4ce4a3bd983df0e7c709 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 14:00:12 -0700 Subject: [PATCH 85/95] default to using internal IP ; have not yet renamed rapids to dask-rapids ; tunnel through iap --- integration_tests/dataproc_test_case.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 936718498..266b347b9 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -21,7 +21,7 @@ flags.DEFINE_boolean('skip_cleanup', False, 'Skip cleanup of test resources') FLAGS(sys.argv) -INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "false").lower() == "true" +INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "true").lower() == "true" DEFAULT_TIMEOUT = 15 # minutes @@ -123,7 +123,7 @@ def createCluster(self, for i in init_actions: if "install_gpu_driver.sh" in i or "horovod.sh" in i or \ - "dask-rapids.sh" in i or "mlvm.sh" in i or \ + "rapids.sh" in i or "mlvm.sh" in i or \ "spark-rapids.sh" in i: args.append("--no-shielded-secure-boot") @@ -356,10 +356,10 @@ def random_str(size=4, chars=string.ascii_lowercase + string.digits): @staticmethod def run_command(cmd, timeout_in_minutes=DEFAULT_TIMEOUT): cmd = cmd.replace( - "gcloud compute ssh ", "gcloud compute ssh --internal-ip ") if ( + "gcloud compute ssh ", "gcloud compute ssh --tunnel-through-iap ") if ( INTERNAL_IP_SSH and "gcloud compute ssh " in cmd) else cmd cmd = cmd.replace("gcloud compute scp ", - "gcloud beta compute scp --internal-ip ") if ( + "gcloud beta compute scp --tunnel-through-iap ") if ( INTERNAL_IP_SSH and "gcloud compute scp " in cmd) else cmd p = subprocess.Popen( From aec628d3ed3c951ef0d41efa78f1ecf17e111f7e Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 25 Oct 2024 14:03:10 -0700 Subject: [PATCH 86/95] prepare layout for rename of rapids to dask-rapids --- integration_tests/dataproc_test_case.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 266b347b9..b21e9b971 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -122,9 +122,9 @@ def createCluster(self, args.append("--public-ip-address") for i in init_actions: - if "install_gpu_driver.sh" in i or "horovod.sh" in i or \ - "rapids.sh" in i or "mlvm.sh" in i or \ - "spark-rapids.sh" in i: + if "install_gpu_driver.sh" in i or \ + "mlvm.sh" in i or "rapids.sh" in i or \ + "horovod.sh" in i or "spark-rapids.sh" in i: args.append("--no-shielded-secure-boot") if optional_components: From 8c67d21a1dd76eecc2067e6cf5158fe17824118e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 15:02:20 -0700 Subject: [PATCH 87/95] reduce noise from docker run --- cloudbuild/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index 7491cf7d4..94e6e6cb3 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -10,7 +10,7 @@ COPY --chown=ia-tests:ia-tests . /init-actions # Install Bazel: # https://docs.bazel.build/versions/master/install-ubuntu.html ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg -RUN apt-get install -y -qq curl && \ +RUN apt-get install -y -qq curl >/dev/null 2>&1 && \ apt-get clean RUN /usr/bin/curl https://bazel.build/bazel-release.pub.gpg | \ gpg --dearmor -o "${bazel_kr_path}" @@ -18,7 +18,7 @@ RUN echo "deb [arch=amd64 signed-by=${bazel_kr_path}] http://storage.googleapis. dd of=/etc/apt/sources.list.d/bazel.list status=none && \ apt-get update -qq RUN apt-get autoremove -y -qq && \ - apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel && \ + apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \ apt-get clean USER ia-tests From a31f10cddee6f4465694a1572557bebe11f98a48 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 25 Oct 2024 15:05:14 -0700 Subject: [PATCH 88/95] reduce noise in docker build --- rapids/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rapids/Dockerfile b/rapids/Dockerfile index 53d91731b..ad2c89086 100644 --- a/rapids/Dockerfile +++ b/rapids/Dockerfile @@ -11,7 +11,7 @@ RUN apt-get -qq update \ && apt-get -y -qq install \ apt-transport-https apt-utils \ ca-certificates libmime-base64-perl gnupg \ - curl jq less screen + curl jq less screen > /dev/null 2>&1 && apt-get clean # Install bazel signing key, repo and package ENV bazel_kr_path=/usr/share/keyrings/bazel-release.pub.gpg @@ -24,12 +24,12 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \ && apt-get update -qq RUN apt-get autoremove -y -qq && \ - apt-get install -y -qq default-jdk python3-setuptools bazel && \ + apt-get install -y -qq default-jdk python3-setuptools bazel > /dev/null 2>&1 && \ apt-get clean # Install here any utilities you find useful when troubleshooting -RUN apt-get -y -qq install emacs-nox vim uuid-runtime && apt-get clean +RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-get clean WORKDIR /init-actions From a6fa424b868909df16c22fd567f7731c88f28ec2 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier"
Date: Fri, 25 Oct 2024 15:05:34 -0700
Subject: [PATCH 89/95] removing older GPU from list

---
 rapids/test_rapids.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py
index 46e23fe99..63fa72a7f 100644
--- a/rapids/test_rapids.py
+++ b/rapids/test_rapids.py
@@ -13,10 +13,9 @@ class RapidsTestCase(DataprocTestCase):
         "gpu/install_gpu_driver.sh", "rapids/rapids.sh"
     ]
 
-    GPU_P100 = "type=nvidia-tesla-p100"
     GPU_A100 = "type=nvidia-tesla-a100,count=2"
     GPU_H100 = "type=nvidia-h100-80gb,count=2"
-    GPU_T4 = "type=nvidia-tesla-t4"
+    GPU_T4 = "type=nvidia-tesla-t4"
 
     # Tests for RAPIDS init action
     DASK_RAPIDS_TEST_SCRIPT_FILE_NAME = "verify_rapids_dask.py"

From e5b6e3f454b3934e4111e6541c5b5a56d6fc9484 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Fri, 25 Oct 2024 15:09:57 -0700
Subject: [PATCH 90/95] removing delta from master

---
 dask/dask.sh | 61 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 51 insertions(+), 10 deletions(-)

diff --git a/dask/dask.sh b/dask/dask.sh
index f492f27f6..946608d9e 100644
--- a/dask/dask.sh
+++ b/dask/dask.sh
@@ -27,10 +27,49 @@ function os_codename() { grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2
 function is_ubuntu()   { [[ "$(os_id)" == 'ubuntu' ]] ; }
 function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; }
 
+function print_metadata_value() {
+  local readonly tmpfile=$(mktemp)
+  http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
+    -s -o ${tmpfile} 2>/dev/null)
+  local readonly return_code=$?
+  # If the command completed successfully, print the metadata value to stdout.
+  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then
+    cat ${tmpfile}
+  fi
+  rm -f ${tmpfile}
+  return ${return_code}
+}
+
+function print_metadata_value_if_exists() {
+  local return_code=1
+  local readonly url=$1
+  print_metadata_value ${url}
+  return_code=$?
+  return ${return_code}
+}
+
+function get_metadata_value() {
+  set +x
+  local readonly varname=$1
+  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
+  # Print the instance metadata value.
+  print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname}
+  return_code=$?
+  # If the instance doesn't have the value, try the project.
+  if [[ ${return_code} != 0 ]]; then
+    print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname}
+    return_code=$?
+  fi
+  set -x
+  return ${return_code}
+}
+
 function get_metadata_attribute() {
-  local -r attribute_name=$1
+  set +x
+  local -r attribute_name="$1"
   local -r default_value="${2:-}"
-  /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
+  get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
+  set -x
 }
 
 readonly DEFAULT_CUDA_VERSION="12.4"
@@ -38,7 +77,7 @@ readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VER
 function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; }
 function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; }
 
-readonly DASK_RUNTIME="$(/usr/share/google/get_metadata_value attributes/dask-runtime || echo 'standalone')"
+readonly DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')"
 
 # Dask 'standalone' config
 readonly DASK_SERVICE=dask-cluster
@@ -87,8 +126,8 @@ EOF
 }
 
 enable_worker_service="0"
-ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
-MASTER="$(/usr/share/google/get_metadata_value attributes/dataproc-master)"
+ROLE="$(get_metadata_attribute dataproc-role)"
+MASTER="$(get_metadata_attribute dataproc-master)"
 function install_systemd_dask_worker() {
   echo "Installing systemd Dask Worker service..."
   local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}"
@@ -125,9 +164,9 @@ EOF
   if [[ "${ROLE}" != "Master" ]]; then
     enable_worker_service="1"
   else
-    local RUN_WORKER_ON_MASTER="$(/usr/share/google/get_metadata_value attributes/dask-worker-on-master || echo 'true')"
+    local RUN_WORKER_ON_MASTER="$(get_metadata_attribute dask-worker-on-master || echo 'true')"
     # Enable service on single-node cluster (no workers)
-    local worker_count="$(/usr/share/google/get_metadata_value attributes/dataproc-worker-count)"
+    local worker_count="$(get_metadata_attribute dataproc-worker-count)"
     if [[ "${worker_count}" == "0" ]]; then RUN_WORKER_ON_MASTER='true'; fi
 
     if [[ "${RUN_WORKER_ON_MASTER}" == "true" ]]; then
@@ -431,8 +470,8 @@ function install_dask() {
 
   # Install dask
   local is_installed="0"
-  mamba="/opt/conda/default/bin/mamba"
-  conda="/opt/conda/default/bin/conda"
+  mamba="/opt/conda/miniconda3/bin/mamba"
+  conda="/opt/conda/miniconda3/bin/conda"
 
   set +e
   for installer in "${mamba}" "${conda}" ; do
@@ -478,7 +517,7 @@ function main() {
 
   configure_knox_for_dask
 
-  local DASK_CLOUD_LOGGING="$(/usr/share/google/get_metadata_value attributes/dask-cloud-logging || echo 'false')"
+  local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"
   if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
     configure_fluentd_for_dask
   fi
@@ -492,3 +531,5 @@ function main() {
 }
 
 main
+
+df -h

From 5b93e3a3b568d11342bee8a17ba5d984967df466 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Fri, 25 Oct 2024 15:43:42 -0700
Subject: [PATCH 91/95] Thread.yield()

---
 cloudbuild/run-presubmit-on-k8s.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cloudbuild/run-presubmit-on-k8s.sh b/cloudbuild/run-presubmit-on-k8s.sh
index 810213832..c573fd9a7 100644
--- a/cloudbuild/run-presubmit-on-k8s.sh
+++ b/cloudbuild/run-presubmit-on-k8s.sh
@@ -12,6 +12,8 @@ gcloud container clusters get-credentials "${CLOUDSDK_CONTAINER_CLUSTER}"
 
 LOGS_SINCE_TIME=$(date --iso-8601=seconds)
 
+# This kubectl sometimes fails because services have not caught up. Thread.yield()
+sleep 10s
 kubectl run "${POD_NAME}" \
   --image="${IMAGE}" \
   --restart=Never \

From 38ba6e3520d0a9ac49d40c0d4d474d7bc6fb13a3 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Fri, 25 Oct 2024 15:44:20 -0700
Subject: [PATCH 92/95] improved documentation

---
 rapids/manual-test-runner.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rapids/manual-test-runner.sh b/rapids/manual-test-runner.sh
index 8feeb2593..371917a40 100644
--- a/rapids/manual-test-runner.sh
+++ b/rapids/manual-test-runner.sh
@@ -5,10 +5,10 @@ #
 # To run the script, the following will bootstrap
 #
 # git clone git@github.com:GoogleCloudDataproc/initialization-actions
+# git checkout rapids-20240806
 # cd initialization-actions
 # cp rapids/env.json.sample env.json
 # vi env.json
-# docker build -f rapids/Dockerfile -t rapids-init-actions-runner:latest . && time docker run -it rapids-init-actions-runner:latest rapids/manual-test-runner.sh
 # docker build -f rapids/Dockerfile -t rapids-init-actions-runner:latest .
 # time docker run -it rapids-init-actions-runner:latest rapids/manual-test-runner.sh
 #

From 91907aeb0783d1f99b81cc04dcf1839c8e9ebd60 Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Fri, 25 Oct 2024 16:59:37 -0700
Subject: [PATCH 93/95] default to non-private ip ; maybe that is why this last run failed

---
 integration_tests/dataproc_test_case.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index b21e9b971..e5d31140d 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -21,7 +21,7 @@
 flags.DEFINE_boolean('skip_cleanup', False, 'Skip cleanup of test resources')
 FLAGS(sys.argv)
 
-INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "true").lower() == "true"
+INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "false").lower() == "true"
 
 DEFAULT_TIMEOUT = 15 # minutes
 

From 6d8c32b7ac92047cfe4b3900bdb05ad5285c0fcf Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Fri, 25 Oct 2024 17:56:01 -0700
Subject: [PATCH 94/95] revert dataproc_test_case.py to last known good

---
 integration_tests/dataproc_test_case.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index e5d31140d..936718498 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -122,9 +122,9 @@ def createCluster(self,
             args.append("--public-ip-address")
 
         for i in init_actions:
-            if "install_gpu_driver.sh" in i or \
-                "mlvm.sh" in i or "rapids.sh" in i or \
-                "horovod.sh" in i or "spark-rapids.sh" in i:
+            if "install_gpu_driver.sh" in i or "horovod.sh" in i or \
+                "dask-rapids.sh" in i or "mlvm.sh" in i or \
+                "spark-rapids.sh" in i:
                 args.append("--no-shielded-secure-boot")
 
         if optional_components:
@@ -356,10 +356,10 @@ def random_str(size=4, chars=string.ascii_lowercase + string.digits):
     @staticmethod
     def run_command(cmd, timeout_in_minutes=DEFAULT_TIMEOUT):
         cmd = cmd.replace(
-            "gcloud compute ssh ", "gcloud compute ssh --tunnel-through-iap ") if (
+            "gcloud compute ssh ", "gcloud compute ssh --internal-ip ") if (
                 INTERNAL_IP_SSH and "gcloud compute ssh " in cmd) else cmd
         cmd = cmd.replace("gcloud compute scp ",
-                          "gcloud beta compute scp --tunnel-through-iap ") if (
+                          "gcloud beta compute scp --internal-ip ") if (
                               INTERNAL_IP_SSH and "gcloud compute scp " in cmd) else cmd
 
         p = subprocess.Popen(

From 7c8ce5745114e919c2930692e3d45e4d12b67a6e Mon Sep 17 00:00:00 2001
From: "C.J. Collier"
Date: Fri, 25 Oct 2024 22:08:02 -0700
Subject: [PATCH 95/95] using correct df command ; using greater or equal to
 rapids version ; dask>=2024.7 ; correctly capturing retval of installer
 program

---
 rapids/rapids.sh | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/rapids/rapids.sh b/rapids/rapids.sh
index df282f9af..6c5c9d411 100644
--- a/rapids/rapids.sh
+++ b/rapids/rapids.sh
@@ -422,7 +422,7 @@ function install_dask_rapids() {
   if is_cuda12 ; then
     local python_spec="python>=3.11"
     local cuda_spec="cuda-version>=12,<13"
-    local dask_spec="dask>=2024.5"
+    local dask_spec="dask>=2024.7"
     local numba_spec="numba"
   elif is_cuda11 ; then
     local python_spec="python>=3.9"
@@ -431,7 +431,7 @@ function install_dask_rapids() {
     local numba_spec="numba"
   fi
 
-  rapids_spec="rapids=${RAPIDS_VERSION}"
+  rapids_spec="rapids>=${RAPIDS_VERSION}"
   CONDA_PACKAGES=()
   if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
     # Pin `distributed` and `dask` package versions to old release
@@ -461,21 +461,20 @@ function install_dask_rapids() {
   )
 
   # Install cuda, rapids, dask
-  local is_installed="0"
  mamba="/opt/conda/miniconda3/bin/mamba"
  conda="/opt/conda/miniconda3/bin/conda"
 
  "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]"
 
  (
  set +e
+  local is_installed="0"
  for installer in "${mamba}" "${conda}" ; do
    test -d "${DASK_CONDA_ENV}" || \
      time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \
      -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \
      ${CONDA_PACKAGES[*]} \
      "${python_spec}" \
      > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
    sync
    if [[ "$retval" == "0" ]] ; then
      is_installed="1"
@@ -576,7 +575,7 @@ function exit_handler() (
   # Log file contains logs like the following (minus the preceeding #):
   #Filesystem      Size  Used Avail Use% Mounted on
   #/dev/vda2       6.8G  2.5G  4.0G  39% /
-  df -h | tee -a "${tmpdir}/disk-usage.log"
+  df -h / | tee -a "${tmpdir}/disk-usage.log"
 
   perl -e '$max=( sort
                   map { (split)[2] =~ /^(\d+)/ }
                   grep { m:^/: } )[-1];
@@ -653,7 +652,7 @@ function prepare_to_install(){
   else
     dnf -y -q install screen
   fi
-  df -h | tee "${tmpdir}/disk-usage.log"
+  df -h / | tee "${tmpdir}/disk-usage.log"
   touch "${tmpdir}/keep-running-df"
   screen -d -m -US keep-running-df \
     bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done"
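
A note on [PATCH 90/95]: the helpers added to dask/dask.sh replace calls to /usr/share/google/get_metadata_value with direct queries of the GCE metadata server, trying the instance attribute first and the project attribute second. The sketch below condenses that lookup order into two small functions; it is illustrative only (function names, error handling, and the example attribute are simplifications, not the code the patch installs).

#!/usr/bin/env bash
# Condensed sketch of the instance-then-project metadata lookup from PATCH 90/95.
MDS="http://metadata.google.internal/computeMetadata/v1"

fetch_md() {
  # -f makes curl return non-zero on a 404, so a missing key falls through.
  curl -fs -H "Metadata-Flavor: Google" "${MDS}/${1}"
}

get_attr() {
  local name="$1" default="${2:-}"
  # Instance attribute first, then project attribute, then the caller's default.
  fetch_md "instance/attributes/${name}" \
    || fetch_md "project/attributes/${name}" \
    || echo -n "${default}"
}

# Mirrors how dask.sh resolves its runtime after the patch:
DASK_RUNTIME="$(get_attr dask-runtime standalone)"
echo "${DASK_RUNTIME}"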
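A note on [PATCH 91/95]: the added comment observes that the first kubectl call can fail while the newly provisioned cluster's services are still catching up, and the patch waits that window out with a fixed "sleep 10s". A bounded retry loop is another way to handle the same race; the helper below is only a sketch of that idea under the assumption that a cheap read such as "kubectl get nodes" is an acceptable readiness probe, and it is not something the presubmit script contains.

#!/usr/bin/env bash
# Sketch: poll until the cluster API answers instead of sleeping a fixed 10s.
retry() {
  local attempts="$1"; shift
  local delay=5
  for ((i = 1; i <= attempts; i++)); do
    "$@" && return 0
    echo "attempt ${i}/${attempts} failed; retrying in ${delay}s" >&2
    sleep "${delay}"
  done
  return 1
}

# Cheap readiness probe before the real "kubectl run" is attempted.
retry 6 kubectl get nodes --request-timeout=10s > /dev/null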
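A note on the "correctly capturing retval of installer program" part of [PATCH 95/95]: inside the "( set +e ... )" subshell the script needs the installer's exit status even when the command fails, so it can fall back from mamba to conda instead of aborting. The "cmd && retval=$? || { retval=$? ; ... }" form does exactly that: the "&&" branch runs only on success, the "||" branch only on failure, and in both branches "$?" still holds the installer's status. A minimal, self-contained illustration follows; the installer names and package spec are placeholders, not the init action's actual invocation.

#!/usr/bin/env bash
# Minimal illustration of the status-capture idiom adopted in PATCH 95/95.
install_log="$(mktemp)"

try_installers() {
  local retval installer
  for installer in mamba conda ; do
    command -v "${installer}" > /dev/null || continue
    # retval gets the installer's exit code whether it succeeds or fails;
    # on failure the log is shown and the loop moves on to the next installer.
    "${installer}" create -y -n demo-env 'python>=3.11' \
      > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
    if [[ "${retval}" == "0" ]]; then
      echo "installed with ${installer}"
      return 0
    fi
  done
  return 1
}

try_installers || echo "no installer succeeded" >&2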