From 17114e3aa4bb0dc2a51da80d1bd34a4da2535e7f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 29 Oct 2024 17:19:27 -0700 Subject: [PATCH 01/21] [secure-boot] merge changes from init-actions ; include metadata when performing compute instance create --- custom_image_utils/args_parser.py | 5 +- custom_image_utils/shell_script_generator.py | 14 +- examples/secure-boot/install_gpu_driver.sh | 195 +++++++++---------- examples/secure-boot/pre-init.sh | 22 +-- examples/secure-boot/rapids.sh | 11 +- 5 files changed, 120 insertions(+), 127 deletions(-) diff --git a/custom_image_utils/args_parser.py b/custom_image_utils/args_parser.py index 95bf9a1..2b0542d 100644 --- a/custom_image_utils/args_parser.py +++ b/custom_image_utils/args_parser.py @@ -228,8 +228,7 @@ def parse_args(args): type=str, required=False, default="tls/db.der", - help="""(Optional) Inserts the specified DER-format certificate into - the custom image's EFI boot sector for use with secure boot.""") - + help="""(Optional) Pass an empty string to this argument to + disable support for shielded-secure-boot.""") return parser.parse_args(args) diff --git a/custom_image_utils/shell_script_generator.py b/custom_image_utils/shell_script_generator.py index 89730c3..9aa4e4e 100644 --- a/custom_image_utils/shell_script_generator.py +++ b/custom_image_utils/shell_script_generator.py @@ -44,11 +44,11 @@ function exit_handler() {{ echo 'Cleaning up before exiting.' - if [[ -f /tmp/{run_id}/vm_created ]]; then + if [[ -f /tmp/{run_id}/vm_created ]]; then ( set +e echo 'Deleting VM instance.' execute_with_retries gcloud compute instances delete {image_name}-install \ --project={project_id} --zone={zone} -q - elif [[ -f /tmp/{run_id}/disk_created ]]; then + ) elif [[ -f /tmp/{run_id}/disk_created ]]; then echo 'Deleting disk.' 
execute_with_retries gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} --zone={zone} -q fi @@ -111,11 +111,13 @@ local cert_args="" local num_src_certs="0" + metadata_arg="{metadata_flag}" if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then # build tls/ directory from variables defined near the header of # the examples/secure-boot/create-key-pair.sh file eval "$(bash examples/secure-boot/create-key-pair.sh)" + metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}" # by default, a gcloud secret with the name of efi-db-pub-key-042 is # created in the current project to store the certificate installed @@ -137,10 +139,12 @@ mapfile -t src_img_modulus_md5sums < <(print_img_dbs_modulus_md5sums {dataproc_base_image}) num_src_certs="${{#src_img_modulus_md5sums[@]}}" + echo "debug - num_src_certs: [${{#src_img_modulus_md5sums[*]}}]" + echo "value of src_img_modulus_md5sums: [${{src_img_modulus_md5sums}}]" echo "${{num_src_certs}} db certificates attached to source image" - if [[ "${{num_src_certs}}" -eq "0" ]]; then + if [[ -z "${{src_img_modulus_md5sums}}" ]]; then echo "no db certificates in source image" - cert_list=default_cert_list + cert_list="${{default_cert_list}}" else echo "db certs exist in source image" for cert in ${{default_cert_list[*]}}; do @@ -209,7 +213,7 @@ {accelerator_flag} \ {service_account_flag} \ --scopes=cloud-platform \ - {metadata_flag} \ + "${{metadata_arg}}" \ --metadata-from-file startup-script=startup_script/run.sh ) touch /tmp/{run_id}/vm_created diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index c0129dc..6b9f492 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -16,7 +16,7 @@ set -euxo pipefail -function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) function is_rocky() ( set +x ; [[ "$(os_id)" == 'rocky' ]] ; ) @@ -120,21 +120,20 @@ readonly ROLE # CUDA version and Driver version # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html +# https://developer.nvidia.com/cuda-downloads readonly -A DRIVER_FOR_CUDA=( - [11.8]="525.147.05" [12.1]="530.30.02" [12.4]="550.54.14" - [12.5]="555.42.06" [12.6]="560.28.03" + [11.8]="525.147.05" [12.4]="550.54.14" [12.6]="560.35.03" ) +# https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( - [11.8]="8.6.0.163" [12.1]="8.9.0" [12.4]="9.1.0.70" - [12.5]="9.2.1.18" + [11.8]="9.5.1.17" [12.4]="9.5.1.17" [12.6]="9.5.1.17" ) +# https://developer.nvidia.com/nccl/nccl-download readonly -A NCCL_FOR_CUDA=( - [11.8]="2.15.5" [12.1]="2.17.1" [12.4]="2.21.5" - [12.5]="2.22.3" + [11.8]="2.15.5" [12.4]="2.23.4" [12.6]="2.23.4" ) readonly -A CUDA_SUBVER=( - [11.8]="11.8.0" [12.1]="12.1.0" [12.4]="12.4.1" - [12.5]="12.5.1" + [11.8]="11.8.0" [12.4]="12.4.1" [12.6]="12.6.2" ) RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') @@ -216,6 +215,7 @@ readonly -A DEFAULT_NVIDIA_CUDA_URLS=( [11.8]="${NVIDIA_BASE_DL_URL}/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run" 
[12.1]="${NVIDIA_BASE_DL_URL}/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run" [12.4]="${NVIDIA_BASE_DL_URL}/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run" + [12.6]="${NVIDIA_BASE_DL_URL}/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run" ) readonly DEFAULT_NVIDIA_CUDA_URL=${DEFAULT_NVIDIA_CUDA_URLS["${CUDA_VERSION}"]} NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") @@ -233,9 +233,9 @@ if ( compare_versions_lte "8.3.1.22" "${CUDNN_VERSION}" ); then fi CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" fi -if ( compare_versions_lte "12.0" "${CUDA_VERSION}" ); then - # When cuda version is greater than 12.0 - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.2.0.82_cuda12-archive.tar.xz" +if is_cuda12 ; then + # When cuda version is 12 + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.5.1.17_cuda12-archive.tar.xz" fi readonly CUDNN_TARBALL readonly CUDNN_TARBALL_URL @@ -267,7 +267,8 @@ function execute_with_retries() ( cmd="apt-get -y clean && $cmd" fi for ((i = 0; i < 3; i++)); do - if eval "$cmd" ; then return 0 ; fi + time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 @@ -279,9 +280,9 @@ function install_cuda_keyring_pkg() { local kr_ver=1.1 curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ - -o "${download_dir}/cuda-keyring.deb" - dpkg -i "${download_dir}/cuda-keyring.deb" - rm -f "${download_dir}/cuda-keyring.deb" + -o "${tmpdir}/cuda-keyring.deb" + dpkg -i "${tmpdir}/cuda-keyring.deb" + rm -f "${tmpdir}/cuda-keyring.deb" CUDA_KEYRING_PKG_INSTALLED="1" } @@ -301,10 +302,10 @@ function install_local_cuda_repo() { readonly DIST_KEYRING_DIR="/var/${pkgname}" curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${LOCAL_DEB_URL}" -o "${download_dir}/${LOCAL_INSTALLER_DEB}" + "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" - dpkg -i "${download_dir}/${LOCAL_INSTALLER_DEB}" - rm "${download_dir}/${LOCAL_INSTALLER_DEB}" + dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" + rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then @@ -329,11 +330,11 @@ function install_local_cudnn_repo() { # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${local_deb_url}" -o "${download_dir}/local-installer.deb" + "${local_deb_url}" -o "${tmpdir}/local-installer.deb" - dpkg -i "${download_dir}/local-installer.deb" + dpkg -i "${tmpdir}/local-installer.deb" - rm -f "${download_dir}/local-installer.deb" + rm -f "${tmpdir}/local-installer.deb" cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings @@ -361,7 +362,7 @@ function install_local_cudnn8_repo() { CUDNN8_PKG_NAME="${pkgname}" deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_fn="${download_dir}/${deb_fn}" + local_deb_fn="${tmpdir}/${deb_fn}" local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ "${local_deb_url}" -o "${local_deb_fn}" @@ -383,10 +384,9 @@ function install_nvidia_nccl() { local -r 
nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" if is_rocky ; then - time execute_with_retries \ + execute_with_retries \ dnf -y -q install \ - "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" sync elif is_ubuntu ; then install_cuda_keyring_pkg @@ -394,16 +394,14 @@ function install_nvidia_nccl() { apt-get update -qq if is_ubuntu18 ; then - time execute_with_retries \ + execute_with_retries \ apt-get install -q -y \ - libnccl2 libnccl-dev \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + libnccl2 libnccl-dev sync else - time execute_with_retries \ + execute_with_retries \ apt-get install -q -y \ - "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" sync fi else @@ -427,16 +425,14 @@ function install_nvidia_cudnn() { if is_rocky ; then if is_cudnn8 ; then - time execute_with_retries dnf -y -q install \ + execute_with_retries dnf -y -q install \ "libcudnn${major_version}" \ - "libcudnn${major_version}-devel" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libcudnn${major_version}-devel" sync elif is_cudnn9 ; then - time execute_with_retries dnf -y -q install \ + execute_with_retries dnf -y -q install \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" sync else echo "Unsupported cudnn version: '${major_version}'" @@ -451,23 +447,21 @@ function install_nvidia_cudnn() { apt-get update -qq - time execute_with_retries \ + execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ - "libcudnn8-dev=${cudnn_pkg_version}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libcudnn8-dev=${cudnn_pkg_version}" sync elif is_cudnn9 ; then install_cuda_keyring_pkg apt-get update -qq - time execute_with_retries \ + execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" @@ -478,9 +472,8 @@ function install_nvidia_cudnn() { packages=( "libcudnn${major_version}=${cudnn_pkg_version}" "libcudnn${major_version}-dev=${cudnn_pkg_version}") - time execute_with_retries \ - apt-get install -q -y --no-install-recommends "${packages[*]}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries \ + apt-get install -q -y --no-install-recommends "${packages[*]}" sync else echo "Unsupported OS: '${OS_NAME}'" @@ -692,21 +685,17 @@ function build_driver_from_packages() { fi add_contrib_component apt-get update -qq - execute_with_retries apt-get install -y -qq --no-install-recommends dkms \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries apt-get install -y -qq --no-install-recommends dkms #configure_dkms_certs - time execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" \ - > "${install_log}" 2>&1 || { cat 
"${install_log}" && exit -4 ; } + execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync elif is_rocky ; then #configure_dkms_certs - if time execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } ; then + if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else - time execute_with_retries dnf -y -q module install 'nvidia-driver:latest' \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries dnf -y -q module install 'nvidia-driver:latest' fi sync fi @@ -714,24 +703,22 @@ function build_driver_from_packages() { } function install_nvidia_userspace_runfile() { - if test -f "${download_dir}/userspace-complete" ; then return ; fi + if test -f "${tmpdir}/userspace-complete" ; then return ; fi curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${USERSPACE_URL}" -o "${download_dir}/userspace.run" - time bash "${download_dir}/userspace.run" --no-kernel-modules --silent --install-libglvnd \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } - rm -f "${download_dir}/userspace.run" - touch "${download_dir}/userspace-complete" + "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" + time bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd + rm -f "${tmpdir}/userspace.run" + touch "${tmpdir}/userspace-complete" sync } function install_cuda_runfile() { - if test -f "${download_dir}/cuda-complete" ; then return ; fi + if test -f "${tmpdir}/cuda-complete" ; then return ; fi time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_CUDA_URL}" -o "${download_dir}/cuda.run" - time bash "${download_dir}/cuda.run" --silent --toolkit --no-opengl-libs \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } - rm -f "${download_dir}/cuda.run" - touch "${download_dir}/cuda-complete" + "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" + time bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs + rm -f "${tmpdir}/cuda.run" + touch "${tmpdir}/cuda-complete" sync } @@ -746,12 +733,10 @@ function install_cuda_toolkit() { readonly cudatk_package if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi - time execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} \ - > "${install_log}" 2>&1 || { cat "${install_log}" ; exit -4 ; } + execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} sync elif is_rocky ; then - time execute_with_retries dnf -y -q install "${cudatk_package}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries dnf -y -q install "${cudatk_package}" sync fi } @@ -852,8 +837,7 @@ function install_gpu_agent() { "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" - time execute_with_retries pip install -r "${install_dir}/requirements.txt" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries pip install -r "${install_dir}/requirements.txt" sync # Generate GPU service. 
@@ -1060,11 +1044,11 @@ function main() { if is_debuntu ; then export DEBIAN_FRONTEND=noninteractive - time execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" > /dev/null 2>&1 + execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" > /dev/null 2>&1 elif is_rocky ; then - time execute_with_retries dnf -y -q update --exclude=systemd*,kernel* \ + execute_with_retries dnf -y -q update --exclude=systemd*,kernel* \ > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } - time execute_with_retries dnf -y -q install pciutils gcc \ + execute_with_retries dnf -y -q install pciutils gcc \ > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" @@ -1072,7 +1056,7 @@ function main() { if [[ "${kernel_devel_pkg_out}" =~ 'Unable to find a match: kernel-devel-' ]] ; then # this kernel-devel may have been migrated to the vault local vault="https://download.rockylinux.org/vault/rocky/$(os_version)" - time execute_with_retries dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + execute_with_retries dnf -y -q --setopt=localpkg_gpgcheck=1 install \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ @@ -1241,8 +1225,10 @@ function clean_up_sources_lists() { # cran-r # if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi rm -f /usr/share/keyrings/cran-r.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7' | \ + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ gpg --dearmor -o /usr/share/keyrings/cran-r.gpg sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list fi @@ -1307,16 +1293,17 @@ function exit_handler() { # Process disk usage logs from installation period rm -f /tmp/keep-running-df - sleep 6s + sync + sleep 5.01s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): #Filesystem Size Used Avail Use% Mounted on #/dev/vda2 6.8G 2.5G 4.0G 39% / - df --si - perl -e '$max=( sort + df -h / | tee -a "${tmpdir}/disk-usage.log" + perl -e '$max=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } )[-1]; -print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log + grep { m:^/: } )[0]; +print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log" echo "exit_handler has completed" @@ -1328,24 +1315,24 @@ print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log return 0 } -trap exit_handler EXIT - function prepare_to_install(){ nvsmi_works="0" readonly bdcfg="/usr/local/bin/bdconfig" - download_dir=/tmp/ + tmpdir=/tmp/ + local free_mem + trap exit_handler EXIT free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk - if [[ ${free_mem} -ge 5250000 ]]; then - download_dir="/mnt/shm" - mkdir -p "${download_dir}" - mount -t tmpfs tmpfs "${download_dir}" + if [[ ${free_mem} -ge 10500000 ]]; then + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${download_dir}" + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs 
"${tmpdir}" # Download pip packages to tmpfs - pip config set global.cache-dir "${download_dir}" || echo "unable to set global.cache-dir" + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" # Download OS packages to tmpfs if is_debuntu ; then @@ -1353,8 +1340,10 @@ function prepare_to_install(){ else mount -t tmpfs tmpfs /var/cache/dnf fi + else + tmpdir=/tmp fi - install_log="${download_dir}/install.log" + install_log="${tmpdir}/install.log" if is_debuntu ; then clean_up_sources_lists @@ -1371,23 +1360,23 @@ function prepare_to_install(){ /opt/conda/miniconda3/bin/conda clean -a # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - set +e - time dd if=/dev/zero of=/zero ; sync ; rm -f /zero - set -e - fi + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + df -h + time dd if=/dev/zero of=/zero status=progress ; sync ; sleep 3s ; rm -f /zero + ) fi configure_dkms_certs # Monitor disk usage in a screen session if is_debuntu ; then - apt-get install -y -qq screen > /dev/null 2>&1 - elif is_rocky ; then - dnf -y -q install screen > /dev/null 2>&1 + execute_with_retries apt-get install -y -qq screen + else + execute_with_retries dnf -y -q install screen fi - touch /tmp/keep-running-df + df -h / | tee "${tmpdir}/disk-usage.log" + touch "${tmpdir}/keep-running-df" screen -d -m -US keep-running-df \ - bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df --si / | tee -a /tmp/disk-usage.log ; sleep 5s ; done' + bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done" } prepare_to_install diff --git a/examples/secure-boot/pre-init.sh b/examples/secure-boot/pre-init.sh index 7797b4f..4334935 100644 --- a/examples/secure-boot/pre-init.sh +++ b/examples/secure-boot/pre-init.sh @@ -101,14 +101,14 @@ function generate_from_base_purpose() { # base image -> cuda case "${dataproc_version}" in "2.0-debian10" ) disk_size_gb="38" ;; # 40G 31G 7.8G 80% / # cuda-pre-init-2-0-debian10 - "2.0-rocky8" ) disk_size_gb="35" ;; # 38G 32G 6.2G 84% / # cuda-pre-init-2-0-rocky8 - "2.0-ubuntu18" ) disk_size_gb="37" ;; # 39G 30G 8.5G 79% / # cuda-pre-init-2-0-ubuntu18 - "2.1-debian11" ) disk_size_gb="37" ;; # 39G 34G 4.1G 90% / # cuda-pre-init-2-1-debian11 - "2.1-rocky8" ) disk_size_gb="38" ;; # 41G 35G 6.1G 86% / # cuda-pre-init-2-1-rocky8 - "2.1-ubuntu20" ) disk_size_gb="35" ;; # 37G 32G 4.4G 88% / # cuda-pre-init-2-1-ubuntu20 + "2.0-rocky8" ) disk_size_gb="35" ;; # 38G 32G 6.3G 84% / # cuda-pre-init-2-0-rocky8 + "2.0-ubuntu18" ) disk_size_gb="37" ;; # 39G 30G 8.5G 78% / # cuda-pre-init-2-0-ubuntu18 + "2.1-debian11" ) disk_size_gb="37" ;; # 39G 34G 4.2G 89% / # cuda-pre-init-2-1-debian11 + "2.1-rocky8" ) disk_size_gb="38" ;; # 41G 35G 6.3G 85% / # cuda-pre-init-2-1-rocky8 + "2.1-ubuntu20" ) disk_size_gb="36" ;; # 37G 33G 4.0G 90% / # cuda-pre-init-2-1-ubuntu20 "2.2-debian12" ) disk_size_gb="38" ;; # 40G 35G 3.3G 92% / # cuda-pre-init-2-2-debian12 - "2.2-rocky9" ) disk_size_gb="40" ;; # 42G 36G 5.9G 86% / # cuda-pre-init-2-2-rocky9 - "2.2-ubuntu22" ) disk_size_gb="38" ;; # 40G 35G 4.8G 88% / # cuda-pre-init-2-2-ubuntu22 + "2.2-rocky9" ) disk_size_gb="39" ;; # 43G 36G 7.2G 84% / # cuda-pre-init-2-2-rocky9 + "2.2-ubuntu22" ) disk_size_gb="39" ;; # 40G 36G 4.4G 90% / # cuda-pre-init-2-2-ubuntu22 esac # Install GPU drivers + cuda on dataproc base image @@ -136,7 +136,7 @@ PURPOSE="rapids-pre-init" customization_script="examples/secure-boot/rapids.sh" time 
generate_from_base_purpose "cuda-pre-init" -# Install dask without rapids on base image -PURPOSE="dask-pre-init" -customization_script="examples/secure-boot/dask.sh" -time generate_from_base_purpose "cuda-pre-init" +## Install dask without rapids on base image +#PURPOSE="dask-pre-init" +#customization_script="examples/secure-boot/dask.sh" +#time generate_from_base_purpose "cuda-pre-init" diff --git a/examples/secure-boot/rapids.sh b/examples/secure-boot/rapids.sh index 6c5c9d4..a42335f 100644 --- a/examples/secure-boot/rapids.sh +++ b/examples/secure-boot/rapids.sh @@ -19,11 +19,12 @@ set -euxo pipefail -function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } -function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } -function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } -function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } -function is_debuntu() { is_debian || is_ubuntu ; } +function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function is_ubuntu() ( set +x ; [[ "$(os_id)" == 'ubuntu' ]] ; ) +function is_ubuntu18() ( set +x ; is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; ) +function is_debian() ( set +x ; [[ "$(os_id)" == 'debian' ]] ; ) +function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) function print_metadata_value() { local readonly tmpfile=$(mktemp) From 02e058bbafc29f6476bcb010e1a329bd903f66e1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 29 Oct 2024 21:19:56 -0700 Subject: [PATCH 02/21] using tmpdir argument to runfiles --- examples/secure-boot/build-current-images.sh | 7 ++-- examples/secure-boot/install_gpu_driver.sh | 8 +++-- examples/secure-boot/pre-init.sh | 36 ++++++++++---------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/examples/secure-boot/build-current-images.sh b/examples/secure-boot/build-current-images.sh index 0d7846d..b2cfeef 100644 --- a/examples/secure-boot/build-current-images.sh +++ b/examples/secure-boot/build-current-images.sh @@ -85,7 +85,7 @@ configure_service_account session_name="build-current-images" readonly timestamp="$(date +%F-%H-%M)" -#readonly timestamp="2024-10-24-04-21" +#readonly timestamp="2024-10-30-01-43" export timestamp export tmpdir=/tmp/${timestamp}; @@ -116,7 +116,10 @@ my( $dp_version ) = ($config =~ /-pre-init-(.+)/); $dp_version =~ s/-/./; my($max) = map { / maximum-disk-used: (\d+)/ } @raw_lines; -$max+=3; +if ( $fn =~ /2.0-(debian10|ubuntu18)/ ) +{ $max+=4 }else +{ $max+=3 } +$max = 30 if $max < 30; my $i_dp_version = sprintf(q{%-15s}, qq{"$dp_version"}); print( qq{ $i_dp_version) disk_size_gb="$max" ;; # $stats # $config}, $/ ); diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 6b9f492..132d3c7 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -235,7 +235,8 @@ if ( compare_versions_lte "8.3.1.22" "${CUDNN_VERSION}" ); then fi if is_cuda12 ; then # When cuda version is 12 - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.5.1.17_cuda12-archive.tar.xz" + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz" + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" fi readonly CUDNN_TARBALL readonly CUDNN_TARBALL_URL @@ -706,7 +707,7 @@ function install_nvidia_userspace_runfile() { if test -f 
"${tmpdir}/userspace-complete" ; then return ; fi curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" - time bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd + time bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}" rm -f "${tmpdir}/userspace.run" touch "${tmpdir}/userspace-complete" sync @@ -716,7 +717,7 @@ function install_cuda_runfile() { if test -f "${tmpdir}/cuda-complete" ; then return ; fi time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" - time bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs + time bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}" rm -f "${tmpdir}/cuda.run" touch "${tmpdir}/cuda-complete" sync @@ -1300,6 +1301,7 @@ function exit_handler() { #Filesystem Size Used Avail Use% Mounted on #/dev/vda2 6.8G 2.5G 4.0G 39% / df -h / | tee -a "${tmpdir}/disk-usage.log" + cat "${tmpdir}/disk-usage.log" | sort perl -e '$max=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } )[0]; diff --git a/examples/secure-boot/pre-init.sh b/examples/secure-boot/pre-init.sh index 4334935..24c733f 100644 --- a/examples/secure-boot/pre-init.sh +++ b/examples/secure-boot/pre-init.sh @@ -100,15 +100,15 @@ function generate_from_base_purpose() { # base image -> cuda case "${dataproc_version}" in - "2.0-debian10" ) disk_size_gb="38" ;; # 40G 31G 7.8G 80% / # cuda-pre-init-2-0-debian10 - "2.0-rocky8" ) disk_size_gb="35" ;; # 38G 32G 6.3G 84% / # cuda-pre-init-2-0-rocky8 - "2.0-ubuntu18" ) disk_size_gb="37" ;; # 39G 30G 8.5G 78% / # cuda-pre-init-2-0-ubuntu18 - "2.1-debian11" ) disk_size_gb="37" ;; # 39G 34G 4.2G 89% / # cuda-pre-init-2-1-debian11 - "2.1-rocky8" ) disk_size_gb="38" ;; # 41G 35G 6.3G 85% / # cuda-pre-init-2-1-rocky8 - "2.1-ubuntu20" ) disk_size_gb="36" ;; # 37G 33G 4.0G 90% / # cuda-pre-init-2-1-ubuntu20 - "2.2-debian12" ) disk_size_gb="38" ;; # 40G 35G 3.3G 92% / # cuda-pre-init-2-2-debian12 - "2.2-rocky9" ) disk_size_gb="39" ;; # 43G 36G 7.2G 84% / # cuda-pre-init-2-2-rocky9 - "2.2-ubuntu22" ) disk_size_gb="39" ;; # 40G 36G 4.4G 90% / # cuda-pre-init-2-2-ubuntu22 + "2.0-debian10" ) disk_size_gb="33" ;; # 38G 17G 19G 47% / # cuda-pre-init-2-0-debian10 + "2.0-rocky8" ) disk_size_gb="32" ;; # 32G 19G 14G 59% / # cuda-pre-init-2-0-rocky8 + "2.0-ubuntu18" ) disk_size_gb="32" ;; # 36G 17G 20G 46% / # cuda-pre-init-2-0-ubuntu18 + "2.1-debian11" ) disk_size_gb="34" ;; # 34G 20G 13G 63% / # cuda-pre-init-2-1-debian11 + "2.1-rocky8" ) disk_size_gb="36" ;; # 36G 22G 15G 61% / # cuda-pre-init-2-1-rocky8 + "2.1-ubuntu20" ) disk_size_gb="33" ;; # 33G 20G 13G 61% / # cuda-pre-init-2-1-ubuntu20 + "2.2-debian12" ) disk_size_gb="36" ;; # 36G 23G 11G 69% / # cuda-pre-init-2-2-debian12 + "2.2-rocky9" ) disk_size_gb="37" ;; # 37G 23G 15G 62% / # cuda-pre-init-2-2-rocky9 + "2.2-ubuntu22" ) disk_size_gb="35" ;; # 35G 23G 13G 65% / # cuda-pre-init-2-2-ubuntu22 esac # Install GPU drivers + cuda on dataproc base image @@ -118,15 +118,15 @@ time generate_from_dataproc_version "${dataproc_version}" # cuda image -> rapids case "${dataproc_version}" in - "2.0-debian10" ) disk_size_gb="44" ;; # 47G 41G 4.0G 91% / # rapids-pre-init-2-0-debian10 - "2.0-rocky8" ) disk_size_gb="45" ;; # 49G 42G 7.0G 86% / # rapids-pre-init-2-0-rocky8 - "2.0-ubuntu18" ) disk_size_gb="43" ;; # 45G 40G 4.9G 90% / # rapids-pre-init-2-0-ubuntu18 - "2.1-debian11" ) 
disk_size_gb="46" ;; # 49G 43G 3.6G 93% / # rapids-pre-init-2-1-debian11 - "2.1-rocky8" ) disk_size_gb="48" ;; # 52G 45G 7.2G 87% / # rapids-pre-init-2-1-rocky8 - "2.1-ubuntu20" ) disk_size_gb="45" ;; # 47G 42G 5.2G 89% / # rapids-pre-init-2-1-ubuntu20 - "2.2-debian12" ) disk_size_gb="48" ;; # 51G 45G 3.8G 93% / # rapids-pre-init-2-2-debian12 - "2.2-rocky9" ) disk_size_gb="49" ;; # 53G 46G 7.2G 87% / # rapids-pre-init-2-2-rocky9 - "2.2-ubuntu22" ) disk_size_gb="48" ;; # 50G 45G 5.6G 89% / # rapids-pre-init-2-2-ubuntu22 + "2.0-debian10" ) disk_size_gb="42" ;; # 41G 29G 11G 74% / # rapids-pre-init-2-0-debian10 + "2.0-rocky8" ) disk_size_gb="42" ;; # 42G 30G 13G 70% / # rapids-pre-init-2-0-rocky8 + "2.0-ubuntu18" ) disk_size_gb="41" ;; # 39G 28G 11G 72% / # rapids-pre-init-2-0-ubuntu18 + "2.1-debian11" ) disk_size_gb="43" ;; # 43G 31G 9.6G 77% / # rapids-pre-init-2-1-debian11 + "2.1-rocky8" ) disk_size_gb="45" ;; # 45G 33G 13G 72% / # rapids-pre-init-2-1-rocky8 + "2.1-ubuntu20" ) disk_size_gb="43" ;; # 42G 31G 12G 74% / # rapids-pre-init-2-1-ubuntu20 + "2.2-debian12" ) disk_size_gb="45" ;; # 45G 33G 9.5G 78% / # rapids-pre-init-2-2-debian12 + "2.2-rocky9" ) disk_size_gb="46" ;; # 46G 34G 13G 73% / # rapids-pre-init-2-2-rocky9 + "2.2-ubuntu22" ) disk_size_gb="45" ;; # 44G 33G 11G 76% / # rapids-pre-init-2-2-ubuntu22 esac #disk_size_gb="50" From 514791cf53db2ef88b34fad5ad73b80429b89e1f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 29 Oct 2024 23:21:32 -0700 Subject: [PATCH 03/21] reducing the max disk size for 2.0- debian and ubuntu ; runfile uses --tmpdir argument now --- custom_image_utils/shell_script_generator.py | 24 ++++++++++++-------- examples/secure-boot/build-current-images.sh | 6 ++--- examples/secure-boot/install_gpu_driver.sh | 7 +++--- examples/secure-boot/pre-init.sh | 18 ++++++++++----- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/custom_image_utils/shell_script_generator.py b/custom_image_utils/shell_script_generator.py index 9aa4e4e..0a823c1 100644 --- a/custom_image_utils/shell_script_generator.py +++ b/custom_image_utils/shell_script_generator.py @@ -46,21 +46,22 @@ if [[ -f /tmp/{run_id}/vm_created ]]; then ( set +e echo 'Deleting VM instance.' - execute_with_retries gcloud compute instances delete {image_name}-install \ - --project={project_id} --zone={zone} -q + execute_with_retries \ + gcloud compute instances delete {image_name}-install --project={project_id} --zone={zone} -q ) elif [[ -f /tmp/{run_id}/disk_created ]]; then echo 'Deleting disk.' - execute_with_retries gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} --zone={zone} -q + execute_with_retries \ + gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} --zone={zone} -q fi echo 'Uploading local logs to GCS bucket.' 
gsutil -m rsync -r {log_dir}/ {gcs_log_dir}/ if [[ -f /tmp/{run_id}/image_created ]]; then - echo -e "${{GREEN}}Workflow succeeded, check logs at {log_dir}/ or {gcs_log_dir}/${{NC}}" + echo -e "${{GREEN}}Workflow succeeded${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/" exit 0 else - echo -e "${{RED}}Workflow failed, check logs at {log_dir}/ or {gcs_log_dir}/${{NC}}" + echo -e "${{RED}}Workflow failed${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/" exit 1 fi }} @@ -141,11 +142,12 @@ num_src_certs="${{#src_img_modulus_md5sums[@]}}" echo "debug - num_src_certs: [${{#src_img_modulus_md5sums[*]}}]" echo "value of src_img_modulus_md5sums: [${{src_img_modulus_md5sums}}]" - echo "${{num_src_certs}} db certificates attached to source image" if [[ -z "${{src_img_modulus_md5sums}}" ]]; then + num_src_certs=0 echo "no db certificates in source image" cert_list="${{default_cert_list}}" else + echo "${{num_src_certs}} db certificates attached to source image" echo "db certs exist in source image" for cert in ${{default_cert_list[*]}}; do if test_element_in_array "$(print_modulus_md5sum ${{cert}})" ${{src_img_modulus_md5sums[@]}} ; then @@ -179,7 +181,8 @@ echo 'Creating image.' base_obj_type="images" instance_disk_args='--image-project={project_id} --image={image_name}-install --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd' - time execute_with_retries gcloud compute images create {image_name}-install \ + time execute_with_retries \ + gcloud compute images create {image_name}-install \ --project={project_id} \ --source-image={dataproc_base_image} \ ${{cert_args}} \ @@ -219,9 +222,10 @@ touch /tmp/{run_id}/vm_created # clean up intermediate install image - if [[ "${{base_obj_type}}" == "images" ]] ; then - execute_with_retries gcloud compute images delete -q {image_name}-install --project={project_id} - fi + if [[ "${{base_obj_type}}" == "images" ]] ; then ( set +e + # This sometimes returns an API error but deletes the image despite the failure + gcloud compute images delete -q {image_name}-install --project={project_id} + ) fi echo 'Waiting for customization script to finish and VM shutdown.' execute_with_retries gcloud compute instances tail-serial-port-output {image_name}-install \ diff --git a/examples/secure-boot/build-current-images.sh b/examples/secure-boot/build-current-images.sh index b2cfeef..e37bd76 100644 --- a/examples/secure-boot/build-current-images.sh +++ b/examples/secure-boot/build-current-images.sh @@ -85,7 +85,7 @@ configure_service_account session_name="build-current-images" readonly timestamp="$(date +%F-%H-%M)" -#readonly timestamp="2024-10-30-01-43" +#readonly timestamp="2024-10-30-04-13" export timestamp export tmpdir=/tmp/${timestamp}; @@ -116,9 +116,7 @@ my( $dp_version ) = ($config =~ /-pre-init-(.+)/); $dp_version =~ s/-/./; my($max) = map { / maximum-disk-used: (\d+)/ } @raw_lines; -if ( $fn =~ /2.0-(debian10|ubuntu18)/ ) -{ $max+=4 }else -{ $max+=3 } +$max+=3; $max = 30 if $max < 30; my $i_dp_version = sprintf(q{%-15s}, qq{"$dp_version"}); diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 132d3c7..780e239 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -265,7 +265,8 @@ function execute_with_retries() ( local -r cmd="$*" if [[ "$cmd" =~ "^apt-get install" ]] ; then - cmd="apt-get -y clean && $cmd" + apt-get -y clean + apt-get -y autoremove fi for ((i = 0; i < 3; i++)); do time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? 
; cat "${install_log}" ; } @@ -707,7 +708,7 @@ function install_nvidia_userspace_runfile() { if test -f "${tmpdir}/userspace-complete" ; then return ; fi curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" - time bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}" + execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}" rm -f "${tmpdir}/userspace.run" touch "${tmpdir}/userspace-complete" sync @@ -717,7 +718,7 @@ function install_cuda_runfile() { if test -f "${tmpdir}/cuda-complete" ; then return ; fi time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" - time bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}" + execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}" rm -f "${tmpdir}/cuda.run" touch "${tmpdir}/cuda-complete" sync diff --git a/examples/secure-boot/pre-init.sh b/examples/secure-boot/pre-init.sh index 24c733f..f1fdce3 100644 --- a/examples/secure-boot/pre-init.sh +++ b/examples/secure-boot/pre-init.sh @@ -67,6 +67,12 @@ function generate() { return fi + local install_image="$(jq -r ".[] | select(.name == \"${image_name}-install\").name" "${tmpdir}/images.json")" + if [[ -n "${install_image}" ]] ; then + echo "Install image already exists" + gcloud -q compute images delete "${image_name}-install" + fi + local instance="$(jq -r ".[] | select(.name == \"${image_name}-install\").name" "${tmpdir}/instances.json")" if [[ -n "${instance}" ]]; then @@ -100,15 +106,15 @@ function generate_from_base_purpose() { # base image -> cuda case "${dataproc_version}" in - "2.0-debian10" ) disk_size_gb="33" ;; # 38G 17G 19G 47% / # cuda-pre-init-2-0-debian10 + "2.0-debian10" ) disk_size_gb="32" ;; # 33G 17G 15G 55% / # cuda-pre-init-2-0-debian10 "2.0-rocky8" ) disk_size_gb="32" ;; # 32G 19G 14G 59% / # cuda-pre-init-2-0-rocky8 - "2.0-ubuntu18" ) disk_size_gb="32" ;; # 36G 17G 20G 46% / # cuda-pre-init-2-0-ubuntu18 + "2.0-ubuntu18" ) disk_size_gb="32" ;; # 31G 17G 15G 53% / # cuda-pre-init-2-0-ubuntu18 "2.1-debian11" ) disk_size_gb="34" ;; # 34G 20G 13G 63% / # cuda-pre-init-2-1-debian11 "2.1-rocky8" ) disk_size_gb="36" ;; # 36G 22G 15G 61% / # cuda-pre-init-2-1-rocky8 - "2.1-ubuntu20" ) disk_size_gb="33" ;; # 33G 20G 13G 61% / # cuda-pre-init-2-1-ubuntu20 + "2.1-ubuntu20" ) disk_size_gb="34" ;; # 32G 20G 12G 63% / # cuda-pre-init-2-1-ubuntu20 "2.2-debian12" ) disk_size_gb="36" ;; # 36G 23G 11G 69% / # cuda-pre-init-2-2-debian12 "2.2-rocky9" ) disk_size_gb="37" ;; # 37G 23G 15G 62% / # cuda-pre-init-2-2-rocky9 - "2.2-ubuntu22" ) disk_size_gb="35" ;; # 35G 23G 13G 65% / # cuda-pre-init-2-2-ubuntu22 + "2.2-ubuntu22" ) disk_size_gb="36" ;; # 34G 23G 12G 67% / # cuda-pre-init-2-2-ubuntu22 esac # Install GPU drivers + cuda on dataproc base image @@ -118,9 +124,9 @@ time generate_from_dataproc_version "${dataproc_version}" # cuda image -> rapids case "${dataproc_version}" in - "2.0-debian10" ) disk_size_gb="42" ;; # 41G 29G 11G 74% / # rapids-pre-init-2-0-debian10 + "2.0-debian10" ) disk_size_gb="41" ;; # 42G 29G 12G 72% / # rapids-pre-init-2-0-debian10 "2.0-rocky8" ) disk_size_gb="42" ;; # 42G 30G 13G 70% / # rapids-pre-init-2-0-rocky8 - "2.0-ubuntu18" ) disk_size_gb="41" ;; # 39G 28G 11G 72% / # rapids-pre-init-2-0-ubuntu18 + "2.0-ubuntu18" ) disk_size_gb="41" ;; # 40G 28G 12G 70% / # 
rapids-pre-init-2-0-ubuntu18 "2.1-debian11" ) disk_size_gb="43" ;; # 43G 31G 9.6G 77% / # rapids-pre-init-2-1-debian11 "2.1-rocky8" ) disk_size_gb="45" ;; # 45G 33G 13G 72% / # rapids-pre-init-2-1-rocky8 "2.1-ubuntu20" ) disk_size_gb="43" ;; # 42G 31G 12G 74% / # rapids-pre-init-2-1-ubuntu20 From 14ba6188bd9015bfa2c20a963ed0f24a344c66fb Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 30 Oct 2024 00:32:28 -0700 Subject: [PATCH 04/21] corrected cert list array logic --- custom_image_utils/shell_script_generator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/custom_image_utils/shell_script_generator.py b/custom_image_utils/shell_script_generator.py index 0a823c1..82d4cde 100644 --- a/custom_image_utils/shell_script_generator.py +++ b/custom_image_utils/shell_script_generator.py @@ -135,7 +135,8 @@ local -a cert_list=() - local -a default_cert_list=("{trusted_cert}" "${{MS_UEFI_CA}}") + local -a default_cert_list + default_cert_list=("{trusted_cert}" "${{MS_UEFI_CA}}") local -a src_img_modulus_md5sums=() mapfile -t src_img_modulus_md5sums < <(print_img_dbs_modulus_md5sums {dataproc_base_image}) @@ -145,7 +146,7 @@ if [[ -z "${{src_img_modulus_md5sums}}" ]]; then num_src_certs=0 echo "no db certificates in source image" - cert_list="${{default_cert_list}}" + cert_list=( "${{default_cert_list[@]}}" ) else echo "${{num_src_certs}} db certificates attached to source image" echo "db certs exist in source image" From a20f4709663a389cd78de73792c213c8a7cd60a3 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 31 Oct 2024 00:33:16 -0700 Subject: [PATCH 05/21] [secure-boot] updated examples and exercised metadata injection README.md * clarified that passing an empty string to --trusted-cert will disable secure-boot custom_image_utils/shell_script_generator.py * execute_with_retries reduces log noise and emits timing information * inject secret metadata unless --trusted-cert="" is passed examples/secure-boot/build-current-images.sh * using ceil to round up in disk usage analysis perl script * example of calling perl script cleaned up a little examples/secure-boot/install_gpu_driver.sh examples/secure-boot/dask.sh examples/secure-boot/rapids.sh * matched exit_handler implementations between the three of the above * collect disk usage information * disk sizes are collected in 1k pages instead of human-readable * mounting ram disk at /tmp to reduce disk usage further examples/secure-boot/pre-init.sh * removed metadata from arguments, since it will be injected in the generated script * passing creating-image metadata to zero disk before and after run * updated disk sizes --- README.md | 8 +- custom_image_utils/shell_script_generator.py | 19 +-- examples/secure-boot/build-current-images.sh | 19 +-- examples/secure-boot/dask.sh | 113 ++++++++++----- examples/secure-boot/install_gpu_driver.sh | 105 +++++++++----- examples/secure-boot/pre-init.sh | 45 +++--- examples/secure-boot/rapids.sh | 139 ++++++++++++------- 7 files changed, 285 insertions(+), 163 deletions(-) diff --git a/README.md b/README.md index f0a1111..e55a81b 100644 --- a/README.md +++ b/README.md @@ -133,10 +133,10 @@ python generate_custom_image.py \ default value of 300 seconds will be used. * **--dry-run**: Dry run mode which only validates input and generates workflow script without creating image. Disabled by default. -* **--trusted-cert**: a certificate in DER format to be inserted - into the custom image's EFI boot sector. Can be generated by - reading examples/secure-boot/README.md. 
This argument is mutually - exclusive with base-image-family + +* **--trusted-cert**: (Optional) Pass an empty string to this + argument to disable support for shielded-secure-boot. + * **--metadata**: VM metadata which can be read by the customization script with `/usr/share/google/get_metadata_value attributes/` at runtime. The value of this flag takes the form of `key1=value1,key2=value2,...`. If the diff --git a/custom_image_utils/shell_script_generator.py b/custom_image_utils/shell_script_generator.py index 82d4cde..448d76a 100644 --- a/custom_image_utils/shell_script_generator.py +++ b/custom_image_utils/shell_script_generator.py @@ -35,7 +35,10 @@ local -r cmd="$*" for ((i = 0; i < 3; i++)); do - if eval "$cmd"; then return 0 ; fi + set -x + time eval "$cmd" > "/tmp/{run_id}/install.log" 2>&1 && retval=$? || {{ retval=$? ; cat "/tmp/{run_id}/install.log" ; }} + set +x + if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 @@ -182,7 +185,7 @@ echo 'Creating image.' base_obj_type="images" instance_disk_args='--image-project={project_id} --image={image_name}-install --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd' - time execute_with_retries \ + execute_with_retries \ gcloud compute images create {image_name}-install \ --project={project_id} \ --source-image={dataproc_base_image} \ @@ -194,7 +197,7 @@ echo 'Creating disk.' base_obj_type="disks" instance_disk_args='--disk=auto-delete=yes,boot=yes,mode=rw,name={image_name}-install' - time execute_with_retries gcloud compute disks create {image_name}-install \ + execute_with_retries gcloud compute disks create {image_name}-install \ --project={project_id} \ --zone={zone} \ --image={dataproc_base_image} \ @@ -205,8 +208,7 @@ date echo 'Creating VM instance to run customization script.' - ( set -x - time execute_with_retries gcloud compute instances create {image_name}-install \ + execute_with_retries gcloud compute instances create {image_name}-install \ --project={project_id} \ --zone={zone} \ {network_flag} \ @@ -218,7 +220,7 @@ {service_account_flag} \ --scopes=cloud-platform \ "${{metadata_arg}}" \ - --metadata-from-file startup-script=startup_script/run.sh ) + --metadata-from-file startup-script=startup_script/run.sh touch /tmp/{run_id}/vm_created @@ -252,13 +254,12 @@ date echo 'Creating custom image.' 
- ( set -x - time execute_with_retries gcloud compute images create {image_name} \ + execute_with_retries gcloud compute images create {image_name} \ --project={project_id} \ --source-disk-zone={zone} \ --source-disk={image_name}-install \ {storage_location_flag} \ - --family={family} ) + --family={family} touch /tmp/{run_id}/image_created }} diff --git a/examples/secure-boot/build-current-images.sh b/examples/secure-boot/build-current-images.sh index e37bd76..858f7c5 100644 --- a/examples/secure-boot/build-current-images.sh +++ b/examples/secure-boot/build-current-images.sh @@ -85,7 +85,7 @@ configure_service_account session_name="build-current-images" readonly timestamp="$(date +%F-%H-%M)" -#readonly timestamp="2024-10-30-04-13" +#readonly timestamp="2024-10-31-05-55" export timestamp export tmpdir=/tmp/${timestamp}; @@ -103,6 +103,7 @@ screen -US "${session_name}" -c examples/secure-boot/pre-init.screenrc function find_disk_usage() { test -f /tmp/genline.pl || cat > /tmp/genline.pl<<'EOF' #!/usr/bin/perl -w +use POSIX qw(ceil); use strict; my $fn = $ARGV[0]; @@ -111,20 +112,20 @@ my( $config ) = ( $fn =~ /custom-image-(.*-(debian|rocky|ubuntu)\d+)-\d+/ ); my @raw_lines = ; my( $l ) = grep { m: /dev/.*/\s*$: } @raw_lines; my( $stats ) = ( $l =~ m:\s*/dev/\S+\s+(.*?)\s*$: ); +$stats =~ s:(\d{4,}):sprintf(q{%-6s}, sprintf(q{%.2fG},($1/1024)/1024)):eg; my( $dp_version ) = ($config =~ /-pre-init-(.+)/); $dp_version =~ s/-/./; -my($max) = map { / maximum-disk-used: (\d+)/ } @raw_lines; -$max+=3; -$max = 30 if $max < 30; +my($max) = map { / maximum-disk-used: (\d+)/ } @raw_lines; +my($gbmax) = ceil((($max / 1024) / 1024) * 1.03); +$gbmax = 30 if $gbmax < 30; my $i_dp_version = sprintf(q{%-15s}, qq{"$dp_version"}); - -print( qq{ $i_dp_version) disk_size_gb="$max" ;; # $stats # $config}, $/ ); +print( qq{ $i_dp_version) disk_size_gb="$gbmax" ;; # $stats # $config}, $/ ); EOF - for f in $(grep -l 'Customization script suc' /tmp/custom-image-*/logs/workflow.log|sed -e 's/workflow.log/startup-script.log/') - do - grep -A20 'Filesystem.*Avail' $f | perl /tmp/genline.pl $f + for workflow_log in $(grep -l "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do + startup_log=$(echo "${workflow_log}" | sed -e 's/workflow.log/startup-script.log/') + grep -A5 'Filesystem.*Avail' "${startup_log}" | perl /tmp/genline.pl "${workflow_log}" done } diff --git a/examples/secure-boot/dask.sh b/examples/secure-boot/dask.sh index e1c1229..b71b4e5 100644 --- a/examples/secure-boot/dask.sh +++ b/examples/secure-boot/dask.sh @@ -517,8 +517,8 @@ function main() { echo "Dask for ${DASK_RUNTIME} successfully initialized." } -function exit_handler() ( - set +e +function exit_handler() { + set +ex echo "Exit handler invoked" # Free conda cache @@ -527,16 +527,30 @@ function exit_handler() ( # Clear pip cache pip cache purge || echo "unable to purge pip cache" - # remove the tmpfs conda pkgs_dirs - if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi - - # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do - if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then - rm -rf ${shmdir}/* - umount -f ${shmdir} - fi - done + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # Stop hadoop services + systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + + # remove the tmpfs conda pkgs_dirs + /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config" + + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do + if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then + rm -rf ${shmdir}/* + sync + sleep 3s + execute_with_retries umount -f ${shmdir} + fi + done + + umount -f /tmp + systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi # Clean up OS package cache ; re-hold systemd package if is_debuntu ; then @@ -546,36 +560,62 @@ function exit_handler() ( dnf clean all fi - # print disk usage statistics - if is_debuntu ; then - # Rocky doesn't have sort -h and fails when the argument is passed - du --max-depth 3 -hx / | sort -h | tail -10 + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + elif is_debian ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 fi # Process disk usage logs from installation period - rm -f /tmp/keep-running-df - sleep 6s + rm -f /run/keep-running-df + sync + sleep 5.01s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): -#Filesystem Size Used Avail Use% Mounted on -#/dev/vda2 6.8G 2.5G 4.0G 39% / - df --si - perl -e '$max=( sort - map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } )[-1]; -print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e '@siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" echo "exit_handler has completed" # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero ; sync ; rm -f /zero + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero fi return 0 -) - -trap exit_handler EXIT +} function prepare_to_install() { readonly DEFAULT_CUDA_VERSION="12.4" @@ -601,7 +641,8 @@ function prepare_to_install() { free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk - if [[ ${free_mem} -ge 5250000 ]]; then + if [[ ${free_mem} -ge 10500000 ]]; then + tmpdir=/mnt/shm mkdir -p /mnt/shm mount -t tmpfs tmpfs /mnt/shm @@ -618,18 +659,22 @@ function prepare_to_install() { else mount -t tmpfs tmpfs /var/cache/dnf fi + else + tmpdir=/tmp fi + install_log="/run/install.log" + trap exit_handler EXIT # Monitor disk usage in a screen session if is_debuntu ; then apt-get install -y -qq screen - elif is_rocky ; then + else dnf -y -q 
install screen fi - rm -f /tmp/disk-usage.log - touch /tmp/keep-running-df + df / | tee "/run/disk-usage.log" + touch "/run/keep-running-df" screen -d -m -US keep-running-df \ - bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df --si / | tee -a /tmp/disk-usage.log ; sleep 5s ; done' + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" } prepare_to_install diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 780e239..7add73b 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -269,7 +269,9 @@ function execute_with_retries() ( apt-get -y autoremove fi for ((i = 0; i < 3; i++)); do + set -x time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + set +x if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done @@ -1250,32 +1252,42 @@ function clean_up_sources_lists() { } function exit_handler() { - echo "Exit handler invoked" set +ex + echo "Exit handler invoked" + # Purge private key material until next grant clear_dkms_key # Free conda cache - /opt/conda/miniconda3/bin/conda clean -a + /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1 # Clear pip cache pip cache purge || echo "unable to purge pip cache" - # remove the tmpfs conda pkgs_dirs - if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # Stop hadoop services + systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to set global pip cache" + # remove the tmpfs conda pkgs_dirs + /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config" - # Clean up shared memory mounts - for shmdir in /mnt/shm /var/cache/apt/archives /var/cache/dnf ; do - if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then - rm -rf ${shmdir}/* - sync + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" - execute_with_retries umount -f ${shmdir} - fi - done + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do + if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then + rm -rf ${shmdir}/* + sync + sleep 3s + execute_with_retries umount -f ${shmdir} + fi + done + + umount -f /tmp + systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi # Clean up OS package cache ; re-hold systemd package if is_debuntu ; then @@ -1287,32 +1299,58 @@ function exit_handler() { dnf clean all fi - # print disk usage statistics - if is_debuntu ; then - # Rocky doesn't have sort -h and fails when the argument is passed - du --max-depth 3 -hx / | sort -h | tail -10 + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + elif is_debian ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /usr/local/cuda-1?.? 
\ + /opt/conda/miniconda3 + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 fi # Process disk usage logs from installation period - rm -f /tmp/keep-running-df + rm -f /run/keep-running-df sync sleep 5.01s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): -#Filesystem Size Used Avail Use% Mounted on -#/dev/vda2 6.8G 2.5G 4.0G 39% / - df -h / | tee -a "${tmpdir}/disk-usage.log" - cat "${tmpdir}/disk-usage.log" | sort - perl -e '$max=( sort { $a => $b } +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e '@siz=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } )[0]; -print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log" + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" echo "exit_handler has completed" # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero ; sync ; rm -f /zero + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero fi return 0 @@ -1327,6 +1365,12 @@ function prepare_to_install(){ free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk if [[ ${free_mem} -ge 10500000 ]]; then + # Services might use /tmp for temporary files + echo "debug: this may break things!" + systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + sudo mount -t tmpfs tmpfs /tmp + systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + tmpdir="/mnt/shm" mkdir -p "${tmpdir}" mount -t tmpfs tmpfs "${tmpdir}" @@ -1364,7 +1408,6 @@ function prepare_to_install(){ # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e - df -h time dd if=/dev/zero of=/zero status=progress ; sync ; sleep 3s ; rm -f /zero ) fi @@ -1376,10 +1419,10 @@ function prepare_to_install(){ else execute_with_retries dnf -y -q install screen fi - df -h / | tee "${tmpdir}/disk-usage.log" - touch "${tmpdir}/keep-running-df" + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" screen -d -m -US keep-running-df \ - bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done" + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" } prepare_to_install diff --git a/examples/secure-boot/pre-init.sh b/examples/secure-boot/pre-init.sh index f1fdce3..698b891 100644 --- a/examples/secure-boot/pre-init.sh +++ b/examples/secure-boot/pre-init.sh @@ -37,13 +37,10 @@ gcloud config set project ${PROJECT_ID} #gcloud auth login eval "$(bash examples/secure-boot/create-key-pair.sh)" -metadata="public_secret_name=${public_secret_name}" -metadata="${metadata},private_secret_name=${private_secret_name}" -metadata="${metadata},secret_project=${secret_project}" -metadata="${metadata},secret_version=${secret_version}" -metadata="${metadata},dask-runtime=standalone" +metadata="dask-runtime=standalone" metadata="${metadata},rapids-runtime=DASK" metadata="${metadata},cuda-version=12.4" +metadata="${metadata},creating-image=c9h" # If no OS family specified, default to debian if [[ "${IMAGE_VERSION}" != *-* ]] ; then @@ -69,7 +66,7 @@ function generate() { local install_image="$(jq -r ".[] | select(.name == \"${image_name}-install\").name" "${tmpdir}/images.json")" if [[ -n "${install_image}" ]] ; then - echo "Install image already exists" + echo "Install image already exists. Cleaning up after aborted run." 
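The check-and-delete step here follows the snapshot-then-probe pattern used throughout these scripts: list the project's images once into a JSON file, then query that file with jq instead of calling the API for each image. A minimal stand-alone sketch of the same pattern follows; the image name and temporary file are illustrative only and are not taken from the patch.

# Snapshot the project's images once, then probe the snapshot with jq.
images_json="$(mktemp)"
gcloud compute images list --format json > "${images_json}"

function image_exists() {
  # Succeeds when an image with the given name appears in the snapshot.
  local name="$1"
  test -n "$(jq -r ".[] | select(.name == \"${name}\").name" "${images_json}")"
}

if image_exists "my-image-install" ; then
  echo "Install image already exists. Cleaning up after aborted run."
  gcloud -q compute images delete "my-image-install"
fi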
gcloud -q compute images delete "${image_name}-install" fi @@ -106,15 +103,15 @@ function generate_from_base_purpose() { # base image -> cuda case "${dataproc_version}" in - "2.0-debian10" ) disk_size_gb="32" ;; # 33G 17G 15G 55% / # cuda-pre-init-2-0-debian10 - "2.0-rocky8" ) disk_size_gb="32" ;; # 32G 19G 14G 59% / # cuda-pre-init-2-0-rocky8 - "2.0-ubuntu18" ) disk_size_gb="32" ;; # 31G 17G 15G 53% / # cuda-pre-init-2-0-ubuntu18 - "2.1-debian11" ) disk_size_gb="34" ;; # 34G 20G 13G 63% / # cuda-pre-init-2-1-debian11 - "2.1-rocky8" ) disk_size_gb="36" ;; # 36G 22G 15G 61% / # cuda-pre-init-2-1-rocky8 - "2.1-ubuntu20" ) disk_size_gb="34" ;; # 32G 20G 12G 63% / # cuda-pre-init-2-1-ubuntu20 - "2.2-debian12" ) disk_size_gb="36" ;; # 36G 23G 11G 69% / # cuda-pre-init-2-2-debian12 - "2.2-rocky9" ) disk_size_gb="37" ;; # 37G 23G 15G 62% / # cuda-pre-init-2-2-rocky9 - "2.2-ubuntu22" ) disk_size_gb="36" ;; # 34G 23G 12G 67% / # cuda-pre-init-2-2-ubuntu22 + "2.0-debian10" ) disk_size_gb="30" ;; # 29.30G 28.29G 0 100% / # cuda-pre-init-2-0-debian10 + "2.0-rocky8" ) disk_size_gb="30" ;; # 29.79G 28.94G 0.85G 98% / # cuda-pre-init-2-0-rocky8 + "2.0-ubuntu18" ) disk_size_gb="30" ;; # 28.89G 27.64G 1.24G 96% / # cuda-pre-init-2-0-ubuntu18 + "2.1-debian11" ) disk_size_gb="32" ;; # 31.26G 30.74G 0 100% / # cuda-pre-init-2-1-debian11 + "2.1-rocky8" ) disk_size_gb="34" ;; # 33.79G 32.00G 1.80G 95% / # cuda-pre-init-2-1-rocky8 + "2.1-ubuntu20" ) disk_size_gb="32" ;; # 30.83G 30.35G 0.46G 99% / # cuda-pre-init-2-1-ubuntu20 + "2.2-debian12" ) disk_size_gb="34" ;; # 33.23G 32.71G 0 100% / # cuda-pre-init-2-2-debian12 + "2.2-rocky9" ) disk_size_gb="35" ;; # 34.79G 33.16G 1.64G 96% / # cuda-pre-init-2-2-rocky9 + "2.2-ubuntu22" ) disk_size_gb="35" ;; # 33.74G 32.94G 0.78G 98% / # cuda-pre-init-2-2-ubuntu22 esac # Install GPU drivers + cuda on dataproc base image @@ -124,15 +121,15 @@ time generate_from_dataproc_version "${dataproc_version}" # cuda image -> rapids case "${dataproc_version}" in - "2.0-debian10" ) disk_size_gb="41" ;; # 42G 29G 12G 72% / # rapids-pre-init-2-0-debian10 - "2.0-rocky8" ) disk_size_gb="42" ;; # 42G 30G 13G 70% / # rapids-pre-init-2-0-rocky8 - "2.0-ubuntu18" ) disk_size_gb="41" ;; # 40G 28G 12G 70% / # rapids-pre-init-2-0-ubuntu18 - "2.1-debian11" ) disk_size_gb="43" ;; # 43G 31G 9.6G 77% / # rapids-pre-init-2-1-debian11 - "2.1-rocky8" ) disk_size_gb="45" ;; # 45G 33G 13G 72% / # rapids-pre-init-2-1-rocky8 - "2.1-ubuntu20" ) disk_size_gb="43" ;; # 42G 31G 12G 74% / # rapids-pre-init-2-1-ubuntu20 - "2.2-debian12" ) disk_size_gb="45" ;; # 45G 33G 9.5G 78% / # rapids-pre-init-2-2-debian12 - "2.2-rocky9" ) disk_size_gb="46" ;; # 46G 34G 13G 73% / # rapids-pre-init-2-2-rocky9 - "2.2-ubuntu22" ) disk_size_gb="45" ;; # 44G 33G 11G 76% / # rapids-pre-init-2-2-ubuntu22 + "2.0-debian10" ) disk_size_gb="41" ;; # 40.12G 37.51G 0.86G 98% / # rapids-pre-init-2-0-debian10 + "2.0-rocky8" ) disk_size_gb="39" ;; # 38.79G 38.04G 0.76G 99% / # rapids-pre-init-2-0-rocky8 + "2.0-ubuntu18" ) disk_size_gb="39" ;; # 37.62G 36.69G 0.91G 98% / # rapids-pre-init-2-0-ubuntu18 + "2.1-debian11" ) disk_size_gb="43" ;; # 42.09G 39.77G 0.49G 99% / # rapids-pre-init-2-1-debian11 + "2.1-rocky8" ) disk_size_gb="44" ;; # 43.79G 41.11G 2.68G 94% / # rapids-pre-init-2-1-rocky8 + "2.1-ubuntu20" ) disk_size_gb="41" ;; # 39.55G 39.39G 0.15G 100% / # rapids-pre-init-2-1-ubuntu20 + "2.2-debian12" ) disk_size_gb="45" ;; # 44.06G 41.73G 0.41G 100% / # rapids-pre-init-2-2-debian12 + "2.2-rocky9" ) disk_size_gb="45" ;; # 44.79G 42.29G 
2.51G 95% / # rapids-pre-init-2-2-rocky9 + "2.2-ubuntu22" ) disk_size_gb="44" ;; # 42.46G 41.97G 0.48G 99% / # rapids-pre-init-2-2-ubuntu22 esac #disk_size_gb="50" diff --git a/examples/secure-boot/rapids.sh b/examples/secure-boot/rapids.sh index a42335f..0c285ed 100644 --- a/examples/secure-boot/rapids.sh +++ b/examples/secure-boot/rapids.sh @@ -73,17 +73,6 @@ function get_metadata_attribute() ( function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } -function execute_with_retries() { - local -r cmd="$*" - for i in {0..9} ; do - if eval "$cmd"; then - return 0 ; fi - sleep 5 - done - echo "Cmd '${cmd}' failed." - return 1 -} - function configure_dask_yarn() { readonly DASK_YARN_CONFIG_DIR=/etc/dask/ readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml @@ -470,8 +459,7 @@ function install_dask_rapids() { ( set +e local is_installed="0" for installer in "${mamba}" "${conda}" ; do - test -d "${DASK_CONDA_ENV}" || \ - time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ + time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ ${CONDA_PACKAGES[*]} \ "${python_spec}" \ @@ -480,8 +468,10 @@ function install_dask_rapids() { if [[ "$retval" == "0" ]] ; then is_installed="1" break + else + test -d "${DASK_CONDA_ENV}" && ( "${conda}" remove -n 'dask-rapids' --all || rm -rf "${DASK_CONDA_ENV}" ) + "${conda}" config --set channel_priority flexible fi - "${conda}" config --set channel_priority flexible done if [[ "${is_installed}" == "0" ]]; then echo "failed to install dask" @@ -534,8 +524,8 @@ function main() { fi } -function exit_handler() ( - set +e +function exit_handler() { + set +ex echo "Exit handler invoked" # Free conda cache @@ -544,16 +534,30 @@ function exit_handler() ( # Clear pip cache pip cache purge || echo "unable to purge pip cache" - # remove the tmpfs conda pkgs_dirs - if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi - - # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do - if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then - rm -rf ${shmdir}/* - umount -f ${shmdir} - fi - done + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # Stop hadoop services + systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + + # remove the tmpfs conda pkgs_dirs + /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config" + + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do + if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then + rm -rf ${shmdir}/* + sync + sleep 3s + execute_with_retries umount -f ${shmdir} + fi + done + + umount -f /tmp + systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi # Clean up OS package cache ; re-hold systemd package if is_debuntu ; then @@ -563,36 +567,64 @@ function exit_handler() ( dnf clean all fi - # print disk usage statistics - if is_debuntu ; then - # Rocky doesn't have sort -h and fails when the argument is passed - du --max-depth 3 -hx / | sort -h | tail -10 + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + elif is_debian ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 fi # Process disk usage logs from installation period - rm -f "${tmpdir}/keep-running-df" - sleep 6s + rm -f /run/keep-running-df + sync + sleep 5.01s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): -#Filesystem Size Used Avail Use% Mounted on -#/dev/vda2 6.8G 2.5G 4.0G 39% / - df -h / | tee -a "${tmpdir}/disk-usage.log" - perl -e '$max=( sort +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e '@siz=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } )[-1]; -print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log" + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" echo "exit_handler has completed" # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero ; sync ; rm -f /zero + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero fi return 0 -) +} -function prepare_to_install(){ +function prepare_to_install() { readonly DEFAULT_CUDA_VERSION="12.4" CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) readonly CUDA_VERSION @@ -600,13 +632,6 @@ function prepare_to_install(){ readonly ROLE=$(get_metadata_attribute dataproc-role) readonly MASTER=$(get_metadata_attribute dataproc-master) - # RAPIDS config - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') - readonly RAPIDS_RUNTIME - - readonly DEFAULT_DASK_RAPIDS_VERSION="24.08" - readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) - # Dask config DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')" readonly DASK_RUNTIME @@ -621,9 +646,16 @@ function prepare_to_install(){ readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0" enable_worker_service="0" + # RAPIDS config + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') + readonly RAPIDS_RUNTIME + + readonly DEFAULT_DASK_RAPIDS_VERSION="24.08" + readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk - if [[ ${free_mem} -ge 5250000 ]]; then + if [[ ${free_mem} -ge 10500000 ]]; then tmpdir=/mnt/shm mkdir -p 
/mnt/shm mount -t tmpfs tmpfs /mnt/shm @@ -647,16 +679,19 @@ function prepare_to_install(){ install_log="${tmpdir}/install.log" trap exit_handler EXIT + # Clean conda cache + /opt/conda/miniconda3/bin/conda clean -a + # Monitor disk usage in a screen session if is_debuntu ; then apt-get install -y -qq screen else dnf -y -q install screen fi - df -h / | tee "${tmpdir}/disk-usage.log" - touch "${tmpdir}/keep-running-df" + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" screen -d -m -US keep-running-df \ - bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done" + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" } prepare_to_install From f6d5d2c6dacf4f9e88f45ad4312933cd04a8fbb4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 31 Oct 2024 14:21:05 -0700 Subject: [PATCH 06/21] tmpfs umount will clear contents ; pause before tmpfs mount for dnf lock --- examples/secure-boot/rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/secure-boot/rapids.sh b/examples/secure-boot/rapids.sh index 0c285ed..09c46f3 100644 --- a/examples/secure-boot/rapids.sh +++ b/examples/secure-boot/rapids.sh @@ -548,7 +548,6 @@ function exit_handler() { # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then - rm -rf ${shmdir}/* sync sleep 3s execute_with_retries umount -f ${shmdir} @@ -671,6 +670,7 @@ function prepare_to_install() { if is_debuntu ; then mount -t tmpfs tmpfs /var/cache/apt/archives else + while [[ -f /var/cache/dnf/metadata_lock.pid ]] ; do sleep 1s ; done mount -t tmpfs tmpfs /var/cache/dnf fi else From 6202098a304767b55d26605caac19da8360df928 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 1 Nov 2024 00:42:38 -0700 Subject: [PATCH 07/21] work in progress to attach local conda mirror during rapids install --- examples/secure-boot/rapids.sh | 50 +++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/examples/secure-boot/rapids.sh b/examples/secure-boot/rapids.sh index 09c46f3..d4682b8 100644 --- a/examples/secure-boot/rapids.sh +++ b/examples/secure-boot/rapids.sh @@ -529,7 +529,17 @@ function exit_handler() { echo "Exit handler invoked" # Free conda cache - /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1 + "${CONDA}" clean -a > /dev/null 2>&1 + + "${CONDA}" config --remove-key custom_channels + if grep -q "${conda_mirror_mountpoint}" /proc/mounts ; then + umount "${conda_mirror_mountpoint}" + gcloud compute instances detach-disk "$(hostname -s)" \ + --disk "${CONDA_DISK_FQN}" \ + --zone "${ZONE}" \ + --disk-scope regional + fi + # Clear pip cache pip cache purge || echo "unable to purge pip cache" @@ -540,7 +550,7 @@ function exit_handler() { systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' # remove the tmpfs conda pkgs_dirs - /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config" + "${CONDA}" config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config" # remove the tmpfs pip cache-dir pip config unset global.cache-dir || echo "unable to unset global pip cache" @@ -652,6 +662,14 @@ function prepare_to_install() { readonly DEFAULT_DASK_RAPIDS_VERSION="24.08" readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) + readonly PROJECT_ID="$(gcloud config get project)" + zone="$(/usr/share/google/get_metadata_value zone)" + export ZONE="$(echo $zone | sed -e 's:.*/::')" + export REGION="$(echo ${ZONE} | perl -pe 's/^(.+)-[^-]+$/$1/')" + export CONDA_MIRROR_DISK_NAME="conda-mirror-${REGION}" + export CONDA_DISK_FQN="projects/${PROJECT_ID}/regions/${REGION}/disks/${CONDA_MIRROR_DISK_NAME}" + + export CONDA=/opt/conda/miniconda3/bin/conda free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk if [[ ${free_mem} -ge 10500000 ]]; then @@ -660,7 +678,7 @@ function prepare_to_install() { mount -t tmpfs tmpfs /mnt/shm # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs /mnt/shm + "${CONDA}" config --add pkgs_dirs /mnt/shm mount -t tmpfs tmpfs /mnt/shm # Download pip packages to tmpfs @@ -679,8 +697,32 @@ function prepare_to_install() { install_log="${tmpdir}/install.log" trap exit_handler EXIT + conda_mirror_mountpoint=/srv/conda-mirror + ( set +e + # If the service account can describe the disk, attempt to attach and mount it + gcloud compute disks describe "${CONDA_MIRROR_DISK_NAME}" --region us-west4 > /tmp/mirror-disk.json + if [[ "$?" == "0" ]] ; then + gcloud compute instances attach-disk "$(hostname -s)" \ + --disk "${CONDA_DISK_FQN}" \ + --device-name "${CONDA_MIRROR_DISK_NAME}" \ + --disk-scope=regional \ + --mode=ro \ + --zone="${REGION}" + + mkdir -p "${conda_mirror_mountpoint}" + mount "/dev/disk/by-id/google-${CONDA_MIRROR_DISK_NAME}" "${conda_mirror_mountpoint}" + for channel in conda-forge rapidsai nvidia ; do + "${CONDA}" config --set custom_channels.${channel} "file://${conda_mirror_mountpoint}" + done + else + for channel in conda-forge rapidsai nvidia ; do + "${CONDA}" config --set custom_channels.${channel} "http://10.42.79.42/" + done + fi + ) + # Clean conda cache - /opt/conda/miniconda3/bin/conda clean -a + "${CONDA}" clean -a # Monitor disk usage in a screen session if is_debuntu ; then From a47eae23c2bc6e84a9c804a652f7a66cec74ee42 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 1 Nov 2024 00:43:26 -0700 Subject: [PATCH 08/21] gave the line generation perl script its own file --- examples/secure-boot/build-current-images.sh | 24 +---------------- examples/secure-boot/genline.pl | 27 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 23 deletions(-) create mode 100644 examples/secure-boot/genline.pl diff --git a/examples/secure-boot/build-current-images.sh b/examples/secure-boot/build-current-images.sh index 858f7c5..f139982 100644 --- a/examples/secure-boot/build-current-images.sh +++ b/examples/secure-boot/build-current-images.sh @@ -101,31 +101,9 @@ screen -US "${session_name}" -c examples/secure-boot/pre-init.screenrc # tail -n 3 /tmp/custom-image-*/logs/startup-script.log # tail -n 3 /tmp/custom-image-${PURPOSE}-2-*/logs/workflow.log function find_disk_usage() { - test -f /tmp/genline.pl || cat > /tmp/genline.pl<<'EOF' -#!/usr/bin/perl -w -use POSIX qw(ceil); -use strict; - -my $fn = $ARGV[0]; -my( $config ) = ( $fn =~ /custom-image-(.*-(debian|rocky|ubuntu)\d+)-\d+/ ); - -my @raw_lines = ; -my( $l ) = grep { m: /dev/.*/\s*$: } @raw_lines; -my( $stats ) = ( $l =~ m:\s*/dev/\S+\s+(.*?)\s*$: ); -$stats =~ s:(\d{4,}):sprintf(q{%-6s}, sprintf(q{%.2fG},($1/1024)/1024)):eg; - -my( $dp_version ) = ($config =~ /-pre-init-(.+)/); -$dp_version =~ s/-/./; - -my($max) = map { / maximum-disk-used: (\d+)/ } @raw_lines; -my($gbmax) = ceil((($max / 1024) / 1024) * 1.03); -$gbmax = 30 if $gbmax < 30; -my $i_dp_version = sprintf(q{%-15s}, qq{"$dp_version"}); -print( qq{ $i_dp_version) disk_size_gb="$gbmax" ;; # $stats # $config}, $/ ); -EOF for workflow_log in $(grep -l "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do startup_log=$(echo "${workflow_log}" | sed -e 's/workflow.log/startup-script.log/') - grep -A5 'Filesystem.*Avail' "${startup_log}" | perl /tmp/genline.pl "${workflow_log}" + grep -A5 'Filesystem.*Avail' "${startup_log}" | perl genline.pl "${workflow_log}" done } diff --git a/examples/secure-boot/genline.pl b/examples/secure-boot/genline.pl new file mode 100644 index 0000000..dcf7c80 --- /dev/null +++ b/examples/secure-boot/genline.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w +use strict; +use POSIX qw(ceil); + +my $fn = $ARGV[0]; +my( $config, $purpose, $dp_version, $timestamp ) = + ( $fn =~ + qr{custom-image- + ( + ([^-]+)- + (\d+-\d+-(debian|rocky|ubuntu)\d+) + )- + (\d{4}(?:-\d{2}){4}) + }x + ); +$dp_version =~ s/-/./; + +my @raw_lines = ; +my( $l ) = grep { m: /dev/.*/\s*$: } @raw_lines; +my( $stats ) = ( $l =~ m:\s*/dev/\S+\s+(.*?)\s*$: ); +$stats =~ s:(\d{4,}):sprintf(q{%-6s}, sprintf(q{%.2fG},($1/1024)/1024)):eg; + +my($max) = map { / maximum-disk-used: (\d+)/ } @raw_lines; +my($gbmax) = ceil((($max / 1024) / 1024) * 1.03); +$gbmax = 30 if $gbmax < 30; +my $i_dp_version = sprintf(q{%-15s}, qq{"$dp_version"}); +print( qq{ $i_dp_version) disk_size_gb="$gbmax" ;; # $stats # $purpose}, $/ ); From 0135f395964c6ff192df9051b0ec3e7ca23b15cc Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Mon, 4 Nov 2024 16:51:11 -0800 Subject: [PATCH 09/21] added support for mirror via locally attached disk or via http mirror --- examples/secure-boot/rapids.sh | 51 ++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/examples/secure-boot/rapids.sh b/examples/secure-boot/rapids.sh index d4682b8..05e2807 100644 --- a/examples/secure-boot/rapids.sh +++ b/examples/secure-boot/rapids.sh @@ -412,7 +412,7 @@ function install_dask_rapids() { if is_cuda12 ; then local python_spec="python>=3.11" local cuda_spec="cuda-version>=12,<13" - local dask_spec="dask>=2024.7" + local dask_spec="dask>=2023.11" local numba_spec="numba" elif is_cuda11 ; then local python_spec="python>=3.9" @@ -560,7 +560,7 @@ function exit_handler() { if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then sync sleep 3s - execute_with_retries umount -f ${shmdir} + umount -f ${shmdir} fi done @@ -659,15 +659,21 @@ function prepare_to_install() { RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') readonly RAPIDS_RUNTIME - readonly DEFAULT_DASK_RAPIDS_VERSION="24.08" + readonly DEFAULT_DASK_RAPIDS_VERSION="23.11" readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) readonly PROJECT_ID="$(gcloud config get project)" zone="$(/usr/share/google/get_metadata_value zone)" export ZONE="$(echo $zone | sed -e 's:.*/::')" export REGION="$(echo ${ZONE} | perl -pe 's/^(.+)-[^-]+$/$1/')" - export CONDA_MIRROR_DISK_NAME="conda-mirror-${REGION}" + + # use a regional mirror instead of fetching from cloudflare CDN + conda_mirror_disk='conda-mirror' + export CONDA_MIRROR_DISK="$(get_metadata_attribute 'conda-mirror-disk' ${conda_mirror_disk})" + export CONDA_MIRROR_DISK_NAME="$(gcloud compute disks list | awk "/${CONDA_MIRROR_DISK}-${REGION}-/ {print \$1}" | sort | tail -1)" export CONDA_DISK_FQN="projects/${PROJECT_ID}/regions/${REGION}/disks/${CONDA_MIRROR_DISK_NAME}" + conda_mirror_host='10.42.79.42' + export CONDA_MIRROR_HOST="$(get_metadata_attribute 'conda-mirror-host' ${conda_mirror_host})" export CONDA=/opt/conda/miniconda3/bin/conda free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" @@ -698,28 +704,31 @@ function prepare_to_install() { trap exit_handler EXIT conda_mirror_mountpoint=/srv/conda-mirror + if [[ -n "${CONDA_MIRROR_DISK_NAME}" ]]; then ( set +e # If the service account can describe the disk, attempt to attach and mount it gcloud compute disks describe "${CONDA_MIRROR_DISK_NAME}" --region us-west4 > /tmp/mirror-disk.json if [[ "$?" == "0" ]] ; then - gcloud compute instances attach-disk "$(hostname -s)" \ - --disk "${CONDA_DISK_FQN}" \ - --device-name "${CONDA_MIRROR_DISK_NAME}" \ - --disk-scope=regional \ - --mode=ro \ - --zone="${REGION}" - - mkdir -p "${conda_mirror_mountpoint}" - mount "/dev/disk/by-id/google-${CONDA_MIRROR_DISK_NAME}" "${conda_mirror_mountpoint}" - for channel in conda-forge rapidsai nvidia ; do - "${CONDA}" config --set custom_channels.${channel} "file://${conda_mirror_mountpoint}" - done - else - for channel in conda-forge rapidsai nvidia ; do - "${CONDA}" config --set custom_channels.${channel} "http://10.42.79.42/" + for channel in rapidsai nvidia ; do + "${CONDA}" config --set custom_channels.${channel} "file://${conda_mirror_mountpoint}/" done - fi - ) + if ! 
grep -q "${CONDA_MIRROR_DISK_NAME}" /proc/mounts ; then + gcloud compute instances attach-disk "$(hostname -s)" \ + --disk "${CONDA_DISK_FQN}" \ + --device-name "${CONDA_MIRROR_DISK_NAME}" \ + --disk-scope "regional" \ + --zone "${ZONE}" \ + --mode=ro + + mkdir -p "${conda_mirror_mountpoint}" + mount -o ro "/dev/disk/by-id/google-${CONDA_MIRROR_DISK_NAME}" "${conda_mirror_mountpoint}" + fi + fi ; ) + elif nc -vz "${CONDA_MIRROR_HOST}" 80 > /dev/null 2>&1 ; then + for channel in rapidsai nvidia ; do + "${CONDA}" config --set "custom_channels.${channel}" "http://${CONDA_MIRROR_HOST}/" + done + fi # Clean conda cache "${CONDA}" clean -a From 13b4eac3b604ff25d0bfdef0d81ea07bac090b3f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 6 Nov 2024 15:09:50 -0800 Subject: [PATCH 10/21] installs via the thin proxy. I will take a snapshot and try installing from disk. --- examples/secure-boot/rapids.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/secure-boot/rapids.sh b/examples/secure-boot/rapids.sh index 05e2807..621cf14 100644 --- a/examples/secure-boot/rapids.sh +++ b/examples/secure-boot/rapids.sh @@ -412,7 +412,7 @@ function install_dask_rapids() { if is_cuda12 ; then local python_spec="python>=3.11" local cuda_spec="cuda-version>=12,<13" - local dask_spec="dask>=2023.11" + local dask_spec="dask" local numba_spec="numba" elif is_cuda11 ; then local python_spec="python>=3.9" @@ -547,6 +547,7 @@ function exit_handler() { # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then # Stop hadoop services + echo "cleaning up tmpfs mounts" systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' # remove the tmpfs conda pkgs_dirs @@ -556,7 +557,7 @@ function exit_handler() { pip config unset global.cache-dir || echo "unable to unset global pip cache" # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then sync sleep 3s @@ -564,7 +565,7 @@ function exit_handler() { fi done - umount -f /tmp + echo "restarting services" systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi @@ -709,9 +710,11 @@ function prepare_to_install() { # If the service account can describe the disk, attempt to attach and mount it gcloud compute disks describe "${CONDA_MIRROR_DISK_NAME}" --region us-west4 > /tmp/mirror-disk.json if [[ "$?" == "0" ]] ; then - for channel in rapidsai nvidia ; do + for channel in 'conda-forge' 'rapidsai' 'nvidia' 'dask' ; do "${CONDA}" config --set custom_channels.${channel} "file://${conda_mirror_mountpoint}/" done + #"${CONDA}" config --set "custom_channels.conda-forge" "http://${CONDA_MIRROR_HOST}/" + if ! grep -q "${CONDA_MIRROR_DISK_NAME}" /proc/mounts ; then gcloud compute instances attach-disk "$(hostname -s)" \ --disk "${CONDA_DISK_FQN}" \ @@ -725,7 +728,7 @@ function prepare_to_install() { fi fi ; ) elif nc -vz "${CONDA_MIRROR_HOST}" 80 > /dev/null 2>&1 ; then - for channel in rapidsai nvidia ; do + for channel in 'conda-forge' 'rapidsai' 'nvidia' 'dask' ; do "${CONDA}" config --set "custom_channels.${channel}" "http://${CONDA_MIRROR_HOST}/" done fi From 42def705384629060726b8452eb57eb879ba594a Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Wed, 6 Nov 2024 22:37:41 -0800 Subject: [PATCH 11/21] picking up SUBNET from env.json --- examples/secure-boot/env.json.sample | 3 ++- examples/secure-boot/pre-init.sh | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/secure-boot/env.json.sample b/examples/secure-boot/env.json.sample index c8a89b5..2461fc0 100644 --- a/examples/secure-boot/env.json.sample +++ b/examples/secure-boot/env.json.sample @@ -3,5 +3,6 @@ "PURPOSE":"cuda-pre-init", "BUCKET":"my-bucket-name", "IMAGE_VERSION":"2.2-debian12", - "ZONE":"us-west4-a" + "ZONE":"us-west4-a", + "SUBNET":"my-subnet" } diff --git a/examples/secure-boot/pre-init.sh b/examples/secure-boot/pre-init.sh index 698b891..dd17eea 100644 --- a/examples/secure-boot/pre-init.sh +++ b/examples/secure-boot/pre-init.sh @@ -25,6 +25,7 @@ export PROJECT_ID="$(jq -r .PROJECT_ID env.json)" export PURPOSE="$(jq -r .PURPOSE env.json)" export BUCKET="$(jq -r .BUCKET env.json)" export ZONE="$(jq -r .ZONE env.json)" +export SUBNET="$(jq -r .SUBNET env.json)" custom_image_zone="${ZONE}" disk_size_gb="30" # greater than or equal to 30 @@ -89,6 +90,7 @@ function generate() { --zone "${custom_image_zone}" \ --disk-size "${disk_size_gb}" \ --gcs-bucket "${BUCKET}" \ + --subnet "${SUBNET}" \ --shutdown-instance-timer-sec=30 \ --no-smoke-test \ ${extra_args} From c5a4109c6810aeafbba2f29b03754eb9dccbd462 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 6 Nov 2024 22:53:26 -0800 Subject: [PATCH 12/21] add temporary permissions for disk mounting --- examples/secure-boot/build-current-images.sh | 22 ++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/examples/secure-boot/build-current-images.sh b/examples/secure-boot/build-current-images.sh index f139982..e0d1ed2 100644 --- a/examples/secure-boot/build-current-images.sh +++ b/examples/secure-boot/build-current-images.sh @@ -49,6 +49,15 @@ function configure_service_account() { gcloud secrets add-iam-policy-binding "${public_secret_name}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.secretAccessor" > /dev/null 2>&1 + + gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ + --member="serviceAccount:${GSA}" \ + --role=roles/compute.instanceAdmin.v1 > /dev/null 2>&1 + + gcloud iam service-accounts add-iam-policy-binding "${GSA}" \ + --member="serviceAccount:${GSA}" \ + --role=roles/iam.serviceAccountUser > /dev/null 2>&1 + } function revoke_bindings() { @@ -66,6 +75,15 @@ function revoke_bindings() { gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.viewer" > /dev/null 2>&1 + + gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ + --member="serviceAccount:${GSA}" \ + --role=roles/compute.instanceAdmin.v1 > /dev/null 2>&1 + + gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \ + --member="serviceAccount:${GSA}" \ + --role=roles/iam.serviceAccountUser > /dev/null 2>&1 + } export PROJECT_ID="$(jq -r .PROJECT_ID env.json)" @@ -85,7 +103,7 @@ configure_service_account session_name="build-current-images" readonly timestamp="$(date +%F-%H-%M)" -#readonly timestamp="2024-10-31-05-55" +#readonly timestamp="2024-11-05-22-55" export timestamp export tmpdir=/tmp/${timestamp}; @@ -103,7 +121,7 @@ screen -US "${session_name}" -c examples/secure-boot/pre-init.screenrc function find_disk_usage() { for workflow_log in $(grep -l "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do startup_log=$(echo "${workflow_log}" | sed -e 
's/workflow.log/startup-script.log/') - grep -A5 'Filesystem.*Avail' "${startup_log}" | perl genline.pl "${workflow_log}" + grep -A5 'Filesystem.*Avail' "${startup_log}" | perl examples/secure-boot/genline.pl "${workflow_log}" done } From c248c908a3ba18237bbb306a71d72ed55d4e5452 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 14 Nov 2024 15:34:30 -0800 Subject: [PATCH 13/21] current, working revisions --- examples/secure-boot/install_gpu_driver.sh | 9 +-- examples/secure-boot/rapids.sh | 70 +++++++++++----------- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 7add73b..6033c2a 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -618,7 +618,7 @@ function add_repo_cuda() { if is_debuntu ; then local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" -echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ + echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ | sudo tee "${sources_list_path}" curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ -o "${kr_path}" @@ -1276,16 +1276,15 @@ function exit_handler() { pip config unset global.cache-dir || echo "unable to unset global pip cache" # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then rm -rf ${shmdir}/* sync sleep 3s - execute_with_retries umount -f ${shmdir} + umount -f ${shmdir} fi done - umount -f /tmp systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi @@ -1362,6 +1361,7 @@ function prepare_to_install(){ tmpdir=/tmp/ local free_mem trap exit_handler EXIT + export CONDA=/opt/conda/miniconda3/bin/conda free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk if [[ ${free_mem} -ge 10500000 ]]; then @@ -1396,6 +1396,7 @@ function prepare_to_install(){ clean_up_sources_lists apt-get update -qq apt-get -y clean + sleep 5s apt-get -y -qq autoremove if is_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi diff --git a/examples/secure-boot/rapids.sh b/examples/secure-boot/rapids.sh index 621cf14..7ced36d 100644 --- a/examples/secure-boot/rapids.sh +++ b/examples/secure-boot/rapids.sh @@ -443,9 +443,6 @@ function install_dask_rapids() { "${cuda_spec}" "${rapids_spec}" "${dask_spec}" - "dask-bigquery" - "dask-ml" - "dask-sql" "cudf" "${numba_spec}" ) @@ -471,10 +468,12 @@ function install_dask_rapids() { else test -d "${DASK_CONDA_ENV}" && ( "${conda}" remove -n 'dask-rapids' --all || rm -rf "${DASK_CONDA_ENV}" ) "${conda}" config --set channel_priority flexible + "${CONDA}" clean -a > /dev/null 2>&1 fi done if [[ "${is_installed}" == "0" ]]; then echo "failed to install dask" + df -h return 1 fi ) @@ -525,22 +524,22 @@ function main() { } function exit_handler() { - set +ex + set +e + set -x echo "Exit handler invoked" # Free conda cache "${CONDA}" clean -a > /dev/null 2>&1 "${CONDA}" config --remove-key custom_channels - if grep -q "${conda_mirror_mountpoint}" /proc/mounts ; then - umount "${conda_mirror_mountpoint}" + if grep -q "${rapids_mirror_mountpoint}" /proc/mounts ; then + umount "${rapids_mirror_mountpoint}" gcloud compute instances detach-disk "$(hostname -s)" \ - --disk "${CONDA_DISK_FQN}" \ + --device-name "${RAPIDS_MIRROR_DISK_NAME}" \ --zone "${ZONE}" \ --disk-scope regional fi - # Clear pip cache pip cache purge || echo "unable to purge pip cache" @@ -610,8 +609,8 @@ function exit_handler() { # Log file contains logs like the following (minus the preceeding #): #Filesystem 1K-blocks Used Available Use% Mounted on #/dev/vda2 7096908 2611344 4182932 39% / + set +x df / | tee -a "/run/disk-usage.log" - perl -e '@siz=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } ); @@ -620,12 +619,12 @@ print( " samples-taken: ", scalar @siz, $/, "maximum-disk-used: $max", $/, "minimum-disk-used: $min", $/, " increased-by: $inc", $/ )' < "/run/disk-usage.log" - + set -x echo "exit_handler has completed" # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero + eval "dd if=/dev/zero of=/zero" sync sleep 3s rm -f /zero @@ -668,23 +667,16 @@ function prepare_to_install() { export ZONE="$(echo $zone | sed -e 's:.*/::')" export REGION="$(echo ${ZONE} | perl -pe 's/^(.+)-[^-]+$/$1/')" - # use a regional mirror instead of fetching from cloudflare CDN - conda_mirror_disk='conda-mirror' - export CONDA_MIRROR_DISK="$(get_metadata_attribute 'conda-mirror-disk' ${conda_mirror_disk})" - export CONDA_MIRROR_DISK_NAME="$(gcloud compute disks list | awk "/${CONDA_MIRROR_DISK}-${REGION}-/ {print \$1}" | sort | tail -1)" - export CONDA_DISK_FQN="projects/${PROJECT_ID}/regions/${REGION}/disks/${CONDA_MIRROR_DISK_NAME}" - conda_mirror_host='10.42.79.42' - export CONDA_MIRROR_HOST="$(get_metadata_attribute 'conda-mirror-host' ${conda_mirror_host})" - export CONDA=/opt/conda/miniconda3/bin/conda free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of 
churning the persistent disk - if [[ ${free_mem} -ge 10500000 ]]; then + if [[ ${free_mem} -ge 33300000 ]]; then tmpdir=/mnt/shm mkdir -p /mnt/shm mount -t tmpfs tmpfs /mnt/shm # Download conda packages to tmpfs + # Minimum of 15.5G of capacity required for rapids package install via conda "${CONDA}" config --add pkgs_dirs /mnt/shm mount -t tmpfs tmpfs /mnt/shm @@ -704,32 +696,38 @@ function prepare_to_install() { install_log="${tmpdir}/install.log" trap exit_handler EXIT - conda_mirror_mountpoint=/srv/conda-mirror - if [[ -n "${CONDA_MIRROR_DISK_NAME}" ]]; then + # use a regional mirror instead of fetching from cloudflare CDN + export RAPIDS_MIRROR_DISK="$(get_metadata_attribute 'rapids-mirror-disk' '')" + export RAPIDS_MIRROR_DISK_NAME="$(gcloud compute disks list | awk "/${RAPIDS_MIRROR_DISK}-${REGION}-/ {print \$1}" | sort | tail -1)" + export RAPIDS_DISK_FQN="projects/${PROJECT_ID}/regions/${REGION}/disks/${RAPIDS_MIRROR_DISK_NAME}" + export RAPIDS_MIRROR_HOST="$(get_metadata_attribute 'rapids-mirror-host' '')" + rapids_mirror_mountpoint=/srv/mirror + if [[ -n "${RAPIDS_MIRROR_DISK_NAME}" ]]; then ( set +e # If the service account can describe the disk, attempt to attach and mount it - gcloud compute disks describe "${CONDA_MIRROR_DISK_NAME}" --region us-west4 > /tmp/mirror-disk.json + gcloud compute disks describe "${RAPIDS_MIRROR_DISK_NAME}" --region us-west4 > /tmp/mirror-disk.txt if [[ "$?" == "0" ]] ; then - for channel in 'conda-forge' 'rapidsai' 'nvidia' 'dask' ; do - "${CONDA}" config --set custom_channels.${channel} "file://${conda_mirror_mountpoint}/" + for channel in 'rapidsai' 'nvidia' 'main' 'r' 'conda-forge' ; do + "${CONDA}" config --set \ + "custom_channels.${channel}" "file://${rapids_mirror_mountpoint}/conda.anaconda.org/" done - #"${CONDA}" config --set "custom_channels.conda-forge" "http://${CONDA_MIRROR_HOST}/" - if ! grep -q "${CONDA_MIRROR_DISK_NAME}" /proc/mounts ; then + if ! grep -q "${rapids_mirror_mountpoint}" /proc/mounts ; then gcloud compute instances attach-disk "$(hostname -s)" \ - --disk "${CONDA_DISK_FQN}" \ - --device-name "${CONDA_MIRROR_DISK_NAME}" \ + --disk "${RAPIDS_DISK_FQN}" \ + --device-name "${RAPIDS_MIRROR_DISK_NAME}" \ --disk-scope "regional" \ --zone "${ZONE}" \ --mode=ro - mkdir -p "${conda_mirror_mountpoint}" - mount -o ro "/dev/disk/by-id/google-${CONDA_MIRROR_DISK_NAME}" "${conda_mirror_mountpoint}" + mkdir -p "${rapids_mirror_mountpoint}" + mount -o ro "/dev/disk/by-id/google-${RAPIDS_MIRROR_DISK_NAME}" "${rapids_mirror_mountpoint}" fi fi ; ) - elif nc -vz "${CONDA_MIRROR_HOST}" 80 > /dev/null 2>&1 ; then - for channel in 'conda-forge' 'rapidsai' 'nvidia' 'dask' ; do - "${CONDA}" config --set "custom_channels.${channel}" "http://${CONDA_MIRROR_HOST}/" + elif [[ -n "${RAPIDS_MIRROR_HOST}" ]] && nc -vz "${RAPIDS_MIRROR_HOST}" 80 > /dev/null 2>&1 ; then + for channel in 'conda-forge' 'rapidsai' 'nvidia' 'r' 'main' ; do + "${CONDA}" config --set \ + "custom_channels.${channel}" "http://${RAPIDS_MIRROR_HOST}/conda.anaconda.org/" done fi @@ -738,9 +736,9 @@ function prepare_to_install() { # Monitor disk usage in a screen session if is_debuntu ; then - apt-get install -y -qq screen + apt-get install -y -qq screen else - dnf -y -q install screen + dnf -y -q install screen fi df / > "/run/disk-usage.log" touch "/run/keep-running-df" From a83696355c5801c46baea91688fabc78b8f44a5d Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Thu, 21 Nov 2024 17:18:16 -0800 Subject: [PATCH 14/21] snapshot --- custom_image_utils/shell_script_generator.py | 2 +- examples/secure-boot/build-current-images.sh | 4 +- examples/secure-boot/create-key-pair.sh | 1 - examples/secure-boot/genline.pl | 7 +- examples/secure-boot/install_gpu_driver.sh | 78 +++--- examples/secure-boot/pre-init.sh | 12 +- examples/secure-boot/rapids.sh | 265 ++++++++++++------- 7 files changed, 229 insertions(+), 140 deletions(-) diff --git a/custom_image_utils/shell_script_generator.py b/custom_image_utils/shell_script_generator.py index 448d76a..82d44b7 100644 --- a/custom_image_utils/shell_script_generator.py +++ b/custom_image_utils/shell_script_generator.py @@ -237,7 +237,7 @@ --port=1 2>&1 \ | grep 'startup-script' \ | sed -e 's/ {image_name}-install.*startup-script://g' \ - | dd bs=1 of={log_dir}/startup-script.log \ + | dd status=none bs=1 of={log_dir}/startup-script.log \ || true echo 'Checking customization script result.' date diff --git a/examples/secure-boot/build-current-images.sh b/examples/secure-boot/build-current-images.sh index e0d1ed2..f5834c7 100644 --- a/examples/secure-boot/build-current-images.sh +++ b/examples/secure-boot/build-current-images.sh @@ -113,7 +113,7 @@ gcloud compute instances list --zones "${ZONE}" --format json > ${tmpdir}/instan gcloud compute images list --format json > ${tmpdir}/images.json # Run generation scripts simultaneously for each dataproc image version -screen -US "${session_name}" -c examples/secure-boot/pre-init.screenrc +screen -L -US "${session_name}" -c examples/secure-boot/pre-init.screenrc # tail -n 3 /tmp/custom-image-*/logs/workflow.log # tail -n 3 /tmp/custom-image-*/logs/startup-script.log @@ -121,7 +121,7 @@ screen -US "${session_name}" -c examples/secure-boot/pre-init.screenrc function find_disk_usage() { for workflow_log in $(grep -l "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do startup_log=$(echo "${workflow_log}" | sed -e 's/workflow.log/startup-script.log/') - grep -A5 'Filesystem.*Avail' "${startup_log}" | perl examples/secure-boot/genline.pl "${workflow_log}" + grep -A5 'Filesystem.*1K-blocks' "${startup_log}" | perl examples/secure-boot/genline.pl "${workflow_log}" done } diff --git a/examples/secure-boot/create-key-pair.sh b/examples/secure-boot/create-key-pair.sh index 3039042..8f2a42a 100644 --- a/examples/secure-boot/create-key-pair.sh +++ b/examples/secure-boot/create-key-pair.sh @@ -74,7 +74,6 @@ function create_key () { fi if [[ -f "${PRIVATE_KEY}" ]]; then - echo "key already exists. Skipping generation." 
>&2 modulus_md5sum="$(cat tls/modulus-md5sum.txt)" return fi diff --git a/examples/secure-boot/genline.pl b/examples/secure-boot/genline.pl index dcf7c80..81ab752 100644 --- a/examples/secure-boot/genline.pl +++ b/examples/secure-boot/genline.pl @@ -2,12 +2,13 @@ use strict; use POSIX qw(ceil); +# /tmp/custom-image-cuda-pre-init-2-0-debian10-2024-11-14-20-00-20241114-200043/logs/workflow.log my $fn = $ARGV[0]; my( $config, $purpose, $dp_version, $timestamp ) = ( $fn =~ - qr{custom-image- + m{custom-image- ( - ([^-]+)- + (.+)- (\d+-\d+-(debian|rocky|ubuntu)\d+) )- (\d{4}(?:-\d{2}){4}) @@ -18,7 +19,7 @@ my @raw_lines = ; my( $l ) = grep { m: /dev/.*/\s*$: } @raw_lines; my( $stats ) = ( $l =~ m:\s*/dev/\S+\s+(.*?)\s*$: ); -$stats =~ s:(\d{4,}):sprintf(q{%-6s}, sprintf(q{%.2fG},($1/1024)/1024)):eg; +$stats =~ s:(\d{4,}):sprintf(q{%7s}, sprintf(q{%.2fG},($1/1024)/1024)):eg; my($max) = map { / maximum-disk-used: (\d+)/ } @raw_lines; my($gbmax) = ceil((($max / 1024) / 1024) * 1.03); diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 6033c2a..5e961cf 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -16,6 +16,18 @@ set -euxo pipefail +# if customer needs proxy: + +function set_proxy(){ + export METADATA_HTTP_PROXY="" + export http_proxy="${METADATA_HTTP_PROXY}" + export https_proxy="${METADATA_HTTP_PROXY}" + export HTTP_PROXY="${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="${METADATA_HTTP_PROXY}" + export no_proxy=metadata.google.internal + export NO_PROXY=metadata.google.internal +} + function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) @@ -745,6 +757,11 @@ function install_cuda_toolkit() { fi } +function remove_drivers_aliases() { + local conffile="/etc/modprobe.d/nvidia-aliases.conf" + rm "${conffile}" +} + function install_drivers_aliases() { if is_rocky ; then return ; fi if ! 
(is_debian12 || is_debian11) ; then return ; fi @@ -768,9 +785,14 @@ function load_kernel_module() { rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" done - install_drivers_aliases +# install_drivers_aliases depmod -a modprobe nvidia + for suffix in uvm modeset drm; do + modprobe "nvidia-${suffix}" + done + # TODO: if peermem is available, also modprobe nvidia-peermem +# remove_drivers_aliases } # Install NVIDIA GPU driver provided by NVIDIA @@ -789,33 +811,25 @@ function install_nvidia_gpu_driver() { libglvnd0 \ libcuda1 #clear_dkms_key - load_kernel_module elif is_ubuntu18 || is_debian10 || (is_debian12 && is_cuda11) ; then install_nvidia_userspace_runfile build_driver_from_github - load_kernel_module - install_cuda_runfile elif is_debuntu ; then install_cuda_keyring_pkg build_driver_from_packages - load_kernel_module - install_cuda_toolkit elif is_rocky ; then add_repo_cuda build_driver_from_packages - load_kernel_module - install_cuda_toolkit - else echo "Unsupported OS: '${OS_NAME}'" exit 1 @@ -1059,7 +1073,8 @@ function main() { local kernel_devel_pkg_out="$(eval "${dnf_cmd} 2>&1")" if [[ "${kernel_devel_pkg_out}" =~ 'Unable to find a match: kernel-devel-' ]] ; then # this kernel-devel may have been migrated to the vault - local vault="https://download.rockylinux.org/vault/rocky/$(os_version)" + local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" + local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" execute_with_retries dnf -y -q --setopt=localpkg_gpgcheck=1 install \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ @@ -1099,6 +1114,8 @@ function main() { if [[ $IS_MIG_ENABLED -eq 0 ]]; then install_nvidia_gpu_driver + load_kernel_module + if [[ -n ${CUDNN_VERSION} ]]; then install_nvidia_nccl install_nvidia_cudnn @@ -1116,7 +1133,7 @@ function main() { rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" done - MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e H100 -e A100 || echo -n "")" + MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")" if test -n "$(nvsmi -L)" ; then # cache the result of the gpu query ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') @@ -1258,20 +1275,11 @@ function exit_handler() { # Purge private key material until next grant clear_dkms_key - # Free conda cache - /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1 - # Clear pip cache pip cache purge || echo "unable to purge pip cache" # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # Stop hadoop services - systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - - # remove the tmpfs conda pkgs_dirs - /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config" - # remove the tmpfs pip cache-dir pip config unset global.cache-dir || echo "unable to unset global pip cache" @@ -1285,13 +1293,15 @@ function exit_handler() { fi done - systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi - # Clean up OS package cache ; re-hold systemd package if is_debuntu ; then + # Clean up OS package cache apt-get -y -qq clean apt-get -y -qq autoremove + # re-hold systemd package if is_debian12 ; then apt-mark hold systemd libsystemd0 ; fi else @@ -1305,13 +1315,13 @@ function exit_handler() { /usr/lib \ /opt/nvidia/* \ /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 + /opt/conda/miniconda3 | sort -h elif is_debian ; then du -hs \ /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ /usr/lib \ /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 + /opt/conda/miniconda3 | sort -h else du -hs \ /var/lib/docker \ @@ -1361,22 +1371,23 @@ function prepare_to_install(){ tmpdir=/tmp/ local free_mem trap exit_handler EXIT - export CONDA=/opt/conda/miniconda3/bin/conda free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk if [[ ${free_mem} -ge 10500000 ]]; then - # Services might use /tmp for temporary files - echo "debug: this may break things!" - systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + + # Services might use /tmp for temporary files - if we see errors, + # consider uncommenting the following command to stop them during + # install + + # systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' sudo mount -t tmpfs tmpfs /tmp - systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' tmpdir="/mnt/shm" mkdir -p "${tmpdir}" mount -t tmpfs tmpfs "${tmpdir}" - # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" # Download pip packages to tmpfs pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" @@ -1404,12 +1415,9 @@ function prepare_to_install(){ dnf clean all fi - # Clean conda cache - /opt/conda/miniconda3/bin/conda clean -a - # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e - time dd if=/dev/zero of=/zero status=progress ; sync ; sleep 3s ; rm -f /zero + time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) fi configure_dkms_certs diff --git a/examples/secure-boot/pre-init.sh b/examples/secure-boot/pre-init.sh index dd17eea..78ec517 100644 --- a/examples/secure-boot/pre-init.sh +++ b/examples/secure-boot/pre-init.sh @@ -27,6 +27,8 @@ export BUCKET="$(jq -r .BUCKET env.json)" export ZONE="$(jq -r .ZONE env.json)" export SUBNET="$(jq -r .SUBNET env.json)" +export region="$(echo "${ZONE}" | perl -pe 's/-[a-z]+$//')" + custom_image_zone="${ZONE}" disk_size_gb="30" # greater than or equal to 30 @@ -42,6 +44,8 @@ metadata="dask-runtime=standalone" metadata="${metadata},rapids-runtime=DASK" metadata="${metadata},cuda-version=12.4" metadata="${metadata},creating-image=c9h" +metadata="${metadata},rapids-mirror-disk=rapids-mirror-${region}" +metadata="${metadata},rapids-mirror-host=10.42.79.42" # If no OS family specified, default to debian if [[ "${IMAGE_VERSION}" != *-* ]] ; then @@ -81,7 +85,7 @@ function generate() { fi set -xe python generate_custom_image.py \ - --machine-type "n1-standard-8" \ + --machine-type "n1-standard-16" \ --accelerator "type=nvidia-tesla-t4" \ --image-name "${image_name}" \ --customization-script "${customization_script}" \ @@ 
-125,11 +129,11 @@ time generate_from_dataproc_version "${dataproc_version}" case "${dataproc_version}" in "2.0-debian10" ) disk_size_gb="41" ;; # 40.12G 37.51G 0.86G 98% / # rapids-pre-init-2-0-debian10 "2.0-rocky8" ) disk_size_gb="39" ;; # 38.79G 38.04G 0.76G 99% / # rapids-pre-init-2-0-rocky8 - "2.0-ubuntu18" ) disk_size_gb="39" ;; # 37.62G 36.69G 0.91G 98% / # rapids-pre-init-2-0-ubuntu18 - "2.1-debian11" ) disk_size_gb="43" ;; # 42.09G 39.77G 0.49G 99% / # rapids-pre-init-2-1-debian11 + "2.0-ubuntu18" ) disk_size_gb="40" ;; # 37.62G 36.69G 0.91G 98% / # rapids-pre-init-2-0-ubuntu18 + "2.1-debian11" ) disk_size_gb="44" ;; # 42.09G 39.77G 0.49G 99% / # rapids-pre-init-2-1-debian11 "2.1-rocky8" ) disk_size_gb="44" ;; # 43.79G 41.11G 2.68G 94% / # rapids-pre-init-2-1-rocky8 "2.1-ubuntu20" ) disk_size_gb="41" ;; # 39.55G 39.39G 0.15G 100% / # rapids-pre-init-2-1-ubuntu20 - "2.2-debian12" ) disk_size_gb="45" ;; # 44.06G 41.73G 0.41G 100% / # rapids-pre-init-2-2-debian12 + "2.2-debian12" ) disk_size_gb="46" ;; # 44.06G 41.73G 0.41G 100% / # rapids-pre-init-2-2-debian12 "2.2-rocky9" ) disk_size_gb="45" ;; # 44.79G 42.29G 2.51G 95% / # rapids-pre-init-2-2-rocky9 "2.2-ubuntu22" ) disk_size_gb="44" ;; # 42.46G 41.97G 0.48G 99% / # rapids-pre-init-2-2-ubuntu22 esac diff --git a/examples/secure-boot/rapids.sh b/examples/secure-boot/rapids.sh index 7ced36d..308003f 100644 --- a/examples/secure-boot/rapids.sh +++ b/examples/secure-boot/rapids.sh @@ -88,7 +88,7 @@ function configure_dask_yarn() { # https://yarn.dask.org/en/latest/configuration.html#default-configuration yarn: - environment: python://${DASK_CONDA_ENV}/bin/python + environment: python://${RAPIDS_CONDA_ENV}/bin/python worker: count: 2 @@ -110,7 +110,7 @@ function install_systemd_dask_worker() { LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" nvidia-smi -c DEFAULT echo "dask-cuda-worker starting, logging to \${LOGFILE}" -${DASK_CONDA_ENV}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 +${RAPIDS_CONDA_ENV}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_WORKER_LAUNCHER}" @@ -162,7 +162,7 @@ function install_systemd_dask_scheduler() { #!/bin/bash LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log" echo "dask scheduler starting, logging to \${LOGFILE}" -${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1 +${RAPIDS_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_SCHEDULER_LAUNCHER}" @@ -409,16 +409,24 @@ EOF } function install_dask_rapids() { +#To enable CUDA support, UCX requires the CUDA Runtime library (libcudart). 
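Once the environment is created, the presence of the runtime library can be confirmed with a quick file check. The sketch below assumes the dask-rapids environment path defined later in this script; the lib/ location of libcudart is an assumption rather than something stated in the patch.

# Confirm that the CUDA runtime shipped with the conda environment is present.
env_lib="/opt/conda/miniconda3/envs/dask-rapids/lib"
if compgen -G "${env_lib}/libcudart.so*" > /dev/null ; then
  echo "CUDA runtime (libcudart) found under ${env_lib}"
else
  echo "libcudart not found under ${env_lib}" >&2
fi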
+#The library can be installed with the appropriate command below: + +#* For CUDA 11, run: conda install cudatoolkit cuda-version=11 +#* For CUDA 12, run: conda install cuda-cudart cuda-version=12 + if is_cuda12 ; then local python_spec="python>=3.11" local cuda_spec="cuda-version>=12,<13" local dask_spec="dask" local numba_spec="numba" + local cudart_spec="cuda-cudart" elif is_cuda11 ; then local python_spec="python>=3.9" local cuda_spec="cuda-version>=11,<12.0a0" local dask_spec="dask" local numba_spec="numba" + local cudart_spec="cudatoolkit" fi rapids_spec="rapids>=${RAPIDS_VERSION}" @@ -441,6 +449,7 @@ function install_dask_rapids() { CONDA_PACKAGES+=( "${cuda_spec}" + "${cudart_spec}" "${rapids_spec}" "${dask_spec}" "cudf" @@ -448,32 +457,54 @@ function install_dask_rapids() { ) # Install cuda, rapids, dask - mamba="/opt/conda/miniconda3/bin/mamba" - conda="/opt/conda/miniconda3/bin/conda" + mamba="${CONDA_ROOT}/bin/mamba" + conda="${CONDA_ROOT}/bin/conda" + + readonly DASK_CONDA_ENV="${CONDA_ROOT}/envs/${RAPIDS_ENV_NAME}" + if test -d "${DASK_CONDA_ENV}" ; then + "${conda}" remove -n "${RAPIDS_ENV_NAME}" --all > /dev/null 2>&1 || rm -rf "${DASK_CONDA_ENV}" + fi + # Unpin conda version and upgrade +# perl -ni -e 'print unless /^conda /' "${CONDA_ROOT}/conda-meta/pinned" +# "${mamba}" install conda mamba libmamba libmambapy conda-libmamba-solver - "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]" + # This error occurs when we set channel_alias +# util_files_to_patch="$(find "${CONDA_ROOT}" -name utils.py | grep mamba/utils.py)" +# perl -pi -e 's[raise ValueError\("missing key][print("missing key]' ${util_files_to_patch} +# File "/home/zhyue/mambaforge/lib/python3.9/site-packages/mamba/utils.py", line 393, in compute_final_precs +# raise ValueError("missing key {} in channels: {}".format(key, lookup_dict)) + + CONDA_EXE="${CONDA_ROOT}/bin/conda" + CONDA_PYTHON_EXE="${CONDA_ROOT}/bin/python" + PATH="${CONDA_ROOT}/bin/condabin:${CONDA_ROOT}/bin:${PATH}" ( set +e local is_installed="0" for installer in "${mamba}" "${conda}" ; do - time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ + echo "${installer}" "create" -q -m -n "${RAPIDS_ENV_NAME}" -y --no-channel-priority \ + -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + ${CONDA_PACKAGES[*]} \ + "${python_spec}" +# read placeholder + # for debugging, consider -vvv + time "${installer}" "create" -q -m -n "${RAPIDS_ENV_NAME}" -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ ${CONDA_PACKAGES[*]} \ "${python_spec}" \ - > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + && retval=$? || retval=$? sync if [[ "$retval" == "0" ]] ; then is_installed="1" break else - test -d "${DASK_CONDA_ENV}" && ( "${conda}" remove -n 'dask-rapids' --all || rm -rf "${DASK_CONDA_ENV}" ) + test -d "${RAPIDS_CONDA_ENV}" && ( "${conda}" remove -n "${RAPIDS_ENV_NAME}" --all > /dev/null 2>&1 || rm -rf "${RAPIDS_CONDA_ENV}" ) "${conda}" config --set channel_priority flexible - "${CONDA}" clean -a > /dev/null 2>&1 + df -h + clean_conda_cache fi done if [[ "${is_installed}" == "0" ]]; then echo "failed to install dask" - df -h return 1 fi ) @@ -523,49 +554,37 @@ function main() { fi } +function clean_conda_cache() { + if ! 
grep -q "${rapids_mirror_mountpoint}" /proc/mounts ; then + "${CONDA}" clean -a + fi +} + function exit_handler() { set +e set -x echo "Exit handler invoked" - # Free conda cache - "${CONDA}" clean -a > /dev/null 2>&1 + unmount_rapids_mirror - "${CONDA}" config --remove-key custom_channels - if grep -q "${rapids_mirror_mountpoint}" /proc/mounts ; then - umount "${rapids_mirror_mountpoint}" - gcloud compute instances detach-disk "$(hostname -s)" \ - --device-name "${RAPIDS_MIRROR_DISK_NAME}" \ - --zone "${ZONE}" \ - --disk-scope regional - fi - - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" + mv ~/.condarc.default ~/.condarc + mv /root/.config/pip/pip.conf.default /root/.config/pip/pip.conf # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # Stop hadoop services echo "cleaning up tmpfs mounts" - systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - - # remove the tmpfs conda pkgs_dirs - "${CONDA}" config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config" - - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then sync - sleep 3s umount -f ${shmdir} fi done - - echo "restarting services" - systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + else + clean_conda_cache + # Clear pip cache from non-tmpfs + pip cache purge || echo "unable to purge pip cache" fi # Clean up OS package cache ; re-hold systemd package @@ -583,13 +602,13 @@ function exit_handler() { /usr/lib \ /opt/nvidia/* \ /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 + ${CONDA_ROOT} elif is_debian ; then du -hs \ /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ /usr/lib \ /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 + ${CONDA_ROOT} else du -hs \ /var/lib/docker \ @@ -598,7 +617,7 @@ function exit_handler() { /usr/lib \ /opt/nvidia/* \ /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 + ${CONDA_ROOT} fi # Process disk usage logs from installation period @@ -633,6 +652,71 @@ print( " samples-taken: ", scalar @siz, $/, return 0 } +function unmount_rapids_mirror() { + if ! grep -q "${rapids_mirror_mountpoint}" /proc/mounts ; then return ; fi + + umount "${rapids_mirror_mountpoint}" + umount "${rapids_mirror_mountpoint}_ro" + gcloud compute instances detach-disk "$(hostname -s)" \ + --device-name "${RAPIDS_MIRROR_DISK_NAME}" \ + --zone "${ZONE}" \ + --disk-scope regional +} + +function mount_rapids_mirror() { + # use a regional mirror instead of fetching from cloudflare CDN + export RAPIDS_MIRROR_DISK_NAME="$(gcloud compute disks list | awk "/${RAPIDS_MIRROR_DISK}-/ {print \$1}" | sort | tail -1)" + export RAPIDS_DISK_FQN="projects/${PROJECT_ID}/regions/${REGION}/disks/${RAPIDS_MIRROR_DISK_NAME}" + + if [[ -z "${RAPIDS_MIRROR_DISK_NAME}" ]]; then return ; fi + + # If the service account can describe the disk, attempt to attach and mount it + eval gcloud compute disks describe "${RAPIDS_MIRROR_DISK_NAME}" --region "${REGION}" > /tmp/mirror-disk.txt + if [[ "$?" != "0" ]] ; then return ; fi + + if ! 
grep -q "${rapids_mirror_mountpoint}" /proc/mounts ; then + gcloud compute instances attach-disk "$(hostname -s)" \ + --disk "${RAPIDS_DISK_FQN}" \ + --device-name "${RAPIDS_MIRROR_DISK_NAME}" \ + --disk-scope "regional" \ + --zone "${ZONE}" \ + --mode=ro + + mkdir -p "${rapids_mirror_mountpoint}" "${rapids_mirror_mountpoint}_ro" "${tmpdir}/overlay" "${tmpdir}/workdir" + mount -o ro "/dev/disk/by-id/google-${RAPIDS_MIRROR_DISK_NAME}" "${rapids_mirror_mountpoint}_ro" + mount -t overlay overlay -o lowerdir="${rapids_mirror_mountpoint}_ro",upperdir="${tmpdir}/overlay",workdir="${tmpdir}/workdir" "${rapids_mirror_mountpoint}" + fi + ${CONDA} config --add pkgs_dirs "${rapids_mirror_mountpoint}/conda_cache" +# echo "${CONDA}" config --set channel_alias "file://${rapids_mirror_mountpoint}/conda.anaconda.org" +# for channel in 'rapidsai' 'nvidia' 'pkgs/main' 'pkgs/r' 'conda-forge' ; do +# echo "${CONDA}" config --set \ +# "custom_channels.${channel}" "file://${rapids_mirror_mountpoint}/conda.anaconda.org/" +# done + # patch conda to install from mirror +# files_to_patch=$(find ${CONDA_ROOT}/ -name 'download.py' | grep conda/gateways/connection) +# perl -i -pe 's{if "://" not in self.url:}{if "file://" in self.url or "://" not in self.url:}' \ +# ${files_to_patch} +# perl -i -pe 's{self.url = url$}{self.url = url.replace("file://","")}' \ +# ${files_to_patch} + +# time for d in dask main nvidia r rapidsai conda-forge ; do +# find "${rapids_mirror_mountpoint}/conda.anaconda.org/${d}" -name '*.conda' -o -name '*.tar.bz2' -print0 | \ +# xargs -0 ln -sf -t "${pkgs_dir}" +# done + + # Point to the cache built with the mirror +# for channel in 'rapidsai' 'nvidia' 'main' 'r' 'conda-forge' ; do +# for plat in noarch linux-64 ; do +# echo ${CONDA} config --add pkgs_dirs "/srv/mirror/conda.anaconda.org/${channel}/${plat}" +# done +# done + +# for channel in pkgs/main pkgs/r ; do +# echo ${CONDA} config --add default_channels "file://${rapids_mirror_mountpoint}/conda.anaconda.org/${channel}" +# done + +} + function prepare_to_install() { readonly DEFAULT_CUDA_VERSION="12.4" CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION}) @@ -641,13 +725,17 @@ function prepare_to_install() { readonly ROLE=$(get_metadata_attribute dataproc-role) readonly MASTER=$(get_metadata_attribute dataproc-master) + export CONDA_ROOT=/opt/conda/miniconda3 + export CONDA="${CONDA_ROOT}/bin/conda" + # Dask config DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')" readonly DASK_RUNTIME readonly DASK_SERVICE=dask-cluster readonly DASK_WORKER_SERVICE=dask-worker readonly DASK_SCHEDULER_SERVICE=dask-scheduler - readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/dask-rapids" + readonly RAPIDS_ENV_NAME="dask-rapids" + readonly RAPIDS_CONDA_ENV="${CONDA_ROOT}/envs/${RAPIDS_ENV_NAME}" # Knox config readonly KNOX_HOME=/usr/lib/knox @@ -667,78 +755,67 @@ function prepare_to_install() { export ZONE="$(echo $zone | sed -e 's:.*/::')" export REGION="$(echo ${ZONE} | perl -pe 's/^(.+)-[^-]+$/$1/')" - export CONDA=/opt/conda/miniconda3/bin/conda + export RAPIDS_MIRROR_DISK="$(get_metadata_attribute 'rapids-mirror-disk' '')" + export RAPIDS_MIRROR_HOST="$(get_metadata_attribute 'rapids-mirror-host' '')" + + rapids_mirror_mountpoint=/srv/mirror + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + # With a local conda mirror mounted, use reduced ram disk size + if [[ -n "${RAPIDS_MIRROR_DISK}" ]] ; then + min_mem=18500000 + pkgs_dir= + else + min_mem=33300000 + fi # Write to a ramdisk instead of 
churning the persistent disk - if [[ ${free_mem} -ge 33300000 ]]; then + if [[ ${free_mem} -ge ${min_mem} ]]; then tmpdir=/mnt/shm - mkdir -p /mnt/shm - mount -t tmpfs tmpfs /mnt/shm - - # Download conda packages to tmpfs - # Minimum of 15.5G of capacity required for rapids package install via conda - "${CONDA}" config --add pkgs_dirs /mnt/shm - mount -t tmpfs tmpfs /mnt/shm + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" - # Download pip packages to tmpfs - pip config set global.cache-dir /mnt/shm || echo "unable to set global.cache-dir" - - # Download OS packages to tmpfs - if is_debuntu ; then - mount -t tmpfs tmpfs /var/cache/apt/archives - else - while [[ -f /var/cache/dnf/metadata_lock.pid ]] ; do sleep 1s ; done - mount -t tmpfs tmpfs /var/cache/dnf - fi + # Minimum of 11G of capacity required for rapids package install via conda + # + 5G without rapids mirror mounted + mount -t tmpfs tmpfs "${tmpdir}" else tmpdir=/tmp fi + install_log="${tmpdir}/install.log" trap exit_handler EXIT - # use a regional mirror instead of fetching from cloudflare CDN - export RAPIDS_MIRROR_DISK="$(get_metadata_attribute 'rapids-mirror-disk' '')" - export RAPIDS_MIRROR_DISK_NAME="$(gcloud compute disks list | awk "/${RAPIDS_MIRROR_DISK}-${REGION}-/ {print \$1}" | sort | tail -1)" - export RAPIDS_DISK_FQN="projects/${PROJECT_ID}/regions/${REGION}/disks/${RAPIDS_MIRROR_DISK_NAME}" - export RAPIDS_MIRROR_HOST="$(get_metadata_attribute 'rapids-mirror-host' '')" - rapids_mirror_mountpoint=/srv/mirror - if [[ -n "${RAPIDS_MIRROR_DISK_NAME}" ]]; then - ( set +e - # If the service account can describe the disk, attempt to attach and mount it - gcloud compute disks describe "${RAPIDS_MIRROR_DISK_NAME}" --region us-west4 > /tmp/mirror-disk.txt - if [[ "$?" == "0" ]] ; then - for channel in 'rapidsai' 'nvidia' 'main' 'r' 'conda-forge' ; do - "${CONDA}" config --set \ - "custom_channels.${channel}" "file://${rapids_mirror_mountpoint}/conda.anaconda.org/" - done - - if ! 
grep -q "${rapids_mirror_mountpoint}" /proc/mounts ; then - gcloud compute instances attach-disk "$(hostname -s)" \ - --disk "${RAPIDS_DISK_FQN}" \ - --device-name "${RAPIDS_MIRROR_DISK_NAME}" \ - --disk-scope "regional" \ - --zone "${ZONE}" \ - --mode=ro - - mkdir -p "${rapids_mirror_mountpoint}" - mount -o ro "/dev/disk/by-id/google-${RAPIDS_MIRROR_DISK_NAME}" "${rapids_mirror_mountpoint}" - fi - fi ; ) - elif [[ -n "${RAPIDS_MIRROR_HOST}" ]] && nc -vz "${RAPIDS_MIRROR_HOST}" 80 > /dev/null 2>&1 ; then - for channel in 'conda-forge' 'rapidsai' 'nvidia' 'r' 'main' ; do - "${CONDA}" config --set \ + touch ~/.condarc + cp ~/.condarc ~/.condarc.default + + #"${CONDA}" config --set verbosity 3 + # Clean conda cache + clean_conda_cache + + mount_rapids_mirror + + if [[ -n "${RAPIDS_MIRROR_HOST}" ]] && nc -vz "${RAPIDS_MIRROR_HOST}" 80 > /dev/null 2>&1 ; then + for channel in 'conda-forge' 'rapidsai' 'nvidia' 'pkgs/r' 'pkgs/main' ; do + echo "${CONDA}" config --set \ "custom_channels.${channel}" "http://${RAPIDS_MIRROR_HOST}/conda.anaconda.org/" done fi - # Clean conda cache - "${CONDA}" clean -a + if grep -q "${rapids_mirror_mountpoint}" /proc/mounts ; then + # if we are using the mirror disk, install exclusively from its cache + extra_conda_args="--offline" + else + pkgs_dir="${tmpdir}/pkgs_dir" + mkdir -p "${pkgs_dir}" + "${CONDA}" config --add pkgs_dirs "${pkgs_dir}" + fi # Monitor disk usage in a screen session if is_debuntu ; then - apt-get install -y -qq screen + command -v screen || \ + apt-get install -y -qq screen else - dnf -y -q install screen + command -v screen || \ + dnf -y -q install screen fi df / > "/run/disk-usage.log" touch "/run/keep-running-df" From 713f1d96bb11590946c781b6c614f592c02b30ea Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 21 Nov 2024 17:44:09 -0800 Subject: [PATCH 15/21] removed some unnecessary log redirection, comments --- examples/secure-boot/install_gpu_driver.sh | 63 +++++++--------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 5e961cf..1d83944 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -16,18 +16,6 @@ set -euxo pipefail -# if customer needs proxy: - -function set_proxy(){ - export METADATA_HTTP_PROXY="" - export http_proxy="${METADATA_HTTP_PROXY}" - export https_proxy="${METADATA_HTTP_PROXY}" - export HTTP_PROXY="${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="${METADATA_HTTP_PROXY}" - export no_proxy=metadata.google.internal - export NO_PROXY=metadata.google.internal -} - function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) @@ -245,8 +233,8 @@ if ( compare_versions_lte "8.3.1.22" "${CUDNN_VERSION}" ); then fi CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" fi -if is_cuda12 ; then - # When cuda version is 12 +if ( compare_versions_lte "12.0" "${CUDA_VERSION}" ); then + # When cuda version is greater than 12.0 CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" fi @@ -757,42 +745,18 @@ function install_cuda_toolkit() { fi } -function remove_drivers_aliases() 
{ - local conffile="/etc/modprobe.d/nvidia-aliases.conf" - rm "${conffile}" -} - -function install_drivers_aliases() { - if is_rocky ; then return ; fi - if ! (is_debian12 || is_debian11) ; then return ; fi - if (is_debian12 && is_cuda11) && is_src_nvidia ; then return ; fi # don't install on debian 12 / cuda11 with drivers from nvidia - # Add a modprobe alias to prefer the open kernel modules - local conffile="/etc/modprobe.d/nvidia-aliases.conf" - echo -n "" > "${conffile}" - local prefix - if is_src_os ; then prefix="nvidia-current-open" - elif is_src_nvidia ; then prefix="nvidia-current" ; fi - local suffix - for suffix in uvm peermem modeset drm; do - echo "alias nvidia-${suffix} ${prefix}-${suffix}" >> "${conffile}" - done - echo "alias nvidia ${prefix}" >> "${conffile}" -} - function load_kernel_module() { # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" done -# install_drivers_aliases depmod -a modprobe nvidia for suffix in uvm modeset drm; do modprobe "nvidia-${suffix}" done # TODO: if peermem is available, also modprobe nvidia-peermem -# remove_drivers_aliases } # Install NVIDIA GPU driver provided by NVIDIA @@ -1062,12 +1026,10 @@ function main() { if is_debuntu ; then export DEBIAN_FRONTEND=noninteractive - execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" > /dev/null 2>&1 + execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" elif is_rocky ; then - execute_with_retries dnf -y -q update --exclude=systemd*,kernel* \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } - execute_with_retries dnf -y -q install pciutils gcc \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries dnf -y -q update --exclude=systemd*,kernel* + execute_with_retries dnf -y -q install pciutils gcc local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" local kernel_devel_pkg_out="$(eval "${dnf_cmd} 2>&1")" @@ -1080,8 +1042,7 @@ function main() { "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ - "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" \ - > "${install_log}" 2>&1 || { cat "${install_log}" ; exit -4 ; } + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" sync else execute_with_retries "${dnf_cmd}" @@ -1365,6 +1326,16 @@ print( " samples-taken: ", scalar @siz, $/, return 0 } +function set_proxy(){ + export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)" + export http_proxy="${METADATA_HTTP_PROXY}" + export https_proxy="${METADATA_HTTP_PROXY}" + export HTTP_PROXY="${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="${METADATA_HTTP_PROXY}" + export no_proxy=metadata.google.internal,169.254.169.254 + export NO_PROXY=metadata.google.internal,169.254.169.254 +} + function prepare_to_install(){ nvsmi_works="0" readonly bdcfg="/usr/local/bin/bdconfig" @@ -1403,6 +1374,8 @@ function prepare_to_install(){ fi install_log="${tmpdir}/install.log" + set_proxy + if is_debuntu ; then clean_up_sources_lists apt-get update -qq From 55b0d3811b753e8def987881764f4bce042376c0 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 22 Nov 2024 11:30:20 -0800 Subject: [PATCH 16/21] tested to work --- examples/secure-boot/install_gpu_driver.sh | 34 +++++++++++----------- examples/secure-boot/pre-init.sh | 6 ++-- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 1d83944..79d157b 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -38,6 +38,7 @@ function os_vercat() ( set +x else os_version ; fi ; ) function remove_old_backports { + if ! is_debuntu ; then return ; fi if is_debian12 ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this @@ -1016,20 +1017,11 @@ function nvsmi() { "${nvsmi}" $* } -function main() { - if ! is_debian && ! is_ubuntu && ! is_rocky ; then - echo "Unsupported OS: '$(os_name)'" - exit 1 - fi - - remove_old_backports - +function install_dependencies() { if is_debuntu ; then - export DEBIAN_FRONTEND=noninteractive - execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" + execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen elif is_rocky ; then - execute_with_retries dnf -y -q update --exclude=systemd*,kernel* - execute_with_retries dnf -y -q install pciutils gcc + execute_with_retries dnf -y -q install pciutils gcc screen local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" local kernel_devel_pkg_out="$(eval "${dnf_cmd} 2>&1")" @@ -1048,7 +1040,9 @@ function main() { execute_with_retries "${dnf_cmd}" fi fi +} +function main() { # This configuration should be run on all nodes # regardless if they have attached GPUs configure_yarn @@ -1340,6 +1334,15 @@ function prepare_to_install(){ nvsmi_works="0" readonly bdcfg="/usr/local/bin/bdconfig" tmpdir=/tmp/ + if ! is_debuntu && ! 
is_rocky ; then + echo "Unsupported OS: '$(os_name)'" + exit 1 + fi + + remove_old_backports + + export DEBIAN_FRONTEND=noninteractive + local free_mem trap exit_handler EXIT free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" @@ -1395,12 +1398,9 @@ function prepare_to_install(){ configure_dkms_certs + install_dependencies + # Monitor disk usage in a screen session - if is_debuntu ; then - execute_with_retries apt-get install -y -qq screen - else - execute_with_retries dnf -y -q install screen - fi df / > "/run/disk-usage.log" touch "/run/keep-running-df" screen -d -m -US keep-running-df \ diff --git a/examples/secure-boot/pre-init.sh b/examples/secure-boot/pre-init.sh index 78ec517..bbabe3f 100644 --- a/examples/secure-boot/pre-init.sh +++ b/examples/secure-boot/pre-init.sh @@ -128,14 +128,14 @@ time generate_from_dataproc_version "${dataproc_version}" # cuda image -> rapids case "${dataproc_version}" in "2.0-debian10" ) disk_size_gb="41" ;; # 40.12G 37.51G 0.86G 98% / # rapids-pre-init-2-0-debian10 - "2.0-rocky8" ) disk_size_gb="39" ;; # 38.79G 38.04G 0.76G 99% / # rapids-pre-init-2-0-rocky8 + "2.0-rocky8" ) disk_size_gb="41" ;; # 38.79G 38.04G 0.76G 99% / # rapids-pre-init-2-0-rocky8 "2.0-ubuntu18" ) disk_size_gb="40" ;; # 37.62G 36.69G 0.91G 98% / # rapids-pre-init-2-0-ubuntu18 "2.1-debian11" ) disk_size_gb="44" ;; # 42.09G 39.77G 0.49G 99% / # rapids-pre-init-2-1-debian11 "2.1-rocky8" ) disk_size_gb="44" ;; # 43.79G 41.11G 2.68G 94% / # rapids-pre-init-2-1-rocky8 - "2.1-ubuntu20" ) disk_size_gb="41" ;; # 39.55G 39.39G 0.15G 100% / # rapids-pre-init-2-1-ubuntu20 + "2.1-ubuntu20" ) disk_size_gb="45" ;; # 39.55G 39.39G 0.15G 100% / # rapids-pre-init-2-1-ubuntu20 "2.2-debian12" ) disk_size_gb="46" ;; # 44.06G 41.73G 0.41G 100% / # rapids-pre-init-2-2-debian12 "2.2-rocky9" ) disk_size_gb="45" ;; # 44.79G 42.29G 2.51G 95% / # rapids-pre-init-2-2-rocky9 - "2.2-ubuntu22" ) disk_size_gb="44" ;; # 42.46G 41.97G 0.48G 99% / # rapids-pre-init-2-2-ubuntu22 + "2.2-ubuntu22" ) disk_size_gb="46" ;; # 42.46G 41.97G 0.48G 99% / # rapids-pre-init-2-2-ubuntu22 esac #disk_size_gb="50" From f86ee4dc8d1f69d1aa003fc984a24feb631f4853 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Mon, 25 Nov 2024 14:03:29 -0800 Subject: [PATCH 17/21] [init-actions] merging from init action * now generating version comparison functions for use as utilities * rename remove_old_packports * specifying versions more clearly * added mount_ramdisk function to contain ramdisk related code --- examples/secure-boot/install_gpu_driver.sh | 195 +++++++++++---------- 1 file changed, 104 insertions(+), 91 deletions(-) diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 79d157b..b5f38a7 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -19,17 +19,32 @@ set -euxo pipefail function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) -function is_rocky() ( set +x ; [[ "$(os_id)" == 'rocky' ]] ; ) -function is_rocky8() ( set +x ; is_rocky && [[ "$(os_version)" == '8'* ]] ; ) -function is_rocky9() ( set +x ; is_rocky && [[ "$(os_version)" == '9'* ]] ; ) -function is_ubuntu() ( set +x ; [[ "$(os_id)" == 'ubuntu' ]] ; ) -function is_ubuntu18() ( set +x ; is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; ) -function is_ubuntu20() ( set +x ; is_ubuntu && [[ "$(os_version)" == '20.04'* ]] ; ) -function is_ubuntu22() ( set +x ; is_ubuntu && [[ "$(os_version)" == '22.04'* ]] ; ) -function is_debian() ( set +x ; [[ "$(os_id)" == 'debian' ]] ; ) -function is_debian10() ( set +x ; is_debian && [[ "$(os_version)" == '10'* ]] ; ) -function is_debian11() ( set +x ; is_debian && [[ "$(os_version)" == '11'* ]] ; ) -function is_debian12() ( set +x ; is_debian && [[ "$(os_version)" == '12'* ]] ; ) + +function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) +function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) +function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) +function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) + +readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" +) + +# dynamically define OS version test utility functions +if [[ "$(os_id)" == "rocky" ]]; +then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') +else _os_version="$(os_version)"; fi +for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" + done +done + function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) function os_vercat() ( set +x @@ -37,9 +52,8 @@ function os_vercat() ( set +x elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' else os_version ; fi ; ) -function remove_old_backports { - if ! is_debuntu ; then return ; fi - if is_debian12 ; then return ; fi +function repair_old_backports { + if ge_debian12 || ! 
is_debuntu ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this # problem, we will use archive.debian.org for the oldoldstable repo @@ -59,14 +73,6 @@ function remove_old_backports { done } -# Return true if the first argument is equal to or less than the second argument -function compare_versions_lte { [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; } - -# Return true if the first argument is less than the second argument -function compare_versions_lt() ( set +x - [ "$1" = "$2" ] && return 1 || compare_versions_lte $1 $2 -) - function print_metadata_value() { local readonly tmpfile=$(mktemp) http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ @@ -123,7 +129,7 @@ readonly ROLE # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html # https://developer.nvidia.com/cuda-downloads readonly -A DRIVER_FOR_CUDA=( - [11.8]="525.147.05" [12.4]="550.54.14" [12.6]="560.35.03" + [11.8]="560.35.03" [12.4]="560.35.06" [12.6]="560.35.06" ) # https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( @@ -140,32 +146,44 @@ readonly -A CUDA_SUBVER=( RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly DEFAULT_CUDA_VERSION='12.4' CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") +# CUDA 11 no longer supported on debian12 - 2024-11-22 +if ge_debian12 && version_le "${CUDA_VERSION%%.*}" "11" ; then + CUDA_VERSION="${DEFAULT_CUDA_VERSION}" +fi readonly CUDA_VERSION readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}" function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) +function le_cuda12() ( set +x ; version_le "${CUDA_VERSION}" "12" ; ) +function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION}" "12" ; ) + function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) +function le_cuda11() ( set +x ; version_le "${CUDA_VERSION}" "11" ; ) +function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION}" "11" ; ) + readonly DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") -if is_debian11 || is_ubuntu22 || is_ubuntu20 ; then DRIVER_VERSION="560.28.03" ; fi -if is_ubuntu20 && is_cuda11 ; then DRIVER_VERSION="535.183.06" ; fi +if is_debian11 || ge_ubuntu20 ; then DRIVER_VERSION="560.28.03" ; fi +if is_ubuntu20 && le_cuda11 ; then DRIVER_VERSION="535.183.06" ; fi readonly DRIVER_VERSION readonly DRIVER=${DRIVER_VERSION%%.*} -# Parameters for NVIDIA-provided CUDNN library +readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" +readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + +# Parameters for NVIDIA-provided cuDNN library readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) -if is_rocky \ - && (compare_versions_lte "${CUDNN_VERSION}" "8.0.5.39") ; then - CUDNN_VERSION="8.0.5.39" -elif (is_ubuntu20 || is_ubuntu22 || is_debian12) && is_cudnn8 ; then +# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} +if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then + CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" +elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then # cuDNN v8 is not distribution for 
ubuntu20+, debian12 - CUDNN_VERSION="9.1.0.70" - -elif (is_ubuntu18 || is_debian10 || is_debian11) && is_cudnn9 ; then + CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" +elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 CUDNN_VERSION="8.8.0.121" fi @@ -188,7 +206,7 @@ if is_ubuntu22 ; then nccl_shortname="ubuntu2004" shortname="$(os_id)$(os_vercat)" -elif is_rocky9 ; then +elif ge_rocky9 ; then # use packages from previous release until such time as nvidia # release rhel9 builds @@ -212,13 +230,12 @@ NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL} readonly NCCL_REPO_URL readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub -readonly -A DEFAULT_NVIDIA_CUDA_URLS=( - [11.8]="${NVIDIA_BASE_DL_URL}/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run" - [12.1]="${NVIDIA_BASE_DL_URL}/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run" - [12.4]="${NVIDIA_BASE_DL_URL}/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run" - [12.6]="${NVIDIA_BASE_DL_URL}/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run" -) -readonly DEFAULT_NVIDIA_CUDA_URL=${DEFAULT_NVIDIA_CUDA_URLS["${CUDA_VERSION}"]} +if ge_cuda12 ; then + readonly DEFAULT_NVIDIA_CUDA_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run" +else + readonly DEFAULT_NVIDIA_CUDA_URL="https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run" +fi + NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") readonly NVIDIA_CUDA_URL @@ -227,16 +244,19 @@ readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" -if ( compare_versions_lte "8.3.1.22" "${CUDNN_VERSION}" ); then +if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then + # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" - if ( compare_versions_lte "${CUDNN_VERSION}" "8.4.1.50" ); then + if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then + # When cuDNN version is greater than or equal to 8.4.1.50 use this format CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" fi + # Use legacy url format with one of the tarball name formats depending on version as above CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" fi -if ( compare_versions_lte "12.0" "${CUDA_VERSION}" ); then - # When cuda version is greater than 12.0 - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz" +if ( version_ge "${CUDA_VERSION}" "12.0" ); then + # Use modern url format When cuda version is greater than or equal to 12.0 + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" fi readonly CUDNN_TARBALL @@ -443,7 +463,7 @@ function install_nvidia_cudnn() { echo "Unsupported cudnn version: '${major_version}'" fi elif is_debuntu; then - if is_debian12 && is_src_os ; then + if ge_debian12 && is_src_os ; then apt-get 
-y install nvidia-cudnn else local CUDNN="${CUDNN_VERSION%.*}" @@ -575,7 +595,7 @@ function clear_dkms_key { } function add_contrib_component() { - if is_debian12 ; then + if ge_debian12 ; then # Include in sources file components on which nvidia-kernel-open-dkms depends local -r debian_sources="/etc/apt/sources.list.d/debian.sources" local components="main contrib" @@ -588,7 +608,7 @@ function add_contrib_component() { function add_nonfree_components() { if is_src_nvidia ; then return; fi - if is_debian12 ; then + if ge_debian12 ; then # Include in sources file components on which nvidia-open-kernel-dkms depends local -r debian_sources="/etc/apt/sources.list.d/debian.sources" local components="main contrib non-free non-free-firmware" @@ -673,7 +693,7 @@ function build_driver_from_github() { } function build_driver_from_packages() { - if is_ubuntu || is_debian ; then + if is_debuntu ; then if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else local pkglist=("nvidia-driver-${DRIVER}-open") ; fi @@ -729,7 +749,7 @@ function install_cuda_runfile() { function install_cuda_toolkit() { local cudatk_package=cuda-toolkit - if is_debian12 && is_src_os ; then + if ge_debian12 && is_src_os ; then cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1" elif [[ -n "${CUDA_VERSION}" ]]; then cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}" @@ -762,7 +782,7 @@ function load_kernel_module() { # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if is_debian12 && is_src_os ; then + if ge_debian12 && is_src_os ; then add_nonfree_components add_repo_nvidia_container_toolkit apt-get update -qq @@ -776,7 +796,7 @@ function install_nvidia_gpu_driver() { libglvnd0 \ libcuda1 #clear_dkms_key - elif is_ubuntu18 || is_debian10 || (is_debian12 && is_cuda11) ; then + elif le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ; then install_nvidia_userspace_runfile @@ -1240,10 +1260,7 @@ function exit_handler() { # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do - if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then - rm -rf ${shmdir}/* - sync - sleep 3s + if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! 
grep -q "^tmpfs ${shmdir}" /etc/fstab ; then umount -f ${shmdir} fi done @@ -1257,7 +1274,7 @@ function exit_handler() { apt-get -y -qq clean apt-get -y -qq autoremove # re-hold systemd package - if is_debian12 ; then + if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi else dnf clean all @@ -1330,6 +1347,32 @@ function set_proxy(){ export NO_PROXY=metadata.google.internal,169.254.169.254 } +function mount_ramdisk(){ + local free_mem + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + + # Write to a ramdisk instead of churning the persistent disk + + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" + + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + pip cache purge || echo "unable to purge pip cache" + + # Download pip packages to tmpfs + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs /var/cache/dnf + fi +} + function prepare_to_install(){ nvsmi_works="0" readonly bdcfg="/usr/local/bin/bdconfig" @@ -1339,42 +1382,12 @@ function prepare_to_install(){ exit 1 fi - remove_old_backports + repair_old_backports export DEBIAN_FRONTEND=noninteractive - local free_mem trap exit_handler EXIT - free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - # Write to a ramdisk instead of churning the persistent disk - if [[ ${free_mem} -ge 10500000 ]]; then - - # Services might use /tmp for temporary files - if we see errors, - # consider uncommenting the following command to stop them during - # install - - # systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - sudo mount -t tmpfs tmpfs /tmp - - tmpdir="/mnt/shm" - mkdir -p "${tmpdir}" - mount -t tmpfs tmpfs "${tmpdir}" - - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" - - # Download pip packages to tmpfs - pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" - - # Download OS packages to tmpfs - if is_debuntu ; then - mount -t tmpfs tmpfs /var/cache/apt/archives - else - mount -t tmpfs tmpfs /var/cache/dnf - fi - else - tmpdir=/tmp - fi + mount_ramdisk install_log="${tmpdir}/install.log" set_proxy @@ -1385,7 +1398,7 @@ function prepare_to_install(){ apt-get -y clean sleep 5s apt-get -y -qq autoremove - if is_debian12 ; then + if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi else dnf clean all From 5a61051004c6e17b72cd33a7adf50f4302cbdbb7 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Mon, 25 Nov 2024 15:15:46 -0800 Subject: [PATCH 18/21] proper version for 560 --- examples/secure-boot/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index b5f38a7..d8fe904 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -129,7 +129,7 @@ readonly ROLE # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html # https://developer.nvidia.com/cuda-downloads readonly -A DRIVER_FOR_CUDA=( - [11.8]="560.35.03" [12.4]="560.35.06" [12.6]="560.35.06" + [11.8]="560.35.03" [12.4]="560.35.03" [12.6]="560.35.03" ) # https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( From 3d8284fc0c026cce251b8e90feab481f5a76e140 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 25 Nov 2024 20:39:20 -0800 Subject: [PATCH 19/21] correct cuda version when using older debuntu --- examples/secure-boot/install_gpu_driver.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index d8fe904..935b416 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -199,7 +199,7 @@ readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USER # Short name for urls if is_ubuntu22 ; then - # at the time of writing 20240721 there is no ubuntu2204 in the index of repos at + # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at # https://developer.download.nvidia.com/compute/machine-learning/repos/ # use packages from previous release until such time as nvidia # release ubuntu2204 builds @@ -231,7 +231,11 @@ readonly NCCL_REPO_URL readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub if ge_cuda12 ; then - readonly DEFAULT_NVIDIA_CUDA_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run" + if (le_debian11 || le_ubuntu18) + then CUDA_DRIVER_VERSION="525.60.13" ; CUDA_URL_VERSION="12.0.0" + else CUDA_DRIVER_VERSION="${DRIVER_VERSION}" ; CUDA_URL_VERSION="${CUDA_FULL_VERSION}" ; fi + + readonly DEFAULT_NVIDIA_CUDA_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_URL_VERSION}/local_installers/cuda_${CUDA_URL_VERSION}_${CUDA_DRIVER_VERSION}_linux.run" else readonly DEFAULT_NVIDIA_CUDA_URL="https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run" fi From aec11e612618ebde303ad00f68c080bed305b800 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Wed, 27 Nov 2024 01:53:08 -0800 Subject: [PATCH 20/21] tested with 11.8 and 12.0 --- examples/secure-boot/build-current-images.sh | 14 +-- examples/secure-boot/install_gpu_driver.sh | 115 +++++++++++++------ examples/secure-boot/pre-init.sh | 2 +- 3 files changed, 89 insertions(+), 42 deletions(-) diff --git a/examples/secure-boot/build-current-images.sh b/examples/secure-boot/build-current-images.sh index f5834c7..f9147d2 100644 --- a/examples/secure-boot/build-current-images.sh +++ b/examples/secure-boot/build-current-images.sh @@ -103,11 +103,11 @@ configure_service_account session_name="build-current-images" readonly timestamp="$(date +%F-%H-%M)" -#readonly timestamp="2024-11-05-22-55" +#readonly timestamp="2024-11-27-06-47" export timestamp export tmpdir=/tmp/${timestamp}; -mkdir ${tmpdir} +mkdir -p ${tmpdir} export ZONE="$(jq -r .ZONE env.json)" gcloud compute instances list --zones "${ZONE}" --format json > ${tmpdir}/instances.json gcloud compute images list --format json > ${tmpdir}/images.json @@ -115,17 +115,13 @@ gcloud compute images list --format json > ${tmpdir}/images # Run generation scripts simultaneously for each dataproc image version screen -L -US "${session_name}" -c examples/secure-boot/pre-init.screenrc -# tail -n 3 /tmp/custom-image-*/logs/workflow.log -# tail -n 3 /tmp/custom-image-*/logs/startup-script.log -# tail -n 3 /tmp/custom-image-${PURPOSE}-2-*/logs/workflow.log function find_disk_usage() { - for workflow_log in $(grep -l "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do + grep 'Customization script' /tmp/custom-image-*/logs/workflow.log +# grep maximum-disk-used /tmp/custom-image-*/logs/startup-script.log + for workflow_log in $(grep -l "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do startup_log=$(echo "${workflow_log}" | sed -e 's/workflow.log/startup-script.log/') grep -A5 'Filesystem.*1K-blocks' "${startup_log}" | perl examples/secure-boot/genline.pl "${workflow_log}" done } -# sleep 8m ; grep 'Customization script' /tmp/custom-image-*/logs/workflow.log -# grep maximum-disk-used /tmp/custom-image-*/logs/startup-script.log - revoke_bindings diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index 935b416..e9b4a5e 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -128,43 +128,71 @@ readonly ROLE # CUDA version and Driver version # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html # https://developer.nvidia.com/cuda-downloads +# Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( - [11.8]="560.35.03" [12.4]="560.35.03" [12.6]="560.35.03" + ["11.8"]="560.35.03" + ["12.0"]="525.60.13" ["12.4"]="560.35.03" ["12.6"]="560.35.03" ) # https://developer.nvidia.com/cudnn-downloads +if is_debuntu ; then readonly -A CUDNN_FOR_CUDA=( - [11.8]="9.5.1.17" [12.4]="9.5.1.17" [12.6]="9.5.1.17" + ["11.8"]="9.5.1.17" + ["12.0"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.6"]="9.5.1.17" ) +elif is_rocky ; then +# rocky: +# 12.0: 8.8.1.3 +# 12.1: 8.9.3.28 +# 12.2: 8.9.7.29 +# 12.3: 9.0.0.312 +# 12.4: 9.1.1.17 +# 12.5: 9.2.1.18 +# 12.6: 9.5.1.17 +readonly -A CUDNN_FOR_CUDA=( + ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.4"]="9.1.1.17" ["12.6"]="9.5.1.17" +) +fi # https://developer.nvidia.com/nccl/nccl-download +# 12.2: 2.19.3, 12.5: 2.21.5 readonly -A NCCL_FOR_CUDA=( - [11.8]="2.15.5" [12.4]="2.23.4" [12.6]="2.23.4" + ["11.8"]="2.15.5" + ["12.0"]="2.16.5" ["12.4"]="2.23.4" ["12.6"]="2.23.4" ) readonly 
-A CUDA_SUBVER=( - [11.8]="11.8.0" [12.4]="12.4.1" [12.6]="12.6.2" + ["11.8"]="11.8.0" + ["12.0"]="12.0.0" ["12.4"]="12.4.1" ["12.6"]="12.6.2" ) RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly DEFAULT_CUDA_VERSION='12.4' CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") -# CUDA 11 no longer supported on debian12 - 2024-11-22 -if ge_debian12 && version_le "${CUDA_VERSION%%.*}" "11" ; then +if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then + # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27 CUDA_VERSION="${DEFAULT_CUDA_VERSION}" fi + +if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then + # Only CUDA 12.0 supported on older debuntu + CUDA_VERSION="12.0" +fi readonly CUDA_VERSION readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}" function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) -function le_cuda12() ( set +x ; version_le "${CUDA_VERSION}" "12" ; ) -function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION}" "12" ; ) +function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) +function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; ) function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) -function le_cuda11() ( set +x ; version_le "${CUDA_VERSION}" "11" ; ) -function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION}" "11" ; ) +function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) +function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) -readonly DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} +readonly DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}" DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") -if is_debian11 || ge_ubuntu20 ; then DRIVER_VERSION="560.28.03" ; fi -if is_ubuntu20 && le_cuda11 ; then DRIVER_VERSION="535.183.06" ; fi +if ( is_debian11 || is_ubuntu20 ) ; then DRIVER_VERSION="560.28.03" ; fi +if ( is_ubuntu20 && le_cuda11 ) ; then DRIVER_VERSION="535.183.06" ; fi +if ( is_rocky && le_cuda11 ) ; then DRIVER_VERSION="525.147.05" ; fi #553.22.1 +if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then DRIVER_VERSION="560.28.03" ; fi readonly DRIVER_VERSION readonly DRIVER=${DRIVER_VERSION%%.*} @@ -230,18 +258,33 @@ NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL} readonly NCCL_REPO_URL readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub -if ge_cuda12 ; then - if (le_debian11 || le_ubuntu18) - then CUDA_DRIVER_VERSION="525.60.13" ; CUDA_URL_VERSION="12.0.0" - else CUDA_DRIVER_VERSION="${DRIVER_VERSION}" ; CUDA_URL_VERSION="${CUDA_FULL_VERSION}" ; fi +function set_cuda_runfile_url() { + local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}" + local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}" + + if ge_cuda12 ; then + if ( le_debian11 || le_ubuntu18 ) ; then + RUNFILE_DRIVER_VERSION="525.60.13" + RUNFILE_CUDA_VERSION="12.0.0" + elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then + RUNFILE_DRIVER_VERSION="525.147.05" + RUNFILE_CUDA_VERSION="12.0.0" + fi + else + RUNFILE_DRIVER_VERSION="520.61.05" + RUNFILE_CUDA_VERSION="11.8.0" + fi - readonly DEFAULT_NVIDIA_CUDA_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_URL_VERSION}/local_installers/cuda_${CUDA_URL_VERSION}_${CUDA_DRIVER_VERSION}_linux.run" -else - readonly 
DEFAULT_NVIDIA_CUDA_URL="https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run" -fi + readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run" + CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}" + DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}" + readonly DEFAULT_NVIDIA_CUDA_URL + + NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") + readonly NVIDIA_CUDA_URL +} -NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") -readonly NVIDIA_CUDA_URL +set_cuda_runfile_url # Parameter for NVIDIA-provided Rocky Linux GPU driver readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" @@ -763,8 +806,9 @@ function install_cuda_toolkit() { if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} - sync + sync elif is_rocky ; then + # rocky9: cuda-11-[7,8], cuda-12-[1..6] execute_with_retries dnf -y -q install "${cudatk_package}" sync fi @@ -786,7 +830,7 @@ function load_kernel_module() { # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if ge_debian12 && is_src_os ; then + if ( ge_debian12 && is_src_os ) ; then add_nonfree_components add_repo_nvidia_container_toolkit apt-get update -qq @@ -800,7 +844,7 @@ function install_nvidia_gpu_driver() { libglvnd0 \ libcuda1 #clear_dkms_key - elif le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ; then + elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then install_nvidia_userspace_runfile @@ -1048,21 +1092,28 @@ function install_dependencies() { execute_with_retries dnf -y -q install pciutils gcc screen local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" - local kernel_devel_pkg_out="$(eval "${dnf_cmd} 2>&1")" - if [[ "${kernel_devel_pkg_out}" =~ 'Unable to find a match: kernel-devel-' ]] ; then + local install_log="${tmpdir}/install.log" + set +e + eval "${dnf_cmd}" > "${install_log}" 2>&1 + local retval="$?" 
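+  # errexit was disabled above so that a failed dnf install reaches this point;
+  # restore it now that the exit status has been captured in retval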
+ set -e + + if [[ "${retval}" == "0" ]] ; then return ; fi + + if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then # this kernel-devel may have been migrated to the vault local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" - execute_with_retries dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" - sync - else - execute_with_retries "${dnf_cmd}" + )" fi + + execute_with_retries "${dnf_cmd}" fi } diff --git a/examples/secure-boot/pre-init.sh b/examples/secure-boot/pre-init.sh index bbabe3f..57e8e62 100644 --- a/examples/secure-boot/pre-init.sh +++ b/examples/secure-boot/pre-init.sh @@ -143,7 +143,7 @@ esac # Install dask with rapids on base image PURPOSE="rapids-pre-init" customization_script="examples/secure-boot/rapids.sh" -time generate_from_base_purpose "cuda-pre-init" +#time generate_from_base_purpose "cuda-pre-init" ## Install dask without rapids on base image #PURPOSE="dask-pre-init" From 0b4669c18dbb14c529e74b3a9a9cb66e5d13bb1b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 27 Nov 2024 22:21:22 -0800 Subject: [PATCH 21/21] pyspark tests are passing --- examples/secure-boot/install_gpu_driver.sh | 45 ++++++++-------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/examples/secure-boot/install_gpu_driver.sh b/examples/secure-boot/install_gpu_driver.sh index e9b4a5e..25efb2a 100644 --- a/examples/secure-boot/install_gpu_driver.sh +++ b/examples/secure-boot/install_gpu_driver.sh @@ -187,12 +187,14 @@ function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) -readonly DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}" +DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}" +if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then + DEFAULT_DRIVER="560.28.03" ; fi +if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03" ; fi +if ( is_rocky && le_cuda11 ) ; then DEFAULT_DRIVER="525.147.05" ; fi +if ( is_ubuntu20 && le_cuda11 ) ; then DEFAULT_DRIVER="535.183.06" ; fi +if ( is_rocky9 && ge_cuda12 ) ; then DEFAULT_DRIVER="565.57.01" ; fi DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") -if ( is_debian11 || is_ubuntu20 ) ; then DRIVER_VERSION="560.28.03" ; fi -if ( is_ubuntu20 && le_cuda11 ) ; then DRIVER_VERSION="535.183.06" ; fi -if ( is_rocky && le_cuda11 ) ; then DRIVER_VERSION="525.147.05" ; fi #553.22.1 -if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then DRIVER_VERSION="560.28.03" ; fi readonly DRIVER_VERSION readonly DRIVER=${DRIVER_VERSION%%.*} @@ -992,7 +994,8 @@ function configure_gpu_script() { # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still # lists those because we only disable the specific GIs via CGROUPs. 
Here we just create it based off of: # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh - cat > ${spark_gpu_script_dir}/getGpusResources.sh <<'EOF' + local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh" + cat > "${gpus_resources_script}" <<'EOF' #!/usr/bin/env bash # @@ -1012,31 +1015,17 @@ function configure_gpu_script() { # limitations under the License. # -CACHE_FILE="/var/run/nvidia-gpu-index.txt" -if [[ -f "${CACHE_FILE}" ]]; then - cat "${CACHE_FILE}" - exit 0 -fi -NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" -if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then - NVIDIA_SMI_L="$(cat "${NV_SMI_L_CACHE_FILE}")" -else - NVIDIA_SMI_L="$(nvidia-smi -L | tee "${NV_SMI_L_CACHE_FILE}")" -fi +ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') -NUM_MIG_DEVICES=$(echo "${NVIDIA_SMI_L}" | grep -e MIG -e H100 -e A100 | wc -l || echo '0') - -if [[ "${NUM_MIG_DEVICES}" -gt "0" ]] ; then - MIG_INDEX=$(( $NUM_MIG_DEVICES - 1 )) - ADDRS="$(perl -e 'print(join(q{,},map{qq{"$_"}}(0..$ARGV[0])),$/)' "${MIG_INDEX}")" -else - ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') -fi - -echo {\"name\": \"gpu\", \"addresses\":[$ADDRS]} | tee "${CACHE_FILE}" +echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} EOF - chmod a+rwx -R ${spark_gpu_script_dir} + chmod a+rx "${gpus_resources_script}" + + local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" + if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then + echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}" + fi } function configure_gpu_isolation() {