Building and caching deepspeed
Paladinium committed Jan 19, 2025
1 parent 750dcf0 commit d5a0257
Showing 2 changed files with 258 additions and 7 deletions.
254 changes: 254 additions & 0 deletions .github/actions/setup-nvidia/actions.yml
@@ -0,0 +1,254 @@
name: Setup NVIDIA

description: Set up the NVIDIA driver and nvidia-docker runtime on a Linux runner

inputs:
driver-version:
description: which driver version to install
required: false
type: string
default: "550.54.15" # https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-54-15/index.html

runs:
using: composite
steps:
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
env:
DRIVER_VERSION: ${{ inputs.driver-version }}
with:
timeout_minutes: 10
max_attempts: 3
command: |
# Is it disgusting to have a full shell script here in this GitHub action? Sure
# But is it the best way to make it so that this action relies on nothing else? Absolutely
set -eou pipefail
DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
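# Note: DISTRIBUTION resolves to values like "ubuntu20.04" or "amzn2023", and DRIVER_FN is the
# name of the official NVIDIA .run installer that is downloaded from the ossci-linux S3 bucket below.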
install_nvidia_docker2_amzn2() {
(
set -x
# Needed for yum-config-manager
sudo yum install -y yum-utils
if [[ "${DISTRIBUTION}" == "amzn2023" ]] ; then
YUM_REPO_URL="https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo"
else
# Amazon Linux 2
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
fi
sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
sudo yum install -y nvidia-docker2 nvidia-container-toolkit-1.16.2
sudo systemctl restart docker
)
}
install_nvidia_docker2_ubuntu20() {
(
set -x
# Install the nvidia-docker2 package if it is not already installed
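# dpkg-query prints the package state (e.g. "installed") and exits non-zero when the
# package is unknown, so both its exit code and the printed status are checked below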
status="$(dpkg-query -W --showformat='${db:Status-Status}' nvidia-docker2 2>&1)"
if [ "$?" -ne 0 ] || [ "$status" != "installed" ]; then
sudo apt-get install -y nvidia-docker2 nvidia-container-toolkit-1.16.2
sudo systemctl restart docker
fi
)
}
pre_install_nvidia_driver_amzn2() {
(
# Purge any nvidia driver installed from RHEL repo
sudo yum remove -y nvidia-driver-latest-dkms
)
}
install_nvidia_driver_common() {
(
# Try to gather more information about the runner and its existing NVIDIA driver if any
echo "Before installing NVIDIA driver"
lspci
lsmod
modinfo nvidia || true
HAS_NVIDIA_DRIVER=0
# Check if NVIDIA driver has already been installed
if [ -x "$(command -v nvidia-smi)" ]; then
set +e
# The driver exists; check its version next. Also check only the first GPU if there is more than one
# so that the same driver version is not printed over multiple lines
INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
NVIDIA_SMI_STATUS=$?
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
else
HAS_NVIDIA_DRIVER=1
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
fi
set -e
fi
if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
# CAUTION: this may need to be updated in the future
if [ "${DISTRIBUTION}" != ubuntu20.04 ]; then
sudo yum groupinstall -y "Development Tools"
# ensure our kernel install is the same as our underlying kernel,
# groupinstall "Development Tools" has a habit of mismatching kernel headers
sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
sudo modprobe backlight
fi
sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
set +e
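# The .run installer is executed unattended: -s runs it silently and --no-drm skips the
# nvidia-drm kernel module, which a headless CI runner does not need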
sudo /bin/bash /tmp/nvidia_driver -s --no-drm
NVIDIA_INSTALLATION_STATUS=$?
RESET_GPU=0
if [ "$NVIDIA_INSTALLATION_STATUS" -ne 0 ]; then
sudo cat /var/log/nvidia-installer.log
# Failed to install the NVIDIA driver, so try to reset the GPU
RESET_GPU=1
elif [ -x "$(command -v nvidia-smi)" ]; then
# Check again whether nvidia-smi works even if the driver installation completed successfully
INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
NVIDIA_SMI_STATUS=$?
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
RESET_GPU=1
fi
fi
if [ "$RESET_GPU" -eq 1 ]; then
NVIDIA_DEVICES=$(lspci -D | grep -i NVIDIA | cut -d' ' -f1)
# The GPU can get stuck in a failure state if somehow the test crashes the GPU microcode. When this
# happens, we'll try to reset all NVIDIA devices https://github.com/pytorch/pytorch/issues/88388
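# lspci -D includes the PCI domain in each ID, so the IDs line up with the paths under
# /sys/bus/pci/devices/; writing 1 to a device's reset attribute asks the kernel to reset that function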
for PCI_ID in $NVIDIA_DEVICES; do
DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
echo "Reseting $PCI_ID (enabled state: $DEVICE_ENABLED)"
# This requires sudo permission of course
echo "1" | sudo tee /sys/bus/pci/devices/$PCI_ID/reset
sleep 1
done
fi
sudo rm -fv /tmp/nvidia_driver
set -e
fi
)
}
post_install_nvidia_driver_common() {
(
sudo modprobe nvidia || true
echo "After installing NVIDIA driver"
lspci
lsmod
modinfo nvidia || true
(
set +e
nvidia-smi
# NB: Annoyingly, the nvidia-smi command returns successfully with return code 0 even in
# the case where the driver has already crashed, as it can still get the driver version
# and some basic information like the bus ID. However, the rest of the information
# would be missing (ERR!), for example:
#
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 |
# |-------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
# | | | MIG M. |
# |===============================+======================+======================|
# | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! |
# |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default |
# | | | ERR! |
# +-------------------------------+----------------------+----------------------+
#
# +-----------------------------------------------------------------------------+
# | Processes: |
# | GPU GI CI PID Type Process name GPU Memory |
# | ID ID Usage |
# |=============================================================================|
# +-----------------------------------------------------------------------------+
#
# This should be reported as a failure instead, as it is guaranteed to fail when
# Docker tries to run with --gpus all
#
# So, the correct check here is to query one of the missing pieces of info, like the
# GPU name, so that the command fails accordingly
nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
NVIDIA_SMI_STATUS=$?
# Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
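# (Exit code 14 corresponds to nvidia-smi's "InfoROM is corrupted" warning, which is treated
# as non-fatal here.)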
if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
else
echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
exit ${NVIDIA_SMI_STATUS}
fi
set -e
)
)
}
install_nvidia_driver_amzn2() {
(
set -x
pre_install_nvidia_driver_amzn2
install_nvidia_driver_common
post_install_nvidia_driver_common
)
}
install_nvidia_driver_ubuntu20() {
(
set -x
install_nvidia_driver_common
post_install_nvidia_driver_common
)
}
echo "== Installing nvidia driver ${DRIVER_FN} =="
case "${DISTRIBUTION}" in
amzn*)
install_nvidia_driver_amzn2
;;
ubuntu20.04)
install_nvidia_driver_ubuntu20
;;
*)
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
exit 1
;;
esac
# Install container toolkit based on distribution
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
case "${DISTRIBUTION}" in
amzn*)
install_nvidia_docker2_amzn2
;;
ubuntu20.04)
install_nvidia_docker2_ubuntu20
;;
*)
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
exit 1
;;
esac
echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
# Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with
# more than one GPU. This just needs to be run once. The command fails
# on subsequent runs and complains that the mode is already on, but that's
# ok
sudo nvidia-persistenced || true
# This should show persistence mode ON
nvidia-smi
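
As an aside, a minimal manual smoke test of the finished setup might look like the two commands below; the CUDA image tag is illustrative and not part of this commit:

  nvidia-smi --query-gpu=driver_version,gpu_name --format=csv,noheader --id=0
  docker run --rm --gpus all -e NVIDIA_DRIVER_CAPABILITIES=all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi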
11 changes: 4 additions & 7 deletions .github/workflows/deepspeed.yml
@@ -13,7 +13,7 @@ permissions:

jobs:
build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

steps:
- name: Free Disk Space (Ubuntu)
@@ -26,16 +26,13 @@ jobs:
large-packages: true
docker-images: false
swap-storage: true
-      - name: Install NVIDIA CUDA Toolkit
-        uses: Jimver/[email protected]
-        id: cuda-toolkit
-        with:
-          cuda: '12.4.1'
+      - name: Test that setup-nvidia works
+        uses: ./.github/actions/setup-nvidia
- name: Run the build process with Docker
uses: addnab/docker-run-action@v3
with:
image: ${{ github.event.inputs.tag }}
-          options: -v ${{ github.workspace }}/build:/deepspeed --gpus=all -it
+          options: -v ${{ github.workspace }}/build:/deepspeed --gpus=all
run: |
/build_deepspeed.sh
- name: Upload whl as artifact
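
As a usage note, the new composite action also accepts an explicit driver version; overriding the default would look roughly like this (the version shown is simply the action's current default):

  - name: Test that setup-nvidia works
    uses: ./.github/actions/setup-nvidia
    with:
      driver-version: "550.54.15"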