Building and caching deepspeed
Paladinium committed Jan 19, 2025
1 parent 750dcf0 commit d5a0257
Showing 2 changed files with 258 additions and 7 deletions.
254 changes: 254 additions & 0 deletions .github/actions/setup-nvidia/actions.yml
@@ -0,0 +1,254 @@
name: Setup NVIDIA

description: Set up the NVIDIA driver and nvidia-docker runtime on a Linux runner

inputs:
driver-version:
description: which driver version to install
required: false
type: string
default: "550.54.15" # https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-54-15/index.html

runs:
using: composite
steps:
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
env:
DRIVER_VERSION: ${{ inputs.driver-version }}
with:
timeout_minutes: 10
max_attempts: 3
command: |
# Is it disgusting to have a full shell script here in this GitHub action? Sure
# But is it the best way to make it so that this action relies on nothing else? Absolutely
set -eou pipefail
DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
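# Note: DISTRIBUTION resolves to values like "ubuntu20.04" or "amzn2023", and DRIVER_FN is the
# name of the official NVIDIA .run installer that is downloaded from the ossci-linux S3 bucket below.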
install_nvidia_docker2_amzn2() {
(
set -x
# Needed for yum-config-manager
sudo yum install -y yum-utils
if [[ "${DISTRIBUTION}" == "amzn2023" ]] ; then
YUM_REPO_URL="https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo"
else
# Amazon Linux 2
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
fi
sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
sudo yum install -y nvidia-docker2 nvidia-container-toolkit-1.16.2
sudo systemctl restart docker
)
}
install_nvidia_docker2_ubuntu20() {
(
set -x
# Install the nvidia-docker2 package if it is not already installed
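# dpkg-query prints the package state (e.g. "installed") and exits non-zero when the
# package is unknown, so both its exit code and the printed status are checked below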
status="$(dpkg-query -W --showformat='${db:Status-Status}' nvidia-docker2 2>&1)"
if [ "$?" -ne 0 ] || [ "$status" != "installed" ]; then
sudo apt-get install -y nvidia-docker2 nvidia-container-toolkit-1.16.2
sudo systemctl restart docker
fi
)
}
pre_install_nvidia_driver_amzn2() {
(
# Purge any nvidia driver installed from RHEL repo
sudo yum remove -y nvidia-driver-latest-dkms
)
}
install_nvidia_driver_common() {
(
# Try to gather more information about the runner and its existing NVIDIA driver if any
echo "Before installing NVIDIA driver"
lspci
lsmod
modinfo nvidia || true
HAS_NVIDIA_DRIVER=0
# Check if NVIDIA driver has already been installed
if [ -x "$(command -v nvidia-smi)" ]; then
set +e
# The driver exists; check its version next. Also check only the first GPU if there is more than one
# so that the same driver version is not printed over multiple lines
INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
NVIDIA_SMI_STATUS=$?
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
else
HAS_NVIDIA_DRIVER=1
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
fi
set -e
fi
if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
# CAUTION: this may need to be updated in the future
if [ "${DISTRIBUTION}" != ubuntu20.04 ]; then
sudo yum groupinstall -y "Development Tools"
# ensure our kernel install is the same as our underlying kernel,
# groupinstall "Development Tools" has a habit of mismatching kernel headers
sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
sudo modprobe backlight
fi
sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
set +e
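# The .run installer is executed unattended: -s runs it silently and --no-drm skips the
# nvidia-drm kernel module, which a headless CI runner does not need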
sudo /bin/bash /tmp/nvidia_driver -s --no-drm
NVIDIA_INSTALLATION_STATUS=$?
RESET_GPU=0
if [ "$NVIDIA_INSTALLATION_STATUS" -ne 0 ]; then
sudo cat /var/log/nvidia-installer.log
# Failed to install the NVIDIA driver, so try to reset the GPU
RESET_GPU=1
elif [ -x "$(command -v nvidia-smi)" ]; then
# Check again whether nvidia-smi works even if the driver installation completed successfully
INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
NVIDIA_SMI_STATUS=$?
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
RESET_GPU=1
fi
fi
if [ "$RESET_GPU" -eq 1 ]; then
NVIDIA_DEVICES=$(lspci -D | grep -i NVIDIA | cut -d' ' -f1)
# The GPU can get stuck in a failure state if somehow the test crashes the GPU microcode. When this
# happens, we'll try to reset all NVIDIA devices https://github.com/pytorch/pytorch/issues/88388
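# lspci -D includes the PCI domain in each ID, so the IDs line up with the paths under
# /sys/bus/pci/devices/; writing 1 to a device's reset attribute asks the kernel to reset that function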
for PCI_ID in $NVIDIA_DEVICES; do
DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
echo "Reseting $PCI_ID (enabled state: $DEVICE_ENABLED)"
# This requires sudo permission of course
echo "1" | sudo tee /sys/bus/pci/devices/$PCI_ID/reset
sleep 1
done
fi
sudo rm -fv /tmp/nvidia_driver
set -e
fi
)
}
post_install_nvidia_driver_common() {
(
sudo modprobe nvidia || true
echo "After installing NVIDIA driver"
lspci
lsmod
modinfo nvidia || true
(
set +e
nvidia-smi
# NB: Annoyingly, the nvidia-smi command returns successfully with return code 0 even in
# the case where the driver has already crashed, as it can still get the driver version
# and some basic information like the bus ID. However, the rest of the information
# would be missing (ERR!), for example:
#
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 |
# |-------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
# | | | MIG M. |
# |===============================+======================+======================|
# | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! |
# |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default |
# | | | ERR! |
# +-------------------------------+----------------------+----------------------+
#
# +-----------------------------------------------------------------------------+
# | Processes: |
# | GPU GI CI PID Type Process name GPU Memory |
# | ID ID Usage |
# |=============================================================================|
# +-----------------------------------------------------------------------------+
#
# This should be reported as a failure instead, as it is guaranteed to fail when
# Docker tries to run with --gpus all
#
# So, the correct check here is to query one of the missing pieces of info, like the
# GPU name, so that the command fails accordingly
nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
NVIDIA_SMI_STATUS=$?
# Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
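# (Exit code 14 corresponds to nvidia-smi's "InfoROM is corrupted" warning, which is treated
# as non-fatal here.)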
if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
else
echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
exit ${NVIDIA_SMI_STATUS}
fi
set -e
)
)
}
install_nvidia_driver_amzn2() {
(
set -x
pre_install_nvidia_driver_amzn2
install_nvidia_driver_common
post_install_nvidia_driver_common
)
}
install_nvidia_driver_ubuntu20() {
(
set -x
install_nvidia_driver_common
post_install_nvidia_driver_common
)
}
echo "== Installing nvidia driver ${DRIVER_FN} =="
case "${DISTRIBUTION}" in
amzn*)
install_nvidia_driver_amzn2
;;
ubuntu20.04)
install_nvidia_driver_ubuntu20
;;
*)
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
exit 1
;;
esac
# Install container toolkit based on distribution
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
case "${DISTRIBUTION}" in
amzn*)
install_nvidia_docker2_amzn2
;;
ubuntu20.04)
install_nvidia_docker2_ubuntu20
;;
*)
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
exit 1
;;
esac
echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
# Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with
# more than one GPU. This just needs to be run once. The command fails
# on subsequent runs and complains that the mode is already on, but that's
# ok
sudo nvidia-persistenced || true
# This should show persistence mode ON
nvidia-smi
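
As an aside, a minimal manual smoke test of the finished setup might look like the two commands below; the CUDA image tag is illustrative and not part of this commit:

  nvidia-smi --query-gpu=driver_version,gpu_name --format=csv,noheader --id=0
  docker run --rm --gpus all -e NVIDIA_DRIVER_CAPABILITIES=all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi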
11 changes: 4 additions & 7 deletions .github/workflows/deepspeed.yml
@@ -13,7 +13,7 @@ permissions:

jobs:
build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

steps:
- name: Free Disk Space (Ubuntu)
@@ -26,16 +26,13 @@ jobs:
large-packages: true
docker-images: false
swap-storage: true
-      - name: Install NVIDIA CUDA Toolkit
-        uses: Jimver/[email protected]
-        id: cuda-toolkit
-        with:
-          cuda: '12.4.1'
+      - name: Test that setup-nvidia works
+        uses: ./.github/actions/setup-nvidia
- name: Run the build process with Docker
uses: addnab/docker-run-action@v3
with:
image: ${{ github.event.inputs.tag }}
-          options: -v ${{ github.workspace }}/build:/deepspeed --gpus=all -it
+          options: -v ${{ github.workspace }}/build:/deepspeed --gpus=all
run: |
/build_deepspeed.sh
- name: Upload whl as artifact
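
As a usage note, the new composite action also accepts an explicit driver version; overriding the default would look roughly like this (the version shown is simply the action's current default):

  - name: Test that setup-nvidia works
    uses: ./.github/actions/setup-nvidia
    with:
      driver-version: "550.54.15"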