forked from erew123/alltalk_tts
Commit d5a0257 (1 parent: 750dcf0)
Showing 2 changed files with 258 additions and 7 deletions.
The first changed file is the new composite action, referenced later in the workflow as ./.github/actions/setup-nvidia:

@@ -0,0 +1,254 @@
name: Setup NVIDIA

description: Set up NVIDIA driver and NVIDIA-docker runtime on Linux runner

inputs:
  driver-version:
    description: which driver version to install
    required: false
    type: string
    default: "550.54.15"  # https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-54-15/index.html

runs:
  using: composite
  steps:
    - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
      env:
        DRIVER_VERSION: ${{ inputs.driver-version }}
      with:
        timeout_minutes: 10
        max_attempts: 3
        command: |
          # Is it disgusting to have a full shell script here in this github action? Sure
          # But is it the best way to make it so that this action relies on nothing else? Absolutely
          set -eou pipefail

          DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
          DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"

          install_nvidia_docker2_amzn2() {
            (
              set -x

              # Needed for yum-config-manager
              sudo yum install -y yum-utils

              if [[ "${DISTRIBUTION}" == "amzn2023" ]] ; then
                YUM_REPO_URL="https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo"
              else
                # Amazon Linux 2
                YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
              fi

              sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
              sudo yum install -y nvidia-docker2 nvidia-container-toolkit-1.16.2
              sudo systemctl restart docker
            )
          }

          install_nvidia_docker2_ubuntu20() {
            (
              set -x
              # Install the nvidia-docker2 package if it is not installed yet
              status="$(dpkg-query -W --showformat='${db:Status-Status}' nvidia-docker2 2>&1)"
              if [ ! $? = 0 ] || [ ! "$status" = installed ]; then
                sudo apt-get install -y nvidia-docker2 nvidia-container-toolkit-1.16.2
                sudo systemctl restart docker
              fi
            )
          }

          pre_install_nvidia_driver_amzn2() {
            (
              # Purge any nvidia driver installed from the RHEL repo
              sudo yum remove -y nvidia-driver-latest-dkms
            )
          }

          install_nvidia_driver_common() {
            (
              # Try to gather more information about the runner and its existing NVIDIA driver, if any
              echo "Before installing NVIDIA driver"
              lspci
              lsmod
              modinfo nvidia || true

              HAS_NVIDIA_DRIVER=0
              # Check if the NVIDIA driver has already been installed
              if [ -x "$(command -v nvidia-smi)" ]; then
                set +e
                # The driver exists, check its version next. Also check only the first GPU if there is more than
                # one of them, so that the same driver version is not printed over multiple lines
                INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
                NVIDIA_SMI_STATUS=$?

                if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
                  echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
                elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
                  echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
                else
                  HAS_NVIDIA_DRIVER=1
                  echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
                fi
                set -e
              fi

              if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
                # CAUTION: this may need to be updated in the future
                if [ "${DISTRIBUTION}" != ubuntu20.04 ]; then
                  sudo yum groupinstall -y "Development Tools"
                  # Ensure our kernel install is the same as our underlying kernel;
                  # groupinstall "Development Tools" has a habit of mismatching kernel headers
                  sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
                  sudo modprobe backlight
                fi
                sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"

                set +e
                sudo /bin/bash /tmp/nvidia_driver -s --no-drm
                NVIDIA_INSTALLATION_STATUS=$?

                RESET_GPU=0
                if [ "$NVIDIA_INSTALLATION_STATUS" -ne 0 ]; then
                  sudo cat /var/log/nvidia-installer.log
                  # Failed to install the NVIDIA driver, try to reset the GPU
                  RESET_GPU=1
                elif [ -x "$(command -v nvidia-smi)" ]; then
                  # Check again if nvidia-smi works even if the driver installation completes successfully
                  INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
                  NVIDIA_SMI_STATUS=$?

                  if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
                    RESET_GPU=1
                  fi
                fi

                if [ "$RESET_GPU" -eq 1 ]; then
                  NVIDIA_DEVICES=$(lspci -D | grep -i NVIDIA | cut -d' ' -f1)
                  # The GPU can get stuck in a failure state if somehow the test crashes the GPU microcode. When this
                  # happens, we'll try to reset all NVIDIA devices https://github.com/pytorch/pytorch/issues/88388
                  for PCI_ID in $NVIDIA_DEVICES; do
                    DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)

                    echo "Resetting $PCI_ID (enabled state: $DEVICE_ENABLED)"
                    # This requires sudo permission, of course
                    echo "1" | sudo tee /sys/bus/pci/devices/$PCI_ID/reset
                    sleep 1
                  done
                fi

                sudo rm -fv /tmp/nvidia_driver
                set -e
              fi
            )
          }

          post_install_nvidia_driver_common() {
            (
              sudo modprobe nvidia || true
              echo "After installing NVIDIA driver"
              lspci
              lsmod
              modinfo nvidia || true

              (
                set +e

                nvidia-smi
                # NB: Annoyingly, the nvidia-smi command returns successfully with return code 0 even in
                # the case where the driver has already crashed, as it still can get the driver version
                # and some basic information like the bus ID. However, the rest of the information
                # would be missing (ERR!), for example:
                #
                # +-----------------------------------------------------------------------------+
                # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
                # |-------------------------------+----------------------+----------------------+
                # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
                # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
                # |                               |                      |               MIG M. |
                # |===============================+======================+======================|
                # |   0  ERR!                Off  | 00000000:00:1E.0 Off |                 ERR! |
                # | ERR!  ERR!  ERR!   ERR! / ERR!|   4184MiB / 23028MiB |     ERR!     Default |
                # |                               |                      |                 ERR! |
                # +-------------------------------+----------------------+----------------------+
                #
                # +-----------------------------------------------------------------------------+
                # | Processes:                                                                  |
                # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
                # |        ID   ID                                                   Usage      |
                # |=============================================================================|
                # +-----------------------------------------------------------------------------+
                #
                # This should be reported as a failure instead, as it is guaranteed to fail when
                # Docker tries to run with --gpus all
                #
                # So, the correct check here is to query one of the missing pieces of info, like the
                # GPU name, so that the command can fail accordingly
                nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
                NVIDIA_SMI_STATUS=$?

                # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
                if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
                  echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
                else
                  echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
                  exit ${NVIDIA_SMI_STATUS}
                fi
                set -e
              )
            )
          }

          install_nvidia_driver_amzn2() {
            (
              set -x

              pre_install_nvidia_driver_amzn2
              install_nvidia_driver_common
              post_install_nvidia_driver_common
            )
          }

          install_nvidia_driver_ubuntu20() {
            (
              set -x

              install_nvidia_driver_common
              post_install_nvidia_driver_common
            )
          }

          echo "== Installing nvidia driver ${DRIVER_FN} =="
          case "${DISTRIBUTION}" in
            amzn*)
              install_nvidia_driver_amzn2
              ;;
            ubuntu20.04)
              install_nvidia_driver_ubuntu20
              ;;
            *)
              echo "ERROR: Unknown distribution ${DISTRIBUTION}"
              exit 1
              ;;
          esac

          # Install the container toolkit based on the distribution
          echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
          case "${DISTRIBUTION}" in
            amzn*)
              install_nvidia_docker2_amzn2
              ;;
            ubuntu20.04)
              install_nvidia_docker2_ubuntu20
              ;;
            *)
              echo "ERROR: Unknown distribution ${DISTRIBUTION}"
              exit 1
              ;;
          esac

          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

          # Fix https://github.com/NVIDIA/nvidia-docker/issues/1648 on runners with
          # more than one GPU. This just needs to be run once. The command fails
          # on subsequent runs and complains that the mode is already on, but that's
          # ok
          sudo nvidia-persistenced || true

          # This should show persistence mode ON
          nvidia-smi
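
For reference, a minimal sketch of how another job could call this composite action and set the driver-version input explicitly. Only the action path and the input name come from the commit above; the workflow name, trigger, and job layout are illustrative, and any non-default driver version would have to exist in the ossci-linux S3 bucket the script downloads from.

name: gpu-smoke-test
on: workflow_dispatch

jobs:
  smoke-test:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v4
      - name: Set up NVIDIA driver and container runtime
        uses: ./.github/actions/setup-nvidia
        with:
          # Optional: "550.54.15" is already the default baked into the action
          driver-version: "550.54.15"
      - name: Confirm the driver and persistence mode
        run: nvidia-smi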
The second changed file is the workflow that builds the DeepSpeed wheel. It pins the runner to ubuntu-20.04, swaps the Jimver CUDA toolkit step for the new setup-nvidia action, and drops the interactive -it flag from the Docker run options:
@@ -13,7 +13,7 @@ permissions:

 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

     steps:
       - name: Free Disk Space (Ubuntu)
@@ -26,16 +26,13 @@ jobs:
           large-packages: true
           docker-images: false
           swap-storage: true
-      - name: Install NVIDIA CUDA Toolkit
-        uses: Jimver/[email protected]
-        id: cuda-toolkit
-        with:
-          cuda: '12.4.1'
+      - name: Test that setup-nvidia works
+        uses: ./.github/actions/setup-nvidia
       - name: Run the build process with Docker
         uses: addnab/docker-run-action@v3
         with:
           image: ${{ github.event.inputs.tag }}
-          options: -v ${{ github.workspace }}/build:/deepspeed --gpus=all -it
+          options: -v ${{ github.workspace }}/build:/deepspeed --gpus=all
         run: |
           /build_deepspeed.sh
       - name: Upload whl as artifact
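
As a side note, setup-nvidia also writes GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all into GITHUB_ENV, so a later step that shells out to docker directly could reuse that variable instead of hard-coding --gpus=all. A sketch of such an extra step follows; it is not part of this commit, and running nvidia-smi inside the build image is only for illustration.

      - name: Run a GPU container via the exported GPU_FLAG
        run: |
          # ${GPU_FLAG} expands to: --gpus all -e NVIDIA_DRIVER_CAPABILITIES=all
          docker run --rm ${GPU_FLAG} ${{ github.event.inputs.tag }} nvidia-smi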