Skip to content

Commit

Permalink
Add Github workflow to run Nvidia tests via Testflinger (canonical#165)
Browse files Browse the repository at this point in the history
Signed-off-by: Lincoln Wallace <[email protected]>
Co-authored-by: Farshid Tavakolizadeh <[email protected]>
  • Loading branch information
locnnil and farshidtz authored Aug 7, 2024
1 parent 3489233 commit 3cf4c1d
Show file tree
Hide file tree
Showing 6 changed files with 227 additions and 0 deletions.
54 changes: 54 additions & 0 deletions .github/workflows/nvidia-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
name: Nvidia Graphics Tests

# Runs the Nvidia tests on devices in the test farm by submitting a
# Testflinger job from a self-hosted runner.
on:
  # Uncomment this trigger only during workflow development
  # pull_request:
  #   branches: [ main ]
  # Manual trigger
  workflow_dispatch:
    inputs:
      snap-channel:
        description: 'Docker snap channel'
        required: true
        type: string
        default: 'latest/edge'

jobs:
  test:
    runs-on: [self-hosted, testflinger]
    env:
      TESTFLINGER_DIR: .github/workflows/testflinger
    strategy:
      fail-fast: true
      matrix:
        job-queue:
          - 202007-28059
          # - 202008-2816s7
          # - 202112-29789
          # noprovision node, for CI testing
          # - 202302-31212
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Create Testflinger job queue
        # Hand the values over as environment variables instead of expanding
        # the expressions inside the script text: the script then never
        # contains user-controlled input, only references to variables.
        env:
          JOB_QUEUE: ${{ matrix.job-queue }}
          SNAP_CHANNEL: ${{ inputs.snap-channel }}
        run: |
          # Render the templates into temporary files first; redirecting
          # straight onto the input file would truncate it before it is read.
          envsubst '$JOB_QUEUE' \
            < "$TESTFLINGER_DIR/nvidia-job.yaml" \
            > "$TESTFLINGER_DIR/nvidia-job.temp"
          envsubst '$SNAP_CHANNEL' \
            < "$TESTFLINGER_DIR/scripts/setup.sh" \
            > "$TESTFLINGER_DIR/scripts/setup.temp"
          mv "$TESTFLINGER_DIR/nvidia-job.temp" "$TESTFLINGER_DIR/nvidia-job.yaml"
          mv "$TESTFLINGER_DIR/scripts/setup.temp" "$TESTFLINGER_DIR/scripts/setup.sh"
      - name: Submit Testflinger job
        uses: canonical/testflinger/.github/actions/submit@main
        with:
          poll: true
          job-path: ${{ env.TESTFLINGER_DIR }}/nvidia-job.yaml
28 changes: 28 additions & 0 deletions .github/workflows/testflinger/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Testflinger scripts

This directory contains the scripts used for NVIDIA testing via GitHub Actions and Testflinger.
The tests run on devices within Canonical's test farm.

## Run locally
Running the tests locally is only possible if your machine has access to the Testflinger server.

Export the following variables:
```bash
export JOB_QUEUE=<queue> SNAP_CHANNEL=<channel>
```

Then, from this directory, render the templates into temporary files and adjust the job file for local use:
```bash
envsubst '$JOB_QUEUE' < nvidia-job.yaml > temp-job.yaml

envsubst '$SNAP_CHANNEL' < scripts/setup.sh > scripts/temp-setup.sh

sed -i "s|setup.sh|temp-setup.sh|" temp-job.yaml

sed -i "s|.github/workflows/testflinger/||" temp-job.yaml
```

Finally, submit the job:
```bash
testflinger submit --poll temp-job.yaml
```
36 changes: 36 additions & 0 deletions .github/workflows/testflinger/nvidia-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# This is a template
# Some variables should be replaced with envsubst before use
job_queue: $JOB_QUEUE
global_timeout: 3600
output_timeout: 1800
provision_data:
  distro: "noble"
test_data:

  # Copy files from the GH runner to the Testflinger Agent
  attachments:
    - local: ".github/workflows/testflinger/scripts"
      agent: "scripts"

  # Run commands on the Testflinger Agent
  test_cmds: |
    #!/usr/bin/env bash
    set -ex
    # List the attached files
    find attachments/test
    SCRIPTS=./attachments/test/scripts
    echo "Testing: DEVICE_IP = $DEVICE_IP"
    # Setup the environment on the target device
    ssh "ubuntu@$DEVICE_IP" "$(< "$SCRIPTS/setup.sh")"
    # Remember the current boot ID so that the reboot can be confirmed below
    BOOT_ID_CMD="cat /proc/sys/kernel/random/boot_id"
    OLD_BOOT_ID=$(ssh "ubuntu@$DEVICE_IP" "$BOOT_ID_CMD")
    # Reboot the device in background to avoid breaking the SSH connection prematurely
    ssh "ubuntu@$DEVICE_IP" "(sleep 3 && sudo reboot) &"
    echo "Wait for the device to boot and start its SSH server"
    "$SCRIPTS/wait_for_port.sh" "$DEVICE_IP" 22
    # An open port is not proof of a reboot: the old SSH server may still be
    # accepting connections while the device shuts down. Keep polling until
    # the device answers with a boot ID different from the recorded one.
    until NEW_BOOT_ID=$(ssh -o ConnectTimeout=10 "ubuntu@$DEVICE_IP" "$BOOT_ID_CMD") &&
      [[ -n "$NEW_BOOT_ID" && "$NEW_BOOT_ID" != "$OLD_BOOT_ID" ]]; do
      echo "Device has not finished rebooting yet ..."
      sleep 10
    done
    # Run the tests
    ssh "ubuntu@$DEVICE_IP" "$(< "$SCRIPTS/test.sh")"
73 changes: 73 additions & 0 deletions .github/workflows/testflinger/scripts/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env bash
# Prepares a device for the Nvidia tests: installs the Docker snap and then
# the Nvidia packages or snaps matching the OS ID found in /etc/os-release.
# -e: stop at the first failing command; -x: trace every command in the log.
set -ex

#######################################
# Refresh the apt package lists, tolerating failure.
# Some nodes fail to access the repos, so a failed update only produces a
# warning; the caller continues with whatever package lists are present.
# The tolerance is scoped to this one command: the caller's shell options
# (errexit in particular) are left exactly as they were.
# Outputs: a warning on stderr when the update fails
# Returns: 0 always
#######################################
apt_update() {
  if ! sudo apt-get update; then
    echo "Warning: 'apt-get update' failed, continuing anyway" >&2
  fi
}

#######################################
# Install the Docker snap and verify that the docker command works.
# Globals:  SNAP_CHANNEL (read) - snap channel to install from; may be set
#           by the caller, or replaced in CI. Falls back to latest/edge
#           when empty or unset.
# Returns:  exits the script with status 1 if docker cannot be run
#######################################
install_docker() {
  # The reference below may be replaced textually before the script runs, so
  # it is kept inside double quotes: a substituted value containing
  # whitespace then stays a single word instead of becoming a command.
  local channel="$SNAP_CHANNEL"
  if [[ -z "$channel" ]]; then
    channel="latest/edge"
  fi

  # install docker-snap
  sudo snap install docker --channel="$channel"

  # check the installation
  docker --version || exit 1
}

#######################################
# Install the Nvidia stack on classic Ubuntu: the CUDA apt repository
# keyring, the CUDA toolkit, the 555 driver packages and the Nvidia
# container toolkit.
# The CUDA repository URL is built from VERSION_ID in /etc/os-release and is
# hard-coded to the x86_64 architecture.
# Every download and install step is its own statement so that any failure
# aborts the script under errexit; downloads use curl with --fail so an HTTP
# error never ends up in a keyring or an apt sources file.
# Outputs: prints the generated apt sources entry (via tee)
#######################################
setup_classic() {
  local ubuntu_version workdir
  local -r keyring_deb="cuda-keyring_1.1-1_all.deb"
  local -r toolkit_repo="https://nvidia.github.io/libnvidia-container"
  local -r toolkit_keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
  local -r toolkit_list="nvidia-container-toolkit.list"

  . /etc/os-release
  # e.g. VERSION_ID 24.04 -> 2404, as used in the repository path
  ubuntu_version="${VERSION_ID//./}"

  apt_update
  sudo apt-get install -y curl

  # Keep downloaded files out of the current directory
  workdir=$(mktemp -d)

  curl -fsSL -o "$workdir/$keyring_deb" \
    "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/$keyring_deb"
  sudo dpkg -i "$workdir/$keyring_deb"

  apt_update
  sudo apt-get -y install cuda-toolkit-12-5
  sudo apt-get install -y nvidia-driver-555-open
  sudo apt-get install -y cuda-drivers-555

  # --yes: overwrite a keyring left behind by an earlier run instead of failing
  curl -fsSL -o "$workdir/gpgkey" "$toolkit_repo/gpgkey"
  sudo gpg --batch --yes --dearmor -o "$toolkit_keyring" "$workdir/gpgkey"

  # Pin the repository entries to the keyring installed above
  curl -fsSL -o "$workdir/$toolkit_list" "$toolkit_repo/stable/deb/$toolkit_list"
  sed "s#deb https://#deb [signed-by=$toolkit_keyring] https://#g" "$workdir/$toolkit_list" |
    sudo tee "/etc/apt/sources.list.d/$toolkit_list"

  rm -rf -- "$workdir"

  apt_update
  sudo apt-get install -y nvidia-container-toolkit
}

#######################################
# Install the Nvidia snaps used on Ubuntu Core: nvidia-core22 from its
# default channel and nvidia-assemble from the 22/stable channel.
#######################################
setup_core() {
  local -r assemble_channel="22/stable"

  sudo snap install nvidia-core22
  sudo snap install nvidia-assemble --channel "$assemble_channel"
}

#######################################
# Entry point of the provisioning: install Docker, then run the Nvidia
# setup that matches the OS ID read from /etc/os-release.
# Outputs: a message naming the ID when it is neither ubuntu nor ubuntu-core
# Returns: exits the script with status 1 for an unexpected ID
#######################################
setup() {
  . /etc/os-release

  install_docker

  case "$ID" in
    ubuntu)
      setup_classic
      ;;
    ubuntu-core)
      setup_core
      ;;
    *)
      echo "Unexpected operating system ID: $ID"
      exit 1
      ;;
  esac
}

# Run the provisioning sequence.
setup

# Tell whoever reads the log that the device still has to be restarted.
printf '%s\n' "A reboot is required!"
25 changes: 25 additions & 0 deletions .github/workflows/testflinger/scripts/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Runs the Nvidia container checks: nvidia-smi inside a container, followed
# by a vector addition sample workload.
# -e: stop at the first failing command; -x: trace every command in the log.
set -ex

#######################################
# Test nvidia-smi: run it inside an ubuntu container using the nvidia
# runtime with all GPUs.
# On classic Ubuntu the container PATH is extended with
# /var/lib/snapd/hostfs/usr/bin; on Ubuntu Core the binary is taken from the
# graphics directory of the Docker snap.
# The containers are started without -i/-t: requesting a TTY makes
# docker fail when no terminal is attached to the calling session.
# Outputs: a message naming the ID when it is neither ubuntu nor ubuntu-core
# Returns: exits the script with status 1 on failure or unexpected ID
#######################################
smi_test() {
  . /etc/os-release

  case "$ID" in
    ubuntu)
      sudo docker run --rm --runtime=nvidia --gpus all \
        --env PATH="${PATH}:/var/lib/snapd/hostfs/usr/bin" \
        ubuntu nvidia-smi || exit 1
      ;;
    ubuntu-core)
      # The glob is quoted on purpose: it is expanded by bash inside the
      # container, not by this shell.
      sudo docker run --rm --runtime=nvidia --gpus all \
        ubuntu bash -c "/snap/docker/*/graphics/bin/nvidia-smi" || exit 1
      ;;
    *)
      echo "Unexpected operating system ID: $ID"
      exit 1
      ;;
  esac
}

#######################################
# Test a vector addition sample workload: run the CUDA sample image with
# the nvidia runtime and all GPUs.
# Returns: exits the script with status 1 if the container run fails
#######################################
vector_add_test() {
  local -r sample_image="nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2"

  if ! sudo docker run --rm --runtime=nvidia --gpus all "$sample_image"; then
    exit 1
  fi
}

# Check first that nvidia-smi can be run from inside a container
smi_test

# Then run the vector addition sample workload
vector_add_test
11 changes: 11 additions & 0 deletions .github/workflows/testflinger/scripts/wait_for_port.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Block until a TCP port on a host accepts connections.
#
# Usage: wait_for_port.sh <host> <port>
#
# There is no overall deadline: the script polls every 10 seconds until the
# port opens, so the caller is responsible for any global timeout.
set -ex

host="${1:?usage: wait_for_port.sh <host> <port>}"
port="${2:?usage: wait_for_port.sh <host> <port>}"

# install dependencies
# The concrete package is named explicitly: on recent Ubuntu releases
# "netcat" is only a virtual package and cannot be installed by that name.
# apt-get is used because apt does not offer a stable interface for scripts.
sudo apt-get install -y netcat-openbsd

# check connection to the device
# -w 5 bounds each attempt so an unreachable host cannot stall a single probe
while ! nc -z -w 5 "$host" "$port"; do
  echo "Waiting for $host:$port ..."
  sleep 10
done

0 comments on commit 3cf4c1d

Please sign in to comment.