forked from canonical/docker-snap
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Github workflow to run Nvidia tests via Testflinger (canonical#165)
Signed-off-by: Lincoln Wallace <[email protected]> Co-authored-by: Farshid Tavakolizadeh <[email protected]>
- Loading branch information
Showing
6 changed files
with
227 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Runs the NVIDIA container tests for the Docker snap on devices reached
# through Testflinger. The job template and the device setup script are
# rendered with envsubst before the job is submitted.
name: Nvidia Graphics Tests

on:
  # Uncomment this trigger only during workflow development
  # pull_request:
  #   branches: [ main ]
  # Manual trigger
  workflow_dispatch:
    inputs:
      snap-channel:
        description: 'Docker snap channel'
        required: true
        type: string
        default: 'latest/edge'

jobs:
  test:
    runs-on: [self-hosted, testflinger]
    env:
      TESTFLINGER_DIR: .github/workflows/testflinger
    strategy:
      fail-fast: true
      matrix:
        job-queue:
          - 202007-28059
          # - 202008-2816s7
          # - 202112-29789
          # noprovision node, for CI testing
          # - 202302-31212
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Create Testflinger job queue
        # Hand the expression values to the script through the environment
        # instead of expanding them inside the script text, so that a crafted
        # input cannot inject shell commands on the runner.
        env:
          JOB_QUEUE: ${{ matrix.job-queue }}
          SNAP_CHANNEL: ${{ inputs.snap-channel }}
        run: |
          # Each envsubst call is restricted to a single variable so that every
          # other "$..." reference in the templates is left untouched.
          envsubst '$JOB_QUEUE' \
            < "$TESTFLINGER_DIR/nvidia-job.yaml" \
            > "$TESTFLINGER_DIR/nvidia-job.temp"
          envsubst '$SNAP_CHANNEL' \
            < "$TESTFLINGER_DIR/scripts/setup.sh" \
            > "$TESTFLINGER_DIR/scripts/setup.temp"
          mv "$TESTFLINGER_DIR/nvidia-job.temp" "$TESTFLINGER_DIR/nvidia-job.yaml"
          mv "$TESTFLINGER_DIR/scripts/setup.temp" "$TESTFLINGER_DIR/scripts/setup.sh"

      - name: Submit Testflinger job
        uses: canonical/testflinger/.github/actions/submit@main
        with:
          poll: true
          job-path: ${{ env.TESTFLINGER_DIR }}/nvidia-job.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Testflinger scripts | ||
|
||
This directory contains the scripts used for NVIDIA testing via GitHub Actions and Testflinger. | ||
The tests run on devices within Canonical's test farm. | ||
|
||
## Run locally | ||
Running the tests locally is only possible if your machine has access to the Testflinger server. | ||
|
||
Export the following variables: | ||
```bash | ||
export JOB_QUEUE=<queue> SNAP_CHANNEL=<channel> | ||
``` | ||
|
||
Then, render the templates with these variables and adjust the paths for local use: | ||
```bash | ||
envsubst '$JOB_QUEUE' < nvidia-job.yaml > temp-job.yaml | ||
|
||
envsubst '$SNAP_CHANNEL' < scripts/setup.sh > scripts/temp-setup.sh | ||
|
||
sed -i "s|setup.sh|temp-setup.sh|" temp-job.yaml | ||
|
||
sed -i "s|.github/workflows/testflinger/||" temp-job.yaml | ||
``` | ||
|
||
Finally, submit the job: | ||
```bash | ||
testflinger submit --poll temp-job.yaml | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# This is a template
# Some variables should be replaced with envsubst before use
job_queue: $JOB_QUEUE
global_timeout: 3600
output_timeout: 1800
provision_data:
  distro: "noble"
test_data:

  # Copy files from the GH runner to the Testflinger Agent
  attachments:
    - local: ".github/workflows/testflinger/scripts"
      agent: "scripts"

  # Run commands on the Testflinger Agent
  test_cmds: |
    #!/usr/bin/env bash
    set -ex

    # List the attached files
    find attachments/test

    SCRIPTS=./attachments/test/scripts

    echo "Testing: DEVICE_IP = $DEVICE_IP"

    # Setup the environment on the target device.
    # The script text is read on the agent and sent as the remote command.
    ssh "ubuntu@$DEVICE_IP" "$(< "$SCRIPTS/setup.sh")"

    # Reboot the device in background to avoid breaking the SSH connection prematurely
    ssh "ubuntu@$DEVICE_IP" "(sleep 3 && sudo reboot) &"

    echo "Wait for the device to boot and start its SSH server"
    "$SCRIPTS/wait_for_port.sh" "$DEVICE_IP" 22

    # Run the tests
    ssh "ubuntu@$DEVICE_IP" "$(< "$SCRIPTS/test.sh")"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#!/usr/bin/env bash | ||
set -ex | ||
|
||
# Refresh the APT package index on a best-effort basis.
#
# Some nodes fail to access the repositories, so a failed update is reported
# on stderr but never aborts the caller. The shell's errexit setting is left
# exactly as the caller had it.
#
# Returns: 0 whether or not the update succeeded.
apt_update() {
  sudo apt-get update ||
    echo "Warning: 'apt-get update' failed; continuing with the existing package lists" >&2
}
|
||
# Install the Docker snap and check that the docker CLI runs.
#
# Globals:
#   SNAP_CHANNEL (read) - snap channel to install from. It may be set by the
#                         caller, or replaced in CI; "latest/edge" is used
#                         when it is empty or unset.
# Exits with status 1 if "docker --version" fails after the installation.
install_docker() {
  # CI rewrites this file with envsubst, which only replaces the plain
  # variable reference on the next line. Keep that form (no default-value
  # expansion) and keep the quotes, so the substituted text stays one word.
  local channel="$SNAP_CHANNEL"
  if [[ -z "$channel" ]]; then
    channel="latest/edge"
  fi

  # install docker-snap
  sudo snap install docker --channel="$channel"

  # check the installation
  docker --version || exit 1
}
|
||
# Prepare a classic Ubuntu system for the NVIDIA container tests.
#
# Installs, in order: the CUDA repository keyring, the CUDA toolkit and the
# 555 driver packages, then the NVIDIA Container Toolkit from its own APT
# repository. Every download is written to a temporary directory first, so a
# failed download stops the script (under "set -e") instead of feeding partial
# data into gpg or into the APT sources list.
setup_classic() {
  local ubuntu_version workdir
  local keyring_deb="cuda-keyring_1.1-1_all.deb"
  local toolkit_repo="https://nvidia.github.io/libnvidia-container"
  local toolkit_keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
  local toolkit_list="nvidia-container-toolkit.list"

  . /etc/os-release
  # VERSION_ID without the dot, as used in the CUDA repository path.
  ubuntu_version="${VERSION_ID//./}"

  apt_update
  sudo apt-get install -y curl

  workdir="$(mktemp -d)"

  # curl is the tool installed above, so use it for every download.
  curl -fsSL -o "${workdir}/${keyring_deb}" \
    "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/${keyring_deb}"
  sudo dpkg -i "${workdir}/${keyring_deb}"

  apt_update
  sudo apt-get -y install cuda-toolkit-12-5
  sudo apt-get install -y nvidia-driver-555-open
  sudo apt-get install -y cuda-drivers-555

  # --yes lets a rerun overwrite a keyring left by a previous run.
  curl -fsSL -o "${workdir}/gpgkey" "${toolkit_repo}/gpgkey"
  sudo gpg --yes --dearmor -o "$toolkit_keyring" "${workdir}/gpgkey"

  # Pin the repository entries to the keyring installed above.
  curl -fsSL -o "${workdir}/${toolkit_list}" "${toolkit_repo}/stable/deb/${toolkit_list}"
  sed "s#deb https://#deb [signed-by=${toolkit_keyring}] https://#g" "${workdir}/${toolkit_list}" |
    sudo tee "/etc/apt/sources.list.d/${toolkit_list}"

  rm -rf -- "$workdir"

  apt_update
  sudo apt-get install -y nvidia-container-toolkit
}
|
||
# Install the NVIDIA support snaps on Ubuntu Core:
# nvidia-core22 without a channel argument and nvidia-assemble from 22/stable.
setup_core() {
  local -a assemble_install=(snap install nvidia-assemble --channel 22/stable)

  sudo snap install nvidia-core22
  sudo "${assemble_install[@]}"
}
|
||
# Entry point of the device setup: install the Docker snap, then run the
# NVIDIA setup that matches the operating system ID from /etc/os-release.
# Exits with status 1 for any ID other than "ubuntu" or "ubuntu-core".
setup() {
  . /etc/os-release

  install_docker

  case "$ID" in
    ubuntu)
      setup_classic
      ;;
    ubuntu-core)
      setup_core
      ;;
    *)
      echo "Unexpected operating system ID: $ID"
      exit 1
      ;;
  esac
}

setup

echo "A reboot is required!"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#!/usr/bin/env bash | ||
set -ex | ||
|
||
# Test nvidia-smi | ||
# Run nvidia-smi inside an "ubuntu" container using the nvidia runtime.
#
# The way nvidia-smi is located depends on the host operating system ID read
# from /etc/os-release:
#   ubuntu      - the host PATH, extended with /var/lib/snapd/hostfs/usr/bin,
#                 is passed into the container and "nvidia-smi" is run from it
#   ubuntu-core - the binary is run from /snap/docker/*/graphics/bin
#
# No TTY is requested from docker: the job sends this script as a remote
# command over a plain "ssh host <script>" call, and "docker run -t" fails
# when its input is not a terminal.
#
# Exits with status 1 if the container run fails or the OS ID is unexpected.
smi_test() {
  . /etc/os-release

  case "$ID" in
    ubuntu)
      sudo docker run --rm --runtime=nvidia --gpus all \
        --env PATH="${PATH}:/var/lib/snapd/hostfs/usr/bin" \
        ubuntu nvidia-smi || exit 1
      ;;
    ubuntu-core)
      sudo docker run --rm --runtime=nvidia --gpus all \
        ubuntu bash -c "/snap/docker/*/graphics/bin/nvidia-smi" || exit 1
      ;;
    *)
      echo "Unexpected operating system ID: $ID"
      exit 1
      ;;
  esac
}
|
||
# Test a vector addition sample workload | ||
# Run the CUDA vector-addition sample image with the nvidia runtime.
# Exits with status 1 if the container run fails.
vector_add_test() {
  local image="nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2"

  if ! sudo docker run --rm --runtime=nvidia --gpus all "$image"; then
    exit 1
  fi
}

smi_test

vector_add_test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#!/usr/bin/env bash
#
# Block until a TCP port on a host accepts connections.
#
# Usage: wait_for_port.sh <host> <port>
#
# The port is probed with "nc -z" every 10 seconds. The script has no upper
# bound of its own on the number of attempts.
set -ex

# Fail fast on missing arguments instead of probing an empty host/port forever.
host="${1:?usage: wait_for_port.sh <host> <port>}"
port="${2:?usage: wait_for_port.sh <host> <port>}"

# install dependencies
sudo apt-get install -y netcat

# check connection to the device
until nc -z "$host" "$port"; do
  echo "Waiting for $host:$port ..."
  sleep 10
done