Add Github workflow to run Nvidia tests via Testflinger (canonical#165)

Signed-off-by: Lincoln Wallace <[email protected]> Co-authored-by: Farshid Tavakolizadeh <[email protected]>
hartmutobendorf · Aug 7, 2024 · 3cf4c1d · 3cf4c1d
1 parent 3489233
commit 3cf4c1d
Show file tree

Hide file tree

Showing 6 changed files with 227 additions and 0 deletions.
diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml
@@ -0,0 +1,54 @@
+name: Nvidia Graphics Tests
+
+on:
+  # Enable the pull_request trigger below only while developing this workflow
+  # pull_request:
+  #   branches: [ main ]
+
+  # Manually triggered run; the caller selects the Docker snap channel to test
+  workflow_dispatch:
+    inputs:
+      snap-channel:
+        description: "Docker snap channel"
+        type: string
+        required: true
+        default: "latest/edge"
+
+jobs:
+  # Renders the Testflinger job templates for each queue in the matrix and
+  # submits the resulting job, polling until it finishes.
+  test:
+    runs-on: [self-hosted, testflinger]
+    env:
+      TESTFLINGER_DIR: .github/workflows/testflinger
+    strategy:
+      fail-fast: true
+      matrix:
+        job-queue:
+          - "202007-28059"
+          # - 202008-2816s7
+          # - 202112-29789
+          # noprovision node, for CI testing
+          # - 202302-31212
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Create Testflinger job queue
+        # Pass workflow expressions through the environment instead of
+        # interpolating them into the script text, so their values can never
+        # be interpreted as shell code on the runner.
+        env:
+          JOB_QUEUE: ${{ matrix.job-queue }}
+          SNAP_CHANNEL: ${{ inputs.snap-channel }}
+        run: |
+          # Render each template into a temporary file first: redirecting
+          # envsubst output onto its own input would truncate the file
+          envsubst '$JOB_QUEUE' \
+            < "$TESTFLINGER_DIR/nvidia-job.yaml" \
+            > "$TESTFLINGER_DIR/nvidia-job.temp"
+
+          envsubst '$SNAP_CHANNEL' \
+            < "$TESTFLINGER_DIR/scripts/setup.sh" \
+            > "$TESTFLINGER_DIR/scripts/setup.temp"
+
+          mv "$TESTFLINGER_DIR/nvidia-job.temp" "$TESTFLINGER_DIR/nvidia-job.yaml"
+          mv "$TESTFLINGER_DIR/scripts/setup.temp" "$TESTFLINGER_DIR/scripts/setup.sh"
+
+      - name: Submit Testflinger job
+        # NOTE(review): this action tracks the moving `main` branch; consider
+        # pinning it to a tag or commit SHA for reproducible runs
+        uses: canonical/testflinger/.github/actions/submit@main
+        with:
+          poll: true
+          job-path: ${{ env.TESTFLINGER_DIR }}/nvidia-job.yaml
diff --git a/.github/workflows/testflinger/README.md b/.github/workflows/testflinger/README.md
@@ -0,0 +1,28 @@
+# Testflinger scripts
+
+This directory contains the scripts used for NVIDIA testing via GitHub Actions and Testflinger.
+The tests run on devices within Canonical's test farm.
+
+## Run locally
+Running the tests locally is only possible if your machine has access to the Testflinger server.
+
+Export the following variables:
+```bash
+export JOB_QUEUE=<queue> SNAP_CHANNEL=<channel>
+```
+
+Then, from this directory, render the templates and adjust the paths inside the job file for local use:
+```bash
+envsubst '$JOB_QUEUE' < nvidia-job.yaml > temp-job.yaml
+
+envsubst '$SNAP_CHANNEL' < scripts/setup.sh > scripts/temp-setup.sh
+
+sed -i "s|setup.sh|temp-setup.sh|" temp-job.yaml
+
+sed -i "s|.github/workflows/testflinger/||" temp-job.yaml
+```
+
+Finally, submit the job:
+```bash
+testflinger submit --poll temp-job.yaml
+```
diff --git a/.github/workflows/testflinger/nvidia-job.yaml b/.github/workflows/testflinger/nvidia-job.yaml
@@ -0,0 +1,36 @@
+# This is a template
+# Some variables should be replaced with envsubst before use
+# The queue name is quoted so the substituted value is always read as a string
+job_queue: "$JOB_QUEUE"
+global_timeout: 3600
+output_timeout: 1800
+provision_data:
+  distro: "noble"
+test_data:
+
+  # Copy files from the GH runner to the Testflinger Agent
+  attachments:
+    - local: ".github/workflows/testflinger/scripts"
+      agent: "scripts"
+
+  # Run commands on the Testflinger Agent
+  test_cmds: |
+    #!/usr/bin/env bash
+    set -ex
+
+    # List the attached files
+    find attachments/test
+
+    SCRIPTS=./attachments/test/scripts
+
+    echo "Testing: DEVICE_IP = $DEVICE_IP"
+    # Setup the environment on the target device
+    # The local script's content is sent as the remote command line
+    ssh ubuntu@"$DEVICE_IP" "$(< "$SCRIPTS/setup.sh")"
+
+    # Reboot the device in background to avoid breaking the SSH connection prematurely
+    ssh ubuntu@"$DEVICE_IP" "(sleep 3 && sudo reboot) &"
+
+    # Give the device time to actually go down; otherwise the port check
+    # below can succeed against the SSH server of the still-running system
+    sleep 30
+
+    echo "Wait for the device to boot and start its SSH server"
+    "$SCRIPTS/wait_for_port.sh" "$DEVICE_IP" 22
+
+    # Run the tests
+    ssh ubuntu@"$DEVICE_IP" "$(< "$SCRIPTS/test.sh")"
diff --git a/.github/workflows/testflinger/scripts/setup.sh b/.github/workflows/testflinger/scripts/setup.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+set -ex
+
+# Refresh the apt package index on a best-effort basis.
+apt_update() {
+  # some nodes fail to access the repos, so a failed update must not abort the script
+  sudo apt-get update || true
+}
+
+# Install the Docker snap from the requested channel and verify the CLI works.
+install_docker() {
+  # SNAP_CHANNEL may be set by the caller, or replaced in CI
+  DOCKER_SNAP_CHANNEL=$SNAP_CHANNEL
+  # fall back to the edge channel when no channel was provided
+  [[ -n "$DOCKER_SNAP_CHANNEL" ]] || DOCKER_SNAP_CHANNEL="latest/edge"
+
+  # install docker-snap
+  sudo snap install docker --channel="$DOCKER_SNAP_CHANNEL"
+
+  # check the installation
+  if ! docker --version; then
+    exit 1
+  fi
+}
+
+# Install the NVIDIA driver, CUDA toolkit and NVIDIA container toolkit on
+# classic Ubuntu, using NVIDIA's apt repositories.
+setup_classic() {
+  . /etc/os-release
+  # Drop the dot from VERSION_ID (e.g. "24.04" -> "2404") for the repo path
+  UBUNTU_VERSION="${VERSION_ID//./}"
+
+  apt_update
+  sudo apt-get install -y curl
+
+  # Download with curl, which was installed above, rather than relying on
+  # wget being present; -f makes HTTP errors fail the command
+  curl -fsSLO "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$UBUNTU_VERSION/x86_64/cuda-keyring_1.1-1_all.deb"
+  sudo dpkg -i cuda-keyring_1.1-1_all.deb
+
+  apt_update
+  sudo apt-get -y install cuda-toolkit-12-5
+  sudo apt-get install -y nvidia-driver-555-open
+  sudo apt-get install -y cuda-drivers-555
+
+  # Register the container toolkit repository, with its entries signed by
+  # the keyring written in the first step; -f keeps an HTTP error page from
+  # being written into the apt sources list
+  curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg &&
+    curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
+      sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+
+  apt_update
+  sudo apt-get install -y nvidia-container-toolkit
+}
+
+# Install the NVIDIA snaps used on Ubuntu Core.
+setup_core() {
+  sudo snap install nvidia-core22
+  sudo snap install nvidia-assemble --channel=22/stable
+}
+
+setup() {
+  . /etc/os-release
+
+  install_docker
+
+  if [[ $ID == "ubuntu" ]]; then
+    setup_classic
+
+  elif [[ $ID == "ubuntu-core" ]]; then
+    setup_core
+
+  else
+    echo "Unexpected operating system ID: $ID"
+    exit 1
+  fi
+}
+
+setup
+
+echo "A reboot is required!"
diff --git a/.github/workflows/testflinger/scripts/test.sh b/.github/workflows/testflinger/scripts/test.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -ex
+
+# Test nvidia-smi
+# Runs nvidia-smi in an ubuntu container with the nvidia runtime; the way the
+# binary is located depends on the host operating system ID.
+smi_test() {
+  # provides ID, the operating system identifier
+  . /etc/os-release
+
+  if [[ $ID == "ubuntu" ]]; then
+    sudo docker run --rm --runtime=nvidia --gpus all --env PATH="${PATH}:/var/lib/snapd/hostfs/usr/bin" ubuntu nvidia-smi || exit 1
+  elif [[ $ID == "ubuntu-core" ]]; then
+    # No -it here: when this script runs over a non-interactive ssh session
+    # there is no TTY, and docker refuses to start with -t in that case
+    sudo docker run --rm --runtime=nvidia --gpus all ubuntu bash -c "/snap/docker/*/graphics/bin/nvidia-smi" || exit 1
+  else
+    echo "Unexpected operating system ID: $ID"
+    exit 1
+  fi
+}
+
+# Test a vector addition sample workload
+vector_add_test() {
+  sudo docker run --rm --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2 || exit 1
+}
+
+smi_test
+
+vector_add_test
diff --git a/.github/workflows/testflinger/scripts/wait_for_port.sh b/.github/workflows/testflinger/scripts/wait_for_port.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+set -ex
+
+# install dependencies
+sudo apt install -y netcat
+
+# check connection to the device
+while ! nc -z $1 $2; do
+  echo "Waiting for $1:$2 ..."
+  sleep 10
+done