From 3cf4c1d89b3de79a58756f93191a98f852a1f693 Mon Sep 17 00:00:00 2001
From: Lincoln Wallace
Date: Wed, 7 Aug 2024 10:23:59 -0300
Subject: [PATCH] Add Github workflow to run Nvidia tests via Testflinger (#165)

Signed-off-by: Lincoln Wallace
Co-authored-by: Farshid Tavakolizadeh
---
 .github/workflows/nvidia-test.yml             | 54 ++++++++++++++
 .github/workflows/testflinger/README.md       | 28 +++++++
 .github/workflows/testflinger/nvidia-job.yaml | 36 +++++++++
 .../workflows/testflinger/scripts/setup.sh    | 73 +++++++++++++++++++
 .github/workflows/testflinger/scripts/test.sh | 25 +++++++
 .../testflinger/scripts/wait_for_port.sh      | 11 +++
 6 files changed, 227 insertions(+)
 create mode 100644 .github/workflows/nvidia-test.yml
 create mode 100644 .github/workflows/testflinger/README.md
 create mode 100644 .github/workflows/testflinger/nvidia-job.yaml
 create mode 100755 .github/workflows/testflinger/scripts/setup.sh
 create mode 100755 .github/workflows/testflinger/scripts/test.sh
 create mode 100755 .github/workflows/testflinger/scripts/wait_for_port.sh

diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml
new file mode 100644
index 0000000..f7d0bb0
--- /dev/null
+++ b/.github/workflows/nvidia-test.yml
@@ -0,0 +1,54 @@
+name: Nvidia Graphics Tests
+
+on:
+  # Uncomment this trigger only during workflow development
+  # pull_request:
+  #   branches: [ main ]
+  # Manual trigger
+  workflow_dispatch:
+    inputs:
+      snap-channel:
+        description: 'Docker snap channel'
+        required: true
+        type: string
+        default: 'latest/edge'
+
+jobs:
+  test:
+    runs-on: [self-hosted, testflinger]
+    env:
+      TESTFLINGER_DIR: .github/workflows/testflinger
+    strategy:
+      fail-fast: true
+      matrix:
+        job-queue:
+          - 202007-28059
+          # - 202008-2816s7
+          # - 202112-29789
+          # noprovision node, for CI testing
+          # - 202302-31212
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Create Testflinger job queue
+        env: # passed via the environment, not inlined into the script text, to avoid shell injection
+          JOB_QUEUE: ${{ matrix.job-queue }}
+          SNAP_CHANNEL: ${{ inputs.snap-channel }}
+        run: |
+          envsubst '$JOB_QUEUE' \
+            < $TESTFLINGER_DIR/nvidia-job.yaml \
+            > $TESTFLINGER_DIR/nvidia-job.temp
+
+          envsubst '$SNAP_CHANNEL' \
+            < $TESTFLINGER_DIR/scripts/setup.sh \
+            > $TESTFLINGER_DIR/scripts/setup.temp
+
+          mv $TESTFLINGER_DIR/nvidia-job.temp $TESTFLINGER_DIR/nvidia-job.yaml
+          mv $TESTFLINGER_DIR/scripts/setup.temp $TESTFLINGER_DIR/scripts/setup.sh
+
+      - name: Submit Testflinger job
+        uses: canonical/testflinger/.github/actions/submit@main
+        with:
+          poll: true
+          job-path: ${{ env.TESTFLINGER_DIR }}/nvidia-job.yaml
diff --git a/.github/workflows/testflinger/README.md b/.github/workflows/testflinger/README.md
new file mode 100644
index 0000000..901433f
--- /dev/null
+++ b/.github/workflows/testflinger/README.md
@@ -0,0 +1,28 @@
+# Testflinger scripts
+
+This directory contains the scripts used for Nvidia testing via GitHub Actions and Testflinger.
+The tests run on devices within Canonical's test farm.
+
+## Run locally
+Running the tests locally is only possible if your machine has access to the Testflinger server.
+
+Export the following variables, replacing the placeholders with real values:
+```bash
+export JOB_QUEUE=<job-queue> SNAP_CHANNEL=<snap-channel>
+```
+
+Then, modify the files:
+```bash
+envsubst '$JOB_QUEUE' < nvidia-job.yaml > temp-job.yaml
+
+envsubst '$SNAP_CHANNEL' < scripts/setup.sh > scripts/temp-setup.sh
+
+sed -i "s|setup.sh|temp-setup.sh|" temp-job.yaml
+
+sed -i "s|.github/workflows/testflinger/||" temp-job.yaml
+```
+
+Finally, submit the job:
+```bash
+testflinger submit --poll temp-job.yaml
+```
\ No newline at end of file
diff --git a/.github/workflows/testflinger/nvidia-job.yaml b/.github/workflows/testflinger/nvidia-job.yaml
new file mode 100644
index 0000000..929dc58
--- /dev/null
+++ b/.github/workflows/testflinger/nvidia-job.yaml
@@ -0,0 +1,36 @@
+# This is a template
+# Some variables should be replaced with envsubst before use
+job_queue: $JOB_QUEUE
+global_timeout: 3600
+output_timeout: 1800
+provision_data:
+  distro: "noble"
+test_data:
+
+  # Copy files from the GH runner to the Testflinger Agent
+  attachments:
+    - local: ".github/workflows/testflinger/scripts"
+      agent: "scripts"
+
+  # Run commands on the Testflinger Agent
+  test_cmds: |
+    #!/usr/bin/env bash
+    set -ex
+
+    # List the attached files
+    find attachments/test
+
+    SCRIPTS=./attachments/test/scripts
+
+    echo "Testing: DEVICE_IP = $DEVICE_IP"
+    # Setup the environment on the target device
+    ssh ubuntu@$DEVICE_IP "$(< $SCRIPTS/setup.sh)"
+
+    # Reboot the device in background to avoid breaking the SSH connection prematurely
+    ssh ubuntu@$DEVICE_IP "(sleep 3 && sudo reboot) &"
+
+    echo "Wait for the device to boot and start its SSH server"
+    $SCRIPTS/wait_for_port.sh $DEVICE_IP 22
+
+    # Run the tests
+    ssh ubuntu@$DEVICE_IP "$(< $SCRIPTS/test.sh)"
\ No newline at end of file
diff --git a/.github/workflows/testflinger/scripts/setup.sh b/.github/workflows/testflinger/scripts/setup.sh
new file mode 100755
index 0000000..9d16cf2
--- /dev/null
+++ b/.github/workflows/testflinger/scripts/setup.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+set -ex
+
+apt_update() {
+  # ignore errors, some nodes fail to access the repos
+  set +e
+  sudo apt-get update
+  set -e
+}
+
+install_docker() {
+  # SNAP_CHANNEL may be set by the caller, or replaced in CI (envsubst); quoted so the value stays one word
+  DOCKER_SNAP_CHANNEL="$SNAP_CHANNEL"
+  if [[ -z "$DOCKER_SNAP_CHANNEL" ]]; then
+    DOCKER_SNAP_CHANNEL="latest/edge"
+  fi
+
+  # install docker-snap
+  sudo snap install docker --channel="$DOCKER_SNAP_CHANNEL"
+
+  # check the installation
+  docker --version || exit 1
+}
+
+setup_classic() {
+  . /etc/os-release
+  UBUNTU_VERSION="${VERSION_ID//./}"
+
+  apt_update
+  sudo apt-get install -y curl
+
+  curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$UBUNTU_VERSION/x86_64/cuda-keyring_1.1-1_all.deb
+  sudo dpkg -i cuda-keyring_1.1-1_all.deb
+
+  apt_update
+  sudo apt-get -y install cuda-toolkit-12-5
+  sudo apt-get install -y nvidia-driver-555-open
+  sudo apt-get install -y cuda-drivers-555
+
+  curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg &&
+    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
+    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+
+  apt_update
+  sudo apt-get install -y nvidia-container-toolkit
+}
+
+setup_core() {
+  sudo snap install nvidia-core22
+  sudo snap install nvidia-assemble --channel 22/stable
+}
+
+setup() {
+  . /etc/os-release
+
+  install_docker
+
+  if [[ $ID == "ubuntu" ]]; then
+    setup_classic
+
+  elif [[ $ID == "ubuntu-core" ]]; then
+    setup_core
+
+  else
+    echo "Unexpected operating system ID: $ID"
+    exit 1
+  fi
+}
+
+setup
+
+echo "A reboot is required!"
diff --git a/.github/workflows/testflinger/scripts/test.sh b/.github/workflows/testflinger/scripts/test.sh
new file mode 100755
index 0000000..64cf984
--- /dev/null
+++ b/.github/workflows/testflinger/scripts/test.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -ex
+
+# Test nvidia-smi (no -it: this script is executed over SSH without a TTY, where docker rejects -t)
+smi_test() {
+  . /etc/os-release
+
+  if [[ $ID == "ubuntu" ]]; then
+    sudo docker run --rm --runtime=nvidia --gpus all --env PATH="${PATH}:/var/lib/snapd/hostfs/usr/bin" ubuntu nvidia-smi || exit 1
+  elif [[ $ID == "ubuntu-core" ]]; then
+    sudo docker run --rm --runtime=nvidia --gpus all ubuntu bash -c "/snap/docker/*/graphics/bin/nvidia-smi" || exit 1
+  else
+    echo "Unexpected operating system ID: $ID"
+    exit 1
+  fi
+}
+
+# Test a vector addition sample workload
+vector_add_test() {
+  sudo docker run --rm --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2 || exit 1
+}
+
+smi_test
+
+vector_add_test
diff --git a/.github/workflows/testflinger/scripts/wait_for_port.sh b/.github/workflows/testflinger/scripts/wait_for_port.sh
new file mode 100755
index 0000000..2e00a8a
--- /dev/null
+++ b/.github/workflows/testflinger/scripts/wait_for_port.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+set -ex
+
+# Usage: wait_for_port.sh HOST PORT -- install nc, then block until HOST accepts TCP connections on PORT
+sudo apt-get install -y netcat-openbsd
+
+# probe every 10s; -w 5 bounds each connection attempt so an unreachable host cannot stall a probe
+while ! nc -z -w 5 "$1" "$2"; do
+  echo "Waiting for $1:$2 ..."
+  sleep 10
+done