From 3a1237b59155c7cf776d0c134a9dc34418dea588 Mon Sep 17 00:00:00 2001 From: Fabrice Normandin Date: Wed, 24 Apr 2024 09:37:26 -0400 Subject: [PATCH] Run integration tests with all clusters periodically (#117) * Add reusable workflow used in build and DRAC tests - Extracts the integration test part of `build.yml` into a reusable workflow called `testing.yml`. - This workflow now also explicitly requires an existing SSH connection to be alive (a socket at the ControlPath of the SSH config) in order to run tests on the DRAC clusters - Also adds a new `full_cluster_tests.yml` workflow that is run periodically (once a week) and runs the integration tests on all slurm clusters (not just `'mila'`, as in `build.yml`). Signed-off-by: Fabrice Normandin * Also use test workflow for mock slurm cluster Signed-off-by: Fabrice Normandin * Partially revert last commit (mock slurm tests) Signed-off-by: Fabrice Normandin * Make the timeout-minutes a parameter Signed-off-by: Fabrice Normandin * Add timeout of 5 mins for setup of SLURM cluster Signed-off-by: Fabrice Normandin --------- Signed-off-by: Fabrice Normandin --- .github/workflows/build.yml | 47 +++++------------ .github/workflows/full_cluster_tests.yml | 30 +++++++++++ .github/workflows/testing.yml | 64 ++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 35 deletions(-) create mode 100644 .github/workflows/full_cluster_tests.yml create mode 100644 .github/workflows/testing.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ec1b9b0c..5e5a8629 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -76,8 +76,6 @@ jobs: strategy: max-parallel: 5 matrix: - # TODO: We should ideally also run this with Windows/Mac clients and a Linux - # server. Unsure how to set that up with GitHub Actions though. platform: [ubuntu-latest] python-version: ['3.8', '3.9', '3.10', '3.11'] @@ -98,6 +96,7 @@ jobs: # NOTE: Replacing this with our customized version of # - uses: koesterlab/setup-slurm-action@v1 - uses: ./.github/custom_setup_slurm_action + timeout-minutes: 5 - name: Test if the slurm cluster is setup correctly run: srun --nodes=1 --ntasks=1 --cpus-per-task=1 --mem=1G --time=00:01:00 hostname @@ -143,43 +142,21 @@ jobs: fail_ci_if_error: false real-slurm-integration-tests: + name: integration tests with a real SLURM cluster needs: [mock-slurm-integration-tests] - runs-on: self-hosted strategy: - max-parallel: 5 + max-parallel: 1 matrix: # TODO: We should ideally also run this with Windows/Mac clients and a Linux # server. Unsure how to set that up with GitHub Actions though. - python-version: ['3.8', '3.9', '3.10', '3.11'] - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install poetry - poetry install --with=dev - - - name: Launch integration tests - id: self_hosted_integration_tests - run: poetry run pytest --slow --cov=milatools --cov-report=xml --cov-append -vvv --log-level=DEBUG - timeout-minutes: 30 - env: - SLURM_CLUSTER: mila - - - name: Upload coverage reports to Codecov - uses: codecov/codecov-action@v3 - with: - file: ./coverage.xml - flags: integrationtests - env_vars: PLATFORM,PYTHON - name: codecov-umbrella - fail_ci_if_error: false + python-version: ['3.11'] + cluster: ['mila'] + uses: ./.github/workflows/testing.yml + with: + cluster: ${{ matrix.cluster }} + python-version: ${{ matrix.python-version }} + timeout-minutes: 30 + secrets: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/full_cluster_tests.yml b/.github/workflows/full_cluster_tests.yml new file mode 100644 index 00000000..4db46205 --- /dev/null +++ b/.github/workflows/full_cluster_tests.yml @@ -0,0 +1,30 @@ +# Run integration tests on a self-hosted runner using all slurm clusters. +on: + push: + branches: "master" # every time a push is made to `master`, OR + schedule: + - cron: "30 6 * * 1" # every Monday at 6:30 AM UTC (2:30 AM Montreal time) OR + workflow_dispatch: # when the workflow is manually triggered + +# https://stackoverflow.com/a/72408109/6388696 +# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-concurrency-to-cancel-any-in-progress-job-or-run +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + real-slurm-integration-tests: + name: integration tests with a real SLURM cluster + strategy: + max-parallel: 5 + matrix: + # TODO: Setup self-hosted runners inside Mac and Windows VMs. + python-version: ['3.11'] + cluster: ['mila', 'narval', 'beluga', 'cedar', 'graham'] + uses: ./.github/workflows/testing.yml + with: + cluster: ${{ matrix.cluster }} + python-version: ${{ matrix.python-version }} + timeout-minutes: 60 + secrets: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml new file mode 100644 index 00000000..176845ec --- /dev/null +++ b/.github/workflows/testing.yml @@ -0,0 +1,64 @@ +on: + workflow_call: + inputs: + cluster: + required: true + type: string + python-version: + required: false + type: string + default: '3.11' + timeout-minutes: + required: false + type: number + default: 30 + secrets: + CODECOV_TOKEN: + required: true + + workflow_dispatch: + + +jobs: + real-slurm-integration-tests: + name: integration tests with a real SLURM cluster + runs-on: self-hosted + + steps: + - name: Check that we have the pre-existing connection to the SLURM cluster. + # TODO: mila cluster doesn't use 2FA yet, so we can actually create the connection + # to run the tests; we don't need it to be already running. + if: ${{ inputs.cluster != 'mila' && inputs.cluster != 'localhost'}} + run: + # Check that the control socket is running on the self-hosted runner so + # that we don't have to go through 2FA on DRAC clusters. + ssh -O check -oStrictHostKeyChecking=no ${{ inputs.cluster }} + + - uses: actions/checkout@v4 + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install poetry + poetry install --with=dev + + - name: Launch integration tests + id: self_hosted_integration_tests + run: poetry run pytest --slow --cov=milatools --cov-report=xml --cov-append -vvv --log-level=DEBUG + timeout-minutes: ${{ inputs.timeout-minutes }} + env: + SLURM_CLUSTER: ${{ inputs.cluster }} + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ./coverage.xml + flags: integrationtests + env_vars: PLATFORM,PYTHON + name: codecov-umbrella + fail_ci_if_error: false