diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ec1b9b0c..5e5a8629 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -76,8 +76,6 @@ jobs:
     strategy:
       max-parallel: 5
       matrix:
-        # TODO: We should ideally also run this with Windows/Mac clients and a Linux
-        # server. Unsure how to set that up with GitHub Actions though.
         platform: [ubuntu-latest]
         python-version: ['3.8', '3.9', '3.10', '3.11']
 
@@ -98,6 +96,7 @@ jobs:
       # NOTE: Replacing this with our customized version of
       # - uses: koesterlab/setup-slurm-action@v1
       - uses: ./.github/custom_setup_slurm_action
+        timeout-minutes: 5
 
       - name: Test if the slurm cluster is setup correctly
         run: srun --nodes=1 --ntasks=1 --cpus-per-task=1 --mem=1G --time=00:01:00 hostname
@@ -143,43 +142,21 @@ jobs:
           fail_ci_if_error: false
 
   real-slurm-integration-tests:
+    name: integration tests with a real SLURM cluster
 
     needs: [mock-slurm-integration-tests]
-    runs-on: self-hosted
 
     strategy:
-      max-parallel: 5
+      max-parallel: 1
       matrix:
         # TODO: We should ideally also run this with Windows/Mac clients and a Linux
         # server. Unsure how to set that up with GitHub Actions though.
-        python-version: ['3.8', '3.9', '3.10', '3.11']
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install poetry
-          poetry install --with=dev
-
-      - name: Launch integration tests
-        id: self_hosted_integration_tests
-        run: poetry run pytest --slow --cov=milatools --cov-report=xml --cov-append -vvv --log-level=DEBUG
-        timeout-minutes: 30
-        env:
-          SLURM_CLUSTER: mila
-
-      - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@v3
-        with:
-          file: ./coverage.xml
-          flags: integrationtests
-          env_vars: PLATFORM,PYTHON
-          name: codecov-umbrella
-          fail_ci_if_error: false
+        python-version: ['3.11']
+        cluster: ['mila']
+    uses: ./.github/workflows/testing.yml
+    with:
+      cluster: ${{ matrix.cluster }}
+      python-version: ${{ matrix.python-version }}
+      timeout-minutes: 30
+    secrets:
+      CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/full_cluster_tests.yml b/.github/workflows/full_cluster_tests.yml
new file mode 100644
index 00000000..4db46205
--- /dev/null
+++ b/.github/workflows/full_cluster_tests.yml
@@ -0,0 +1,30 @@
+# Run integration tests on a self-hosted runner using all slurm clusters.
+on:
+  push:
+    branches: "master"  # every time a push is made to `master`, OR
+  schedule:
+    - cron: "30 6 * * 1"  # every Monday at 6:30 AM UTC (2:30 AM Montreal time) OR
+  workflow_dispatch:  # when the workflow is manually triggered
+
+# https://stackoverflow.com/a/72408109/6388696
+# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-concurrency-to-cancel-any-in-progress-job-or-run
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  real-slurm-integration-tests:
+    name: integration tests with a real SLURM cluster
+    strategy:
+      max-parallel: 5
+      matrix:
+        # TODO: Setup self-hosted runners inside Mac and Windows VMs.
+        python-version: ['3.11']
+        cluster: ['mila', 'narval', 'beluga', 'cedar', 'graham']
+    uses: ./.github/workflows/testing.yml
+    with:
+      cluster: ${{ matrix.cluster }}
+      python-version: ${{ matrix.python-version }}
+      timeout-minutes: 60
+    secrets:
+      CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
new file mode 100644
index 00000000..176845ec
--- /dev/null
+++ b/.github/workflows/testing.yml
@@ -0,0 +1,81 @@
+on:
+  workflow_call:
+    inputs:
+      cluster:
+        required: true
+        type: string
+      python-version:
+        required: false
+        type: string
+        default: '3.11'
+      timeout-minutes:
+        required: false
+        type: number
+        default: 30
+    secrets:
+      CODECOV_TOKEN:
+        required: true
+
+  # Manual runs need their own input declarations: the `workflow_call` inputs
+  # above do not apply to `workflow_dispatch`, so `inputs.cluster` would be empty.
+  workflow_dispatch:
+    inputs:
+      cluster:
+        description: SLURM cluster to run the integration tests on
+        required: true
+        type: string
+        default: mila
+      python-version:
+        description: Python version used to run the tests
+        required: false
+        type: string
+        default: '3.11'
+
+
+jobs:
+  real-slurm-integration-tests:
+    name: integration tests with a real SLURM cluster
+    runs-on: self-hosted
+
+    steps:
+      - name: Check that we have the pre-existing connection to the SLURM cluster.
+        # TODO: mila cluster doesn't use 2FA yet, so we can actually create the connection
+        # to run the tests; we don't need it to be already running.
+        if: ${{ inputs.cluster != 'mila' && inputs.cluster != 'localhost' }}
+        env:
+          # Passed through the environment rather than interpolated into the
+          # script, so the cluster name is never parsed as shell syntax.
+          CLUSTER: ${{ inputs.cluster }}
+        # Check that the control socket is running on the self-hosted runner so
+        # that we don't have to go through 2FA on DRAC clusters.
+        run: ssh -O check -oStrictHostKeyChecking=no "$CLUSTER"
+
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ inputs.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ inputs.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install poetry
+          poetry install --with=dev
+
+      - name: Launch integration tests
+        id: self_hosted_integration_tests
+        run: poetry run pytest --slow --cov=milatools --cov-report=xml --cov-append -vvv --log-level=DEBUG
+        # Manual (workflow_dispatch) runs declare no `timeout-minutes` input; use 30.
+        timeout-minutes: ${{ inputs.timeout-minutes || 30 }}
+        env:
+          SLURM_CLUSTER: ${{ inputs.cluster }}
+
+      - name: Upload coverage reports to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: ./coverage.xml
+          flags: integrationtests
+          env_vars: PLATFORM,PYTHON
+          name: codecov-umbrella
+          fail_ci_if_error: false