From 8642e2dea87d9b2e16d99844dfd17381a8931e15 Mon Sep 17 00:00:00 2001
From: Vivek Miglani
Date: Mon, 21 Oct 2024 07:49:11 -0700
Subject: [PATCH] Add auto retries for Captum OSS GitHub Actions

Summary:
We frequently see sporadic failures in Captum GitHub actions test workflows,
often related to package download, http errors, conda environment setup, etc.

We add auto-retries to automatically retry failed workflows rather than
needing to do this manually.

Differential Revision: D64693773
---
Review notes (free text after the `---` separator; not part of the commit
message):

* How it works: each test workflow gets an `auto-retry` job that, on the
  first failed attempt only (`github.run_attempt < 2`), dispatches
  `retry.yml` with the current run id. `retry.yml` waits for that run to
  finish (`gh run watch`) and then re-runs only its failed jobs
  (`gh run rerun --failed`).
* The `auto-retry` jobs dispatch `retry.yml`, which is the workflow file this
  patch creates. The earlier `retry_build.yml` target is not created anywhere
  in this patch.
* test-conda-cpu.yml keeps its existing nesting under `on:`, `env:` and
  `jobs:`. The earlier revision of this hunk moved those children to column
  0, leaving `on:`, `env:` and `jobs:` empty. The hunk is now a pure addition
  at the end of the file.
* `auto-retry` in test-conda-cpu.yml declares `needs: tests`: a job-level
  `failure()` only considers ancestor jobs, so without `needs:` the condition
  is never true. NOTE(review): the three pip workflows need the same `needs:`
  entry, but their test job ids are not visible in these hunks - confirm the
  ids and add it.
* NOTE(review): context-line indentation in the pip hunks was reconstructed
  as 8 spaces (script body under jobs.<id>.with.script) - verify against the
  tree, or apply with `git apply --ignore-whitespace`.
* NOTE(review): the post-image blob ids on the `index` lines of the four
  modified workflows predate these edits; regenerate the patch with
  `git format-patch` after applying.

 .github/workflows/retry.yml                  | 19 +++++++++++++++++++
 .github/workflows/test-conda-cpu.yml         | 15 +++++++++++++++
 .github/workflows/test-pip-cpu-with-mypy.yml | 14 ++++++++++++++
 .github/workflows/test-pip-cpu.yml           | 14 ++++++++++++++
 .github/workflows/test-pip-gpu.yml           | 14 ++++++++++++++
 5 files changed, 76 insertions(+)
 create mode 100644 .github/workflows/retry.yml

diff --git a/.github/workflows/retry.yml b/.github/workflows/retry.yml
new file mode 100644
index 0000000000..8acb101f9f
--- /dev/null
+++ b/.github/workflows/retry.yml
@@ -0,0 +1,19 @@
+name: Retry Test
+on:
+  workflow_dispatch:
+    inputs:
+      run_id:
+        required: true
+jobs:
+  rerun-on-failure:
+    permissions: write-all
+    runs-on: ubuntu-latest
+    steps:
+      - name: rerun ${{ inputs.run_id }}
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh run watch ${{ inputs.run_id }} > /dev/null 2>&1
+          gh run rerun ${{ inputs.run_id }} --failed
diff --git a/.github/workflows/test-conda-cpu.yml b/.github/workflows/test-conda-cpu.yml
index 3295edcca8..3496aa7485 100644
--- a/.github/workflows/test-conda-cpu.yml
+++ b/.github/workflows/test-conda-cpu.yml
@@ -32,3 +32,18 @@ jobs:
 
         # Run Tests
         python3 -m pytest -ra --cov=. --cov-report term-missing
+
+  auto-retry:
+    name: Auto retry on failure
+    if: failure() && fromJSON(github.run_attempt) < 2
+    needs: tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Start rerun workflow
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh workflow run retry.yml \
+            -F run_id=${{ github.run_id }}
diff --git a/.github/workflows/test-pip-cpu-with-mypy.yml b/.github/workflows/test-pip-cpu-with-mypy.yml
index 7e166261e4..e4090611e4 100644
--- a/.github/workflows/test-pip-cpu-with-mypy.yml
+++ b/.github/workflows/test-pip-cpu-with-mypy.yml
@@ -25,3 +25,17 @@ jobs:
         ./scripts/run_mypy.sh
         # Run Tests
         python3 -m pytest -ra --cov=. --cov-report term-missing
+
+  auto-retry:
+    name: Auto retry on failure
+    if: failure() && fromJSON(github.run_attempt) < 2
+    runs-on: ubuntu-latest
+    steps:
+      - name: Start rerun workflow
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh workflow run retry.yml \
+            -F run_id=${{ github.run_id }}
diff --git a/.github/workflows/test-pip-cpu.yml b/.github/workflows/test-pip-cpu.yml
index a83f18e05d..2b4826abee 100644
--- a/.github/workflows/test-pip-cpu.yml
+++ b/.github/workflows/test-pip-cpu.yml
@@ -35,3 +35,17 @@ jobs:
         ./scripts/install_via_pip.sh ${{ matrix.pytorch_args }} ${{ matrix.transformers_args }}
         # Run Tests
         python3 -m pytest -ra --cov=. --cov-report term-missing
+
+  auto-retry:
+    name: Auto retry on failure
+    if: failure() && fromJSON(github.run_attempt) < 2
+    runs-on: ubuntu-latest
+    steps:
+      - name: Start rerun workflow
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh workflow run retry.yml \
+            -F run_id=${{ github.run_id }}
diff --git a/.github/workflows/test-pip-gpu.yml b/.github/workflows/test-pip-gpu.yml
index 117f515f48..9dde898794 100644
--- a/.github/workflows/test-pip-gpu.yml
+++ b/.github/workflows/test-pip-gpu.yml
@@ -30,3 +30,17 @@ jobs:
 
         # Run Tests
         python3 -m pytest -ra --cov=. --cov-report term-missing
+
+  auto-retry:
+    name: Auto retry on failure
+    if: failure() && fromJSON(github.run_attempt) < 2
+    runs-on: ubuntu-latest
+    steps:
+      - name: Start rerun workflow
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh workflow run retry.yml \
+            -F run_id=${{ github.run_id }}