From 8642e2dea87d9b2e16d99844dfd17381a8931e15 Mon Sep 17 00:00:00 2001
From: Vivek Miglani <vivekm@meta.com>
Date: Mon, 21 Oct 2024 07:49:11 -0700
Subject: [PATCH] Add auto retries for Captum OSS GitHub Actions

Summary:
We frequently see sporadic failures in Captum GitHub Actions test workflows, often related to package downloads, HTTP errors, conda environment setup, etc.

We add auto-retries to automatically retry failed workflows rather than needing to do this manually.

Differential Revision: D64693773
---
 .github/workflows/retry.yml                  | 19 ++++++
 .github/workflows/test-conda-cpu.yml         | 62 ++++++++++++--------
 .github/workflows/test-pip-cpu-with-mypy.yml | 14 +++++
 .github/workflows/test-pip-cpu.yml           | 14 +++++
 .github/workflows/test-pip-gpu.yml           | 14 +++++
 5 files changed, 99 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/retry.yml

diff --git a/.github/workflows/retry.yml b/.github/workflows/retry.yml
new file mode 100644
index 0000000000..8acb101f9f
--- /dev/null
+++ b/.github/workflows/retry.yml
@@ -0,0 +1,19 @@
+name: Retry Test
+on:
+  workflow_dispatch:  # dispatched on demand; the run to retry is passed as an input
+    inputs:
+      run_id:  # ID of the workflow run to watch and then re-run
+        required: true
+jobs:
+  rerun-on-failure:
+    permissions: write-all  # NOTE(review): broader than this job appears to need — consider narrowing
+    runs-on: ubuntu-latest
+    steps:  # wait for the given run to finish, then re-run only its failed jobs
+      - name: rerun ${{ inputs.run_id }}
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh run watch ${{ inputs.run_id }} > /dev/null 2>&1
+          gh run rerun ${{ inputs.run_id }} --failed
diff --git a/.github/workflows/test-conda-cpu.yml b/.github/workflows/test-conda-cpu.yml
index 3295edcca8..3496aa7485 100644
--- a/.github/workflows/test-conda-cpu.yml
+++ b/.github/workflows/test-conda-cpu.yml
@@ -1,34 +1,48 @@
 name: Unit-tests for Conda install
 
 on:
   pull_request:
   push:
     branches:
       - master
 
   workflow_dispatch:
 
 env:
   CHANNEL: "nightly"
 
 jobs:
   tests:
     strategy:
       matrix:
         python_version: ["3.8", "3.9", "3.10", "3.11"]
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       runner: linux.12xlarge
       repository: pytorch/captum
       script: |
         # Set up Environment Variables
         export PYTHON_VERSION="${{ matrix.python_version }}"
 
         # Create Conda Env
         conda create -yp ci_env python="${PYTHON_VERSION}"
         conda activate /pytorch/captum/ci_env
         ./scripts/install_via_conda.sh -n
 
         # Run Tests
         python3 -m pytest -ra --cov=. --cov-report term-missing
+
+  auto-retry:  # on a first-attempt failure, dispatch retry.yml to re-run the failed jobs
+    name: Auto retry on failure
+    needs: tests  # failure() below only reflects jobs listed in needs
+    if: failure() && fromJSON(github.run_attempt) < 2  # retry at most once
+    runs-on: ubuntu-latest
+    steps:
+      - name: Start rerun workflow
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh workflow run retry.yml -F run_id=${{ github.run_id }}
diff --git a/.github/workflows/test-pip-cpu-with-mypy.yml b/.github/workflows/test-pip-cpu-with-mypy.yml
index 7e166261e4..e4090611e4 100644
--- a/.github/workflows/test-pip-cpu-with-mypy.yml
+++ b/.github/workflows/test-pip-cpu-with-mypy.yml
@@ -25,3 +25,17 @@ jobs:
         ./scripts/run_mypy.sh
         # Run Tests
         python3 -m pytest -ra --cov=. --cov-report term-missing
+
+  auto-retry:  # NOTE(review): without `needs: <test job id>`, failure() has no upstream job to observe
+    name: Auto retry on failure
+    if: failure() && fromJSON(github.run_attempt) < 2  # retry at most once
+    runs-on: ubuntu-latest
+    steps:
+      - name: Start rerun workflow
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh workflow run retry.yml \
+            -F run_id=${{ github.run_id }}
diff --git a/.github/workflows/test-pip-cpu.yml b/.github/workflows/test-pip-cpu.yml
index a83f18e05d..2b4826abee 100644
--- a/.github/workflows/test-pip-cpu.yml
+++ b/.github/workflows/test-pip-cpu.yml
@@ -35,3 +35,17 @@ jobs:
         ./scripts/install_via_pip.sh ${{ matrix.pytorch_args }} ${{ matrix.transformers_args }}
         # Run Tests
         python3 -m pytest -ra --cov=. --cov-report term-missing
+
+  auto-retry:  # NOTE(review): without `needs: <test job id>`, failure() has no upstream job to observe
+    name: Auto retry on failure
+    if: failure() && fromJSON(github.run_attempt) < 2  # retry at most once
+    runs-on: ubuntu-latest
+    steps:
+      - name: Start rerun workflow
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh workflow run retry.yml \
+            -F run_id=${{ github.run_id }}
diff --git a/.github/workflows/test-pip-gpu.yml b/.github/workflows/test-pip-gpu.yml
index 117f515f48..9dde898794 100644
--- a/.github/workflows/test-pip-gpu.yml
+++ b/.github/workflows/test-pip-gpu.yml
@@ -30,3 +30,17 @@ jobs:
 
         # Run Tests
         python3 -m pytest -ra --cov=. --cov-report term-missing
+
+  auto-retry:  # NOTE(review): without `needs: <test job id>`, failure() has no upstream job to observe
+    name: Auto retry on failure
+    if: failure() && fromJSON(github.run_attempt) < 2  # retry at most once
+    runs-on: ubuntu-latest
+    steps:
+      - name: Start rerun workflow
+        env:
+          GH_REPO: ${{ github.repository }}
+          GH_TOKEN: ${{ github.token }}
+          GH_DEBUG: api
+        run: |
+          gh workflow run retry.yml \
+            -F run_id=${{ github.run_id }}