From c519f1fae8267ef5bf48be001534689ff3c993aa Mon Sep 17 00:00:00 2001
From: Chakru <161002324+chakru-r@users.noreply.github.com>
Date: Fri, 6 Dec 2024 20:44:48 +0530
Subject: [PATCH] test(smoke-tests): parallelise smoke test execution

---
 .github/workflows/docker-unified.yml         | 50 +++++++++++++------
 smoke-test/.gitignore                        |  4 +-
 smoke-test/build.gradle                      | 38 ++++++--------
 smoke-test/conftest.py                       | 52 ++++++++++++++++++++
 smoke-test/smoke.sh                          | 25 ++++++----
 smoke-test/tests/cypress/integration_test.py | 49 ++++++++++++------
 6 files changed, 155 insertions(+), 63 deletions(-)

diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index 03a9b3afc3bc5..47c26068347c0 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -1011,18 +1011,39 @@ jobs:
     needs: setup
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
+      cypress_batch_count: ${{ steps.set-batch-count.outputs.cypress_batch_count }}
+      python_batch_count: ${{ steps.set-batch-count.outputs.python_batch_count }}
     steps:
+      - id: set-batch-count
+        # Tests are split simply to ensure the configured number of batches for parallelization. This may need some
+        # increase as a new tests added increase the duration where an additional parallel batch helps.
+        # python_batch_count is used to split pytests in the smoke-test (batches of actual test functions)
+        # cypress_batch_count is used to split the collection of cypress test specs into batches.
+        run: |
+          echo "cypress_batch_count=11" >> "$GITHUB_OUTPUT"
+          echo "python_batch_count=5" >> "$GITHUB_OUTPUT"
+
       - id: set-matrix
+        # For m batches for python and n batches for cypress, we need a test matrix of python x m + cypress x n.
+        # while the github action matrix generation can handle these two parts individually, there isnt a way to use the
+        # two generated matrices for the same job. So, produce that matrix with scripting and use the include directive
+        # to add it to the test matrix.
         run: |
-          if [ '${{ needs.setup.outputs.frontend_only }}' == 'true' ]; then
-             echo 'matrix=["cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT"
-          elif [ '${{ needs.setup.outputs.ingestion_only }}' == 'true' ]; then
-             echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> "$GITHUB_OUTPUT"
-          elif [[ '${{ needs.setup.outputs.backend_change }}' == 'true' || '${{ needs.setup.outputs.smoke_test_change }}' == 'true' ]]; then
-             echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT"
-          else
-             echo 'matrix=[]' >> "$GITHUB_OUTPUT"
+          python_batch_count=${{ steps.set-batch-count.outputs.python_batch_count }}
+          python_matrix=$(printf "{\"test_strategy\":\"pytests\",\"batch\":\"0\",\"batch_count\":\"$python_batch_count\"}"; for ((i=1;i<python_batch_count;i++)); do printf ",{\"test_strategy\":\"pytests\", \"batch_count\":\"$python_batch_count\",\"batch\":\"%d\"}" $i; done)
+          
+          cypress_batch_count=${{ steps.set-batch-count.outputs.cypress_batch_count }}
+          cypress_matrix=$(printf "{\"test_strategy\":\"cypress\",\"batch\":\"0\",\"batch_count\":\"$cypress_batch_count\"}"; for ((i=1;i<cypress_batch_count;i++)); do printf ",{\"test_strategy\":\"cypress\", \"batch_count\":\"$cypress_batch_count\",\"batch\":\"%d\"}" $i; done)
+          
+          includes=''
+          if [[  "${{ needs.setup.outputs.frontend_only }}" == 'true' ]]; then
+            includes=$cypress_matrix
+          elif [ "${{ needs.setup.outputs.ingestion_only }}" == 'true' ]; then
+            includes=$python_matrix
+          elif [[ "${{ needs.setup.outputs.backend_change }}" == 'true' || "${{ needs.setup.outputs.smoke_test_change }}" == 'true' ]]; then
+            includes="$python_matrix,$cypress_matrix"
           fi
+          echo "matrix={\"include\":[$includes] }" >> "$GITHUB_OUTPUT"
 
   smoke_test:
     name: Run Smoke Tests
@@ -1043,8 +1064,7 @@ jobs:
       ]
     strategy:
       fail-fast: false
-      matrix:
-        test_strategy: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }}
+      matrix: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }}
     if: ${{ always() && !failure() && !cancelled() && needs.smoke_test_matrix.outputs.matrix != '[]' }}
     steps:
       - name: Free up disk space
@@ -1220,6 +1240,8 @@ jobs:
           CYPRESS_RECORD_KEY: ${{ secrets.CYPRESS_RECORD_KEY }}
           CLEANUP_DATA: "false"
           TEST_STRATEGY: ${{ matrix.test_strategy }}
+          BATCH_COUNT: ${{ matrix.batch_count }}
+          BATCH_NUMBER: ${{ matrix.batch }}
         run: |
           echo "$DATAHUB_VERSION"
           ./gradlew --stop
@@ -1230,25 +1252,25 @@ jobs:
         if: failure()
         run: |
           docker ps -a
-          TEST_STRATEGY="-${{ matrix.test_strategy }}"
+          TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}"
           source .github/scripts/docker_logs.sh
       - name: Upload logs
         uses: actions/upload-artifact@v3
         if: failure()
         with:
-          name: docker-logs-${{ matrix.test_strategy }}
+          name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }}
           path: "docker_logs/*.log"
           retention-days: 5
       - name: Upload screenshots
         uses: actions/upload-artifact@v3
         if: failure()
         with:
-          name: cypress-snapshots-${{ matrix.test_strategy }}
+          name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }}
           path: smoke-test/tests/cypress/cypress/screenshots/
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: Test Results (smoke tests) ${{ matrix.test_strategy }}
+          name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }}
           path: |
             **/build/reports/tests/test/**
             **/build/test-results/test/**
diff --git a/smoke-test/.gitignore b/smoke-test/.gitignore
index b8af2eef535a0..d8cfd65ff81b9 100644
--- a/smoke-test/.gitignore
+++ b/smoke-test/.gitignore
@@ -29,6 +29,8 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+**/cypress/node_modules
+
 
 # PyInstaller
 #  Usually these files are written by a python script from a template
@@ -132,4 +134,4 @@ dmypy.json
 # Pyre type checker
 .pyre/
 junit*
-tests/cypress/onboarding.json
\ No newline at end of file
+tests/cypress/onboarding.json
diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle
index 73ecdcb08ea14..60d08e0206cda 100644
--- a/smoke-test/build.gradle
+++ b/smoke-test/build.gradle
@@ -91,39 +91,31 @@ task pythonLintFix(type: Exec, dependsOn: installDev) {
  * The following tasks assume an already running quickstart.
  * ./gradlew quickstart (or another variation `quickstartDebug`)
  */
-task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
-    environment 'RUN_QUICKSTART', 'false'
-    environment 'TEST_STRATEGY', 'no_cypress_suite0'
-
-    workingDir = project.projectDir
-    commandLine 'bash', '-c',
-            "source ${venv_name}/bin/activate && set -x && " +
-            "./smoke.sh"
-}
+//  ./gradlew :smoke-test:pytest -PbatchNumber=2 (default 0)
+task pytest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
+    // Get BATCH_NUMBER from command line argument with default value of 0
+    def batchNumber = project.hasProperty('batchNumber') ? project.property('batchNumber') : '0'
 
-task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
     environment 'RUN_QUICKSTART', 'false'
-    environment 'TEST_STRATEGY', 'no_cypress_suite1'
+    environment 'TEST_STRATEGY', 'pytests'
+    environment 'BATCH_COUNT', 5
+    environment 'BATCH_NUMBER', batchNumber
 
     workingDir = project.projectDir
     commandLine 'bash', '-c',
             "source ${venv_name}/bin/activate && set -x && " +
-                    "./smoke.sh"
+            "./smoke.sh"
 }
 
-task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
-    environment 'RUN_QUICKSTART', 'false'
-    environment 'TEST_STRATEGY', 'cypress_suite1'
-
-    workingDir = project.projectDir
-    commandLine 'bash', '-c',
-            "source ${venv_name}/bin/activate && set -x && " +
-                    "./smoke.sh"
-}
+//  ./gradlew :smoke-test:cypressTest -PbatchNumber=2 (default 0)
+task cypressTest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
+    // Get BATCH_NUMBER from command line argument with default value of 0
+    def batchNumber = project.hasProperty('batchNumber') ? project.property('batchNumber') : '0'
 
-task cypressRest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) {
     environment 'RUN_QUICKSTART', 'false'
-    environment 'TEST_STRATEGY', 'cypress_rest'
+    environment 'TEST_STRATEGY', 'cypress'
+    environment 'BATCH_COUNT', 11
+    environment 'BATCH_NUMBER', batchNumber
 
     workingDir = project.projectDir
     commandLine 'bash', '-c',
diff --git a/smoke-test/conftest.py b/smoke-test/conftest.py
index 6d148db9886a4..d48a92b22ab48 100644
--- a/smoke-test/conftest.py
+++ b/smoke-test/conftest.py
@@ -1,6 +1,8 @@
 import os
 
 import pytest
+from typing import List, Tuple
+from _pytest.nodes import Item
 import requests
 from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
 
@@ -45,3 +47,53 @@ def graph_client(auth_session) -> DataHubGraph:
 def pytest_sessionfinish(session, exitstatus):
     """whole test run finishes."""
     send_message(exitstatus)
+
+
+def get_batch_start_end(num_tests: int) -> Tuple[int, int]:
+    batch_count_env = os.getenv("BATCH_COUNT", 1)
+    batch_count = int(batch_count_env)
+
+    batch_number_env = os.getenv("BATCH_NUMBER", 0)
+    batch_number = int(batch_number_env)
+
+    if batch_count == 0 or batch_count > num_tests:
+        raise ValueError(
+            f"Invalid batch count {batch_count}: must be >0 and <= {num_tests} (num_tests)"
+        )
+    if batch_number >= batch_count:
+        raise ValueError(
+            f"Invalid batch number: {batch_number}, must be less than {batch_count} (zer0 based index)"
+        )
+
+    batch_size = round(num_tests / batch_count)
+
+    batch_start = batch_size * batch_number
+    batch_end = batch_start + batch_size
+    # We must have exactly as many batches as specified by BATCH_COUNT.
+    if (
+        num_tests - batch_end < batch_size
+    ):  # We must have exactly as many batches as specified by BATCH_COUNT, put the remaining in the last batch.
+        batch_end = num_tests
+
+    if batch_count > 0:
+        print(f"Running tests for batch {batch_number} of {batch_count}")
+
+    return batch_start, batch_end
+
+
+def pytest_collection_modifyitems(
+    session: pytest.Session, config: pytest.Config, items: List[Item]
+) -> None:
+    if os.getenv("TEST_STRATEGY") == "cypress":
+        return  # We launch cypress via pytests, but needs a different batching mechanism at cypress level.
+
+    # If BATCH_COUNT and BATCH_ENV vars are set, splits the pytests to batches and runs filters only the BATCH_NUMBER
+    # batch for execution. Enables multiple parallel launches. Current implementation assumes all test are of equal
+    # weight for batching. TODO. A weighted batching method can help make batches more equal sized by cost.
+    # this effectively is a no-op if BATCH_COUNT=1
+    start_index, end_index = get_batch_start_end(num_tests=len(items))
+
+    items.sort(key=lambda x: x.nodeid)  # we want the order to be stable across batches
+    # replace items with the filtered list
+    print(f"Running tests for batch {start_index}-{end_index}")
+    items[:] = items[start_index:end_index]
diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh
index 888a60f488e1f..ec8188ebf5f4d 100755
--- a/smoke-test/smoke.sh
+++ b/smoke-test/smoke.sh
@@ -34,15 +34,20 @@ source ./set-cypress-creds.sh
 # set environment variables for the test
 source ./set-test-env-vars.sh
 
-# no_cypress_suite0, no_cypress_suite1, cypress_suite1, cypress_rest
-if [[ -z "${TEST_STRATEGY}" ]]; then
-    pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke.xml
+# TEST_STRATEGY:
+#   if set to pytests, runs all pytests, skips cypress tests(though cypress test launch is via  a pytest).
+#   if set tp cypress, runs all cypress tests
+#   if blank, runs all.
+# When invoked via the github action, BATCH_COUNT and BATCH_NUM env vars are set to run a slice of those tests per
+# worker for parallelism. docker-unified.yml generates a test matrix of pytests/cypress in batches. As number of tests
+# increase, the batch_count config (in docker-unified.yml) may need adjustment.
+if [[ "${TEST_STRATEGY}" == "pytests" ]]; then
+  #pytests only - github test matrix runs pytests in one of the runners when applicable.
+  pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-pytests.xml -k 'not test_run_cypress'
+elif [[ "${TEST_STRATEGY}" == "cypress" ]]; then
+  # run only cypress tests. The test inspects BATCH_COUNT and BATCH_NUMBER and runs only a subset of tests in that batch.
+  # github workflow test matrix will invoke this in multiple runners for each batch.
+  pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-cypress${BATCH_NUMBER}.xml tests/cypress/integration_test.py
 else
-    if [ "$TEST_STRATEGY" == "no_cypress_suite0" ]; then
-        pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_non_cypress.xml -k 'not test_run_cypress' -m 'not no_cypress_suite1'
-    elif [ "$TEST_STRATEGY" == "no_cypress_suite1" ]; then
-        pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_non_cypress.xml -m 'no_cypress_suite1'
-    else
-        pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_cypress_${TEST_STRATEGY}.xml tests/cypress/integration_test.py
-    fi
+  pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-all.xml
 fi
diff --git a/smoke-test/tests/cypress/integration_test.py b/smoke-test/tests/cypress/integration_test.py
index 0d824a96810d0..33c67a923c278 100644
--- a/smoke-test/tests/cypress/integration_test.py
+++ b/smoke-test/tests/cypress/integration_test.py
@@ -1,10 +1,11 @@
 import datetime
 import os
 import subprocess
-from typing import List, Set
+from typing import List
 
 import pytest
 
+from conftest import get_batch_start_end
 from tests.setup.lineage.ingest_time_lineage import (
     get_time_lineage_urns,
     ingest_time_lineage,
@@ -169,10 +170,29 @@ def ingest_cleanup_data(auth_session, graph_client):
     print("deleted onboarding data")
 
 
-def _get_spec_map(items: Set[str]) -> str:
-    if len(items) == 0:
-        return ""
-    return ",".join([f"**/{item}/*.js" for item in items])
+def _get_js_files(base_path: str):
+    file_paths = []
+    for root, dirs, files in os.walk(base_path):
+        for file in files:
+            if file.endswith(".js"):
+                file_paths.append(os.path.relpath(os.path.join(root, file), base_path))
+    return sorted(file_paths)  # sort to make the order stable across batch runs
+
+
+def _get_cypress_tests_batch():
+    """
+    Batching is configured via env vars BATCH_COUNT and BATCH_NUMBER.  All cypress tests are split into exactly
+    BATCH_COUNT batches. When BATCH_NUMBER env var is set (zero based index), that batch alone is run.
+    Github workflow via test_matrix, runs all batches in parallel to speed up the test elapsed time.
+    If either of these vars are not set, all tests are run sequentially.
+    :return:
+    """
+    all_tests = _get_js_files("tests/cypress/cypress/e2e")
+
+    batch_start, batch_end = get_batch_start_end(num_tests=len(all_tests))
+
+    return all_tests[batch_start:batch_end]
+    # return test_batches[int(batch_number)]  #if BATCH_NUMBER was set, we this test just runs that one batch.
 
 
 def test_run_cypress(auth_session):
@@ -182,24 +202,23 @@ def test_run_cypress(auth_session):
     test_strategy = os.getenv("TEST_STRATEGY", None)
     if record_key:
         record_arg = " --record "
-        tag_arg = f" --tag {test_strategy} "
+        batch_number = os.getenv("BATCH_NUMBER")
+        batch_count = os.getenv("BATCH_COUNT")
+        if batch_number and batch_count:
+            batch_suffix = f"-{batch_number}{batch_count}"
+        else:
+            batch_suffix = ""
+        tag_arg = f" --tag {test_strategy}{batch_suffix}"
     else:
         record_arg = " "
 
     rest_specs = set(os.listdir("tests/cypress/cypress/e2e"))
     cypress_suite1_specs = {"mutations", "search", "views"}
     rest_specs.difference_update(set(cypress_suite1_specs))
-    strategy_spec_map = {
-        "cypress_suite1": cypress_suite1_specs,
-        "cypress_rest": rest_specs,
-    }
     print(f"test strategy is {test_strategy}")
     test_spec_arg = ""
-    if test_strategy is not None:
-        specs = strategy_spec_map.get(test_strategy)
-        assert specs is not None
-        specs_str = _get_spec_map(specs)
-        test_spec_arg = f" --spec '{specs_str}' "
+    specs_str = ",".join([f"**/{f}" for f in _get_cypress_tests_batch()])
+    test_spec_arg = f" --spec '{specs_str}' "
 
     print("Running Cypress tests with command")
     command = f"NO_COLOR=1 npx cypress run {record_arg} {test_spec_arg} {tag_arg}"