From 54c17e0cca8dddd6dcace1598d905a89a268173e Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Fri, 2 Feb 2024 09:53:24 +0100
Subject: [PATCH] Add FSDP test to Gaudi2 CI (#683)

---
Notes (ignored by `git am`; not part of the commit message):
- Adds an `fsdp` job to the Gaudi2 slow-test workflow. It waits on the
  `deepspeed` job and runs `make slow_tests_fsdp` inside the Gaudi PyTorch
  container with GAUDI2_CI=1 set.
- `multi-card` now waits on `fsdp` instead of `deepspeed`, so the jobs on
  the self-hosted Gaudi2 runner keep running one after another.
- Adds the `slow_tests_fsdp` Makefile target (depends on `test_installs`,
  runs pytest on tests/test_fsdp_examples.py).
- NOTE(review): line breaks and indentation in this patch were
  reconstructed from a whitespace-collapsed copy; confirm the context
  lines against the target tree before applying.

 .github/workflows/slow_tests_gaudi2.yml | 28 ++++++++++++++++++++++++-
 Makefile                                |  3 +++
 tests/test_fsdp_examples.py             |  1 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml
index 4d1a5a2631..ed01c273b7 100644
--- a/.github/workflows/slow_tests_gaudi2.yml
+++ b/.github/workflows/slow_tests_gaudi2.yml
@@ -58,11 +58,37 @@ jobs:
           --ipc=host \
           vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
           /bin/bash tests/ci/slow_tests_deepspeed.sh
+  fsdp:
+    name: Test FSDP models
+    if: ${{ !cancelled() && (success() || failure()) }}
+    needs:
+      - deepspeed  # run the job when the previous test job is done
+    runs-on: [self-hosted, linux, x64, gaudi2]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Pull image
+        run: |
+          docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+      - name: Run tests
+        run: |
+          docker run \
+          -v $PWD:/root/workspace \
+          --workdir=/root/workspace \
+          --runtime=habana \
+          -e HABANA_VISIBLE_DEVICES=all \
+          -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+          -e GAUDI2_CI=1 \
+          --cap-add=sys_nice \
+          --net=host \
+          --ipc=host \
+          vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+          make slow_tests_fsdp
   multi-card:
     name: Test multi-card models
     if: ${{ !cancelled() && (success() || failure()) }}
     needs:
-      - deepspeed  # run the job when the previous test job is done
+      - fsdp  # run the job when the previous test job is done
     runs-on: [self-hosted, linux, x64, gaudi2]
     steps:
       - name: Checkout
diff --git a/Makefile b/Makefile
index ba40ca4b93..9db0046ae8 100644
--- a/Makefile
+++ b/Makefile
@@ -61,6 +61,9 @@ slow_tests_text_generation_example: test_installs
 	python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
 	python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder_text_summarization.py -v -s --token $(TOKEN)
 
+slow_tests_fsdp: test_installs
+	python -m pytest tests/test_fsdp_examples.py -v -s
+
 # Check if examples are up to date with the Transformers library
 example_diff_tests: test_installs
 	python -m pytest tests/test_examples_match_transformers.py
diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py
index af82965063..f87f1d2bb9 100644
--- a/tests/test_fsdp_examples.py
+++ b/tests/test_fsdp_examples.py
@@ -11,6 +11,7 @@
 
 
 # Gaudi2 CI baselines
+# FSDP is not supported on Gaudi1
 MODELS_TO_TEST = {
     "bf16": [
         (