[CI] Update CI image and unit tests (#289)

CI images: Using nvcr.io/nvidia/pytorch:24.05-py3 as the base image; the built image is named 'flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05' - v2.0 marks a major image update - pytorch-2.5.1-cuda-12.4.131-ngc-24.05 records the software and base image versions. Unit tests: Run all unit tests, skipping or fixing errors so that the suite passes overall - For Megatron unit tests: fix (where the cause of the failure is understood) or skip - For FlagScale unit tests: fix. Functional tests: - Add training tests for LLaVA-OneVision (temporarily disabled due to data updates). Bug fix: - Adjust the temporary file path for coverage to avoid coverage loss caused by container destruction. TODO: - Add vLLM inference testing - Add Dockerfile.ci (using conda)
FlagOpen · Dec 18, 2024 · 2c880b3 · 2c880b3
1 parent 6df55e3
commit 2c880b3
Show file tree

Hide file tree

Showing 33 changed files with 561 additions and 59 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -2,4 +2,4 @@
 directory = coverage
 
 [run]
-data_file = .coverage_$LOCAL_RANK
+data_file = /workspace/report/$COMMIT_ID/cov-temp-flagscale/.coverage_$LOCAL_RANK
diff --git a/.github/workflows/all-tests.yml b/.github/workflows/all-tests.yml
@@ -11,8 +11,15 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  # Megatron Report Clean
+  megatron-report-clean:
+    uses: ./.github/workflows/report-clean.yml
+    with:
+      backend: megatron
+
   # Megatron Unit Tests with Matrix
   megatron-unit-tests:
+    needs: megatron-report-clean
     uses: ./.github/workflows/unit-tests.yml
     strategy:
       matrix:
@@ -25,6 +32,7 @@ jobs:
           - inference
           - models
           - pipeline_parallel
+          - ssm
           - tensor_parallel
           - transformer/moe
           - transformer
@@ -34,8 +42,15 @@ jobs:
       backend: megatron
       subset: ${{ matrix.subset }}
 
+  # FlagScale Report Clean
+  flagscale-report-clean:
+    uses: ./.github/workflows/report-clean.yml
+    with:
+      backend: flagscale
+
   # Flagscale Unit Tests with Matrix
   flagscale-unit-tests:
+    needs: flagscale-report-clean
     uses: ./.github/workflows/unit-tests.yml
     strategy:
       matrix:
@@ -58,6 +73,7 @@ jobs:
         model: 
           - aquila
           - mixtral
+          # - llava_onevision
     name: "train-${{ matrix.model }}"
     with:
       model: ${{ matrix.model }}

diff --git a/.github/workflows/coverage-tests.yml b/.github/workflows/coverage-tests.yml
@@ -11,7 +11,7 @@ jobs:
   test-coverage:
     runs-on: self-hosted
     container:
-      image: localhost:5000/flagscale_cicd:v1.5
+      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
       ports:
         - 80
       volumes:

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
@@ -20,7 +20,7 @@ env:
 
 jobs:
   format:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
     - name: Checkout Code
@@ -37,8 +37,9 @@ jobs:
 
     - name: Run Black
       run: |
-        black --verbose --include $INCLUDE_FILES ./ --diff
+        black --verbose --include "$INCLUDE_FILES" ./ --check || { echo "Code formatting does not comply with Black's rules. Please reformat the code according to Black and resubmit."; exit 1; }
 
     - name: Run Isort
       run: |
-        isort --verbose --profile black $INCLUDE_FILES --diff --known-local-folder flagscale
+        isort --verbose --profile black $INCLUDE_FILES --check-only --diff --known-local-folder flagscale || { echo "Import order does not comply with isort rules. Please fix the import order and resubmit."; exit 1; }
+
diff --git a/.github/workflows/functional-tests.yml b/.github/workflows/functional-tests.yml
@@ -15,7 +15,7 @@ jobs:
   functional-test:
     runs-on: self-hosted
     container:
-      image: localhost:5000/flagscale_cicd:v1.5
+      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
       ports:
         - 80
       volumes:

diff --git a/.github/workflows/report-clean.yml b/.github/workflows/report-clean.yml
@@ -0,0 +1,32 @@
+# Reusable workflow: deletes the coverage report and temp directories of one backend for the current commit.
+name: Clean Old Report
+
+on:
+  workflow_call:
+    inputs:
+      backend:  # suffix of the cov-report-*/cov-temp-* directories to delete
+        required: true
+        type: string
+
+jobs:
+  clean-report:
+    runs-on: self-hosted
+    container:
+      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
+      ports:
+        - 80
+      volumes:
+        - /home/flagscale_cicd/flask/static:/workspace/report
+        - /home/flagscale_cicd/flask/config:/workspace/config
+      options: --hostname flagscale_cicd
+    steps:
+      - name: Clean Old Report
+        env:  # pass expressions through env instead of splicing them into the script text
+          BACKEND: ${{ inputs.backend }}
+          COMMIT_SHA: ${{ github.sha }}
+        run: |
+          REPORT_ADDR=$(cat "/workspace/config/report_address")
+          echo "Cleaning old report at http://${REPORT_ADDR}/${COMMIT_SHA}/cov-report-${BACKEND}/diff-cover-report-${BACKEND}.html"
+          for kind in report temp; do  # rm -rf is a no-op when the directory does not exist
+            rm -rf -- "/workspace/report/${COMMIT_SHA}/cov-${kind}-${BACKEND}"
+          done
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -15,7 +15,7 @@ jobs:
   unit-test:
     runs-on: self-hosted
     container:
-      image: localhost:5000/flagscale_cicd:v1.5
+      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
       ports:
         - 80
       volumes:

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
@@ -0,0 +1,100 @@
+FROM nvcr.io/nvidia/pytorch:24.05-py3
+
+# Build-wide environment: non-interactive Debian frontend and image time zone (key=value form; the space-separated ENV form is legacy).
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Asia/Shanghai
+
+##############################################################################
+# To avoid "curl 92 HTTP/2 stream 0 was not closed cleanly: CANCEL (err 8)" or "fetch-pack: unexpected disconnect while reading sideband packet".
+##############################################################################
+# lowSpeedTime=300s lowSpeedLimit=100B
+RUN git config --global http.lowSpeedTime 300 \
+    && git config --global http.lowSpeedLimit 100 \
+    && git config --global http.postBuffer 524288000
+
+
+##############################################################################
+# Change apt source to Ksyun
+##############################################################################
+RUN sed -i "s#\S\+#http://apt.ksyun.cn/ubuntu/#2" /etc/apt/sources.list && \
+    > /etc/apt/apt.conf.d/docker-clean && \
+    > /etc/dpkg/dpkg.cfg.d/pkg-config-hook-config
+
+
+##############################################################################
+# Install basic utilities
+##############################################################################
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        software-properties-common build-essential autotools-dev \
+        nfs-common pdsh \
+        curl wget vim tmux less unzip \
+        htop iftop iotop ca-certificates openssh-client openssh-server \
+        rsync iputils-ping net-tools sudo \
+        tzdata psmisc screen libx11-dev llvm-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+
+##############################################################################
+# Uninstall unnecessary packages and their dependencies
+##############################################################################
+RUN pip install --upgrade pip && pip install pip-autoremove && \
+    pip-autoremove torch torchvision torchaudio torch-tensorrt transformer_engine \
+        pytorch-quantization pytorch-triton \
+        flash-attn tensorboard apex cudf dask-cudf \
+        cugraph cugraph-dgl cugraph-pyg cugraph-service-server -y
+
+
+##############################################################################
+# Install PyTorch
+##############################################################################
+RUN pip install --upgrade pip \
+    && pip install --no-cache-dir torch==2.5.1 torchvision torchaudio \
+    -f https://download.pytorch.org/whl/cu124/torch_stable.html -v \
+    || { echo 'PyTorch installation failed'; exit 1; }
+
+
+##############################################################################
+# Install, run, and test dependent environments and data
+##############################################################################
+RUN pip install pytest pytest-cov pytest_mock pytest-random-order \
+    pre-commit black isort diff-cover \
+    zarr tensorstore==0.1.45 wrapt tiktoken omegaconf setuptools_scm hydra-core Ray==2.40.0 numpy==1.26.4 pillow==10.4.0 \
+    git+https://github.com/fanshiqing/[email protected] nltk==3.8.1 \
+    && python -m nltk.downloader -d /root/nltk_data punkt
+
+
+# apex
+RUN cd /workspace \
+    && git clone https://github.com/NVIDIA/apex \
+    && cd apex \
+    && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
+    --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+
+
+# flash-attention
+# Supported flash-attn versions are >= 2.1.1, <= 2.6.3.
+# flash-attn==2.6.3
+RUN cd /workspace \
+    && git clone https://github.com/Dao-AILab/flash-attention.git \
+    && cd flash-attention \
+    && git checkout c1d146c \
+    && git submodule update --init --recursive \
+    && MAX_JOBS=96 python setup.py install
+
+
+# TransformerEngine
+RUN cd /workspace \
+    && git clone -b stable https://github.com/NVIDIA/TransformerEngine.git \
+    && cd TransformerEngine \
+    && git submodule update --init --recursive \
+    && pip install .
+
+
+# xformers
+RUN cd /workspace \
+    && git clone https://github.com/facebookresearch/xformers.git \
+    && cd xformers \
+    && git submodule update --init --recursive \
+    && pip install -v -U .
diff --git a/megatron/.coveragerc b/megatron/.coveragerc
@@ -2,4 +2,4 @@
 directory = coverage
 
 [run]
-data_file = .coverage_$LOCAL_RANK
+data_file = /workspace/report/$COMMIT_ID/cov-temp-megatron/.coverage_$LOCAL_RANK
diff --git a/megatron/megatron/core/parallel_state.py b/megatron/megatron/core/parallel_state.py
@@ -2463,3 +2463,6 @@ def destroy_model_parallel():
     _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_USP_CP_GLOO = None
     global _INTER_PARTIAL_DATA_PARALLEL_GROUP_WITH_USP_CP
     _INTER_PARTIAL_DATA_PARALLEL_GROUP_WITH_USP_CP = None
+
+    global _LAST_RANK_WHEN_USING_PIPELINE
+    _LAST_RANK_WHEN_USING_PIPELINE = None
diff --git a/megatron/tests/unit_tests/data/test_gpt_dataset.py b/megatron/tests/unit_tests/data/test_gpt_dataset.py
@@ -47,9 +47,6 @@ def test_mock_gpt_dataset():
         tokenizer=tokenizer,
     )
 
-    from tests.unit_tests.data import set_mock_args
-    set_mock_args()
-
     datasets = BlendedMegatronDatasetBuilder(
         MockGPTDataset, [100, 100, 100], lambda: True, config
     ).build()

diff --git a/megatron/tests/unit_tests/data/test_multimodal_dataset.py b/megatron/tests/unit_tests/data/test_multimodal_dataset.py
@@ -38,9 +38,6 @@ def test_mock_multimodal_dataset():
         tokenizer=_NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE),
     )
 
-    from tests.unit_tests.data import set_mock_args
-    set_mock_args()
-
     datasets = BlendedMegatronDatasetBuilder(
         MockMultimodalDataset, [100, 100, 100], lambda: True, config
     ).build()

diff --git a/megatron/tests/unit_tests/export/trtllm/test_distributed_fp8.py b/megatron/tests/unit_tests/export/trtllm/test_distributed_fp8.py
@@ -103,7 +103,16 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
 
     return output_tensor, partial(loss_func, loss_mask)
 
-
+"""
+Author: phoenixdong
+Date: 2024-12-17
+Action: Add class-level skip decorator
+Reason: Skip all tests in this class if the device does not support CUDA or if its compute capability is less than 8.9.
+"""
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or torch.cuda.get_device_capability(0) < (8, 9),
+    reason="Device compute capability 8.9 or higher required for FP8 execution"
+)
 class TestTRTLLMSingleDeviceConverterFP8:
     QUANTIZED_LAYERS = [
         'transformer.layers.*.attention.dense.weight',

diff --git a/megatron/tests/unit_tests/export/trtllm/test_single_device_fp8.py b/megatron/tests/unit_tests/export/trtllm/test_single_device_fp8.py
@@ -100,7 +100,16 @@ def _loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
 
     return output_tensor, partial(_loss_func, loss_mask)
 
-
+"""
+Author: phoenixdong
+Date: 2024-12-17
+Action: Add class-level skip decorator
+Reason: Skip all tests in this class if the device does not support CUDA or if its compute capability is less than 8.9.
+"""
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or torch.cuda.get_device_capability(0) < (8, 9),
+    reason="Device compute capability 8.9 or higher required for FP8 execution"
+)
 class TestTRTLLMSingleDeviceConverterFP8:
     QUANTIZED_LAYERS = [
         'transformer.layers.*.attention.dense.weight',

diff --git a/megatron/tests/unit_tests/models/test_bert_model.py b/megatron/tests/unit_tests/models/test_bert_model.py
@@ -6,6 +6,7 @@
 import pytest
 import torch
 from packaging.version import Version as PkgVersion
+from packaging.version import parse
 from pytest_mock import mocker
 
 from megatron.core.models.bert.bert_layer_specs import (
@@ -16,7 +17,7 @@
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.core.utils import is_te_min_version
+from megatron.core.utils import is_te_min_version, get_te_version
 from tests.unit_tests.test_utilities import Utils
 
 
@@ -159,29 +160,44 @@ def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker):
             attn_mask_dimensions == "b11s"
         ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}"
 
+    """
+    Author: phoenixdong
+    Date: 2024-12-17
+    Action: Modify the process, exceptions are only thrown between te 1.7 and 1.10.
+    Reason: The new version of TE has already addressed potential exceptions.
+    """
     @pytest.mark.internal
     @pytest.mark.flaky_in_dev
     def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker):
-        os.environ['NVTE_FLASH_ATTN'] = '0'
-        os.environ['NVTE_FUSED_ATTN'] = '0'
-
-        bert_layer_with_transformer_engine_spec.submodules.self_attention.params[
-            'attn_mask_type'
-        ] == AttnMaskType.padding
-        mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8"))
-        with pytest.raises(Exception) as exc_info:
+        # Get the current version of Transformer Engine
+        te_version = f"{get_te_version().major}.{get_te_version().minor}"
+
+        # Check if the version is between 1.7 and 1.10
+        if parse("1.7") <= parse(te_version) <= parse("1.10"):
+            # Expect an exception during BertModel initialization
+            with pytest.raises(Exception) as exc_info:
+                self.bert_model = BertModel(
+                    config=self.transformer_config,
+                    num_tokentypes=0,
+                    transformer_layer_spec=bert_layer_with_transformer_engine_spec,
+                    vocab_size=100,
+                    max_sequence_length=4,
+                )
+            # Verify the exception message matches the expected error
+            assert str(exc_info.value) == (
+                "Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when "
+                "instantiating TERowParallelLinear when instantiating SelfAttention when "
+                "instantiating TransformerLayer"
+            )
+        else:
+            # For versions outside the range, initialize the model without expecting an exception
             self.bert_model = BertModel(
                 config=self.transformer_config,
                 num_tokentypes=0,
                 transformer_layer_spec=bert_layer_with_transformer_engine_spec,
                 vocab_size=100,
                 max_sequence_length=4,
             )
-        assert str(exc_info.value) == (
-            "Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when "
-            "instantiating TERowParallelLinear when instantiating SelfAttention when "
-            "instantiating TransformerLayer"
-        )
 
     @pytest.mark.internal
     def test_transformer_engine_version_1_7_to_1_10_unfused_attention(self, mocker):

diff --git a/megatron/tests/unit_tests/test_parallel_state.py b/megatron/tests/unit_tests/test_parallel_state.py
@@ -510,9 +510,15 @@ def golden_rank_result_from_past_code(
         expert_model_parallel_size=ep,
         ulysses_sp_parallel_size=usp,
     )
-    rank_generator = ps.RankGenerator(tp=tp, ep=1, dp=dp, pp=pp, cp=cp, order="tp-cp-dp-pp")
+    """
+    Author: phoenixdong
+    Date: 2024-12-17
+    Action: Add usp=1
+    Reason: FlagScale adds usp to RankGenerator.
+    """
+    rank_generator = ps.RankGenerator(tp=tp, ep=1, dp=dp, pp=pp, cp=cp, usp=1, order="tp-cp-dp-pp")
     expert_rank_generator = ps.RankGenerator(
-        tp=tp, ep=ep, dp=expert_dp, pp=pp, cp=1, order="tp-ep-dp-pp"
+        tp=tp, ep=ep, dp=expert_dp, pp=pp, cp=1, usp=1, order="tp-ep-dp-pp"
     )
     assert dp_groups == rank_generator.get_ranks(
         "dp"