Skip to content

Commit

Permalink
[CI] Update CI image and unit tests (#289)
Browse files Browse the repository at this point in the history
CI images:

The CI image uses nvcr.io/nvidia/pytorch:24.05-py3 as its base image; the
built image is named
'flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05'.

- "v2.0" marks a major image update

- "pytorch-2.5.1-cuda-12.4.131-ngc-24.05" records the software and base
image versions

Unit tests :  

Run all unit tests and fix or skip the failing ones so that the suite passes
overall

- For Megatron unit tests: fix the failure (once its cause is understood) or
skip the test

- For FlagScale unit tests: fix the failure

Functional tests :

- Add training tests for LLaVA-OneVision (temporarily disabled due to data
updates)

Bug fix :

- Adjust the temporary file path for coverage to avoid coverage loss
caused by container destruction

TODO :

- Add vLLM inference testing

-  Add Dockerfile.ci (using conda)
  • Loading branch information
phoenixdong authored Dec 18, 2024
1 parent 6df55e3 commit 2c880b3
Show file tree
Hide file tree
Showing 33 changed files with 561 additions and 59 deletions.
2 changes: 1 addition & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
directory = coverage

[run]
data_file = .coverage_$LOCAL_RANK
data_file = /workspace/report/$COMMIT_ID/cov-temp-flagscale/.coverage_$LOCAL_RANK
16 changes: 16 additions & 0 deletions .github/workflows/all-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,15 @@ concurrency:
cancel-in-progress: true

jobs:
# Megatron Report Clean
megatron-report-clean:
uses: ./.github/workflows/report-clean.yml
with:
backend: megatron

# Megatron Unit Tests with Matrix
megatron-unit-tests:
needs: megatron-report-clean
uses: ./.github/workflows/unit-tests.yml
strategy:
matrix:
Expand All @@ -25,6 +32,7 @@ jobs:
- inference
- models
- pipeline_parallel
- ssm
- tensor_parallel
- transformer/moe
- transformer
Expand All @@ -34,8 +42,15 @@ jobs:
backend: megatron
subset: ${{ matrix.subset }}

# FlagScale Report Clean
flagscale-report-clean:
uses: ./.github/workflows/report-clean.yml
with:
backend: flagscale

# Flagscale Unit Tests with Matrix
flagscale-unit-tests:
needs: flagscale-report-clean
uses: ./.github/workflows/unit-tests.yml
strategy:
matrix:
Expand All @@ -58,6 +73,7 @@ jobs:
model:
- aquila
- mixtral
# - llava_onevision
name: "train-${{ matrix.model }}"
with:
model: ${{ matrix.model }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/coverage-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
test-coverage:
runs-on: self-hosted
container:
image: localhost:5000/flagscale_cicd:v1.5
image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
ports:
- 80
volumes:
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/format.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ env:
jobs:
format:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04

steps:
- name: Checkout Code
Expand All @@ -37,8 +37,9 @@ jobs:
- name: Run Black
run: |
black --verbose --include $INCLUDE_FILES ./ --diff
black --verbose --include "$INCLUDE_FILES" ./ --check || { echo "Code formatting does not comply with Black's rules. Please reformat the code according to Black and resubmit."; exit 1; }
- name: Run Isort
run: |
isort --verbose --profile black $INCLUDE_FILES --diff --known-local-folder flagscale
isort --verbose --profile black $INCLUDE_FILES --check-only --diff --known-local-folder flagscale || { echo "Import order does not comply with isort rules. Please fix the import order and resubmit."; exit 1; }
2 changes: 1 addition & 1 deletion .github/workflows/functional-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
functional-test:
runs-on: self-hosted
container:
image: localhost:5000/flagscale_cicd:v1.5
image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
ports:
- 80
volumes:
Expand Down
32 changes: 32 additions & 0 deletions .github/workflows/report-clean.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Reusable workflow: deletes the coverage output that an earlier run produced
# for the current commit and the given backend, so a fresh run starts clean.
name: Clean Old Report

on:
  workflow_call:
    inputs:
      # Backend name; used as the suffix of the cov-report-* and cov-temp-*
      # directories that are removed.
      backend:
        required: true
        type: string

jobs:
  clean-report:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
        - /home/flagscale_cicd/flask/static:/workspace/report
        - /home/flagscale_cicd/flask/config:/workspace/config
      options: --hostname flagscale_cicd

    steps:
      - name: Clean Old Report
        # Pass expression values through the environment instead of splicing
        # them into the script text, so they are never parsed as shell code.
        env:
          BACKEND: ${{ inputs.backend }}
          COMMIT_SHA: ${{ github.sha }}
        run: |
          REPORT_ADDR=$(cat "/workspace/config/report_address")
          echo "Clean old report at http://${REPORT_ADDR}/${COMMIT_SHA}/cov-report-${BACKEND}/diff-cover-report-${BACKEND}.html"
          # Remove both the published report and the temporary coverage data
          # for this commit/backend; directories that do not exist are skipped.
          for dir in "cov-report-${BACKEND}" "cov-temp-${BACKEND}"; do
            target="/workspace/report/${COMMIT_SHA}/${dir}"
            if [ -d "${target}" ]; then
              rm -r "${target}"
            fi
          done
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
unit-test:
runs-on: self-hosted
container:
image: localhost:5000/flagscale_cicd:v1.5
image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
ports:
- 80
volumes:
Expand Down
100 changes: 100 additions & 0 deletions docker/Dockerfile.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# CI image for FlagScale: starts from the NGC PyTorch 24.05 container, removes
# the bundled PyTorch stack, and installs PyTorch 2.5.1 plus the test tooling
# and source-built extensions (apex, flash-attention, TransformerEngine,
# xformers).
FROM nvcr.io/nvidia/pytorch:24.05-py3

# Use the key=value form of ENV; the whitespace-separated form is legacy syntax.
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Shanghai


##############################################################################
# To avoid "curl 92 HTTP/2 stream 0 was not closed cleanly: CANCEL (err 8)" or "fetch-pack: unexpected disconnect while reading sideband packet".
##############################################################################
# lowSpeedTime=300s lowSpeedLimit=100B
RUN git config --global http.lowSpeedTime 300 \
    && git config --global http.lowSpeedLimit 100 \
    && git config --global http.postBuffer 524288000


##############################################################################
# Change apt source to Ksyun
##############################################################################
# The sed expression replaces the second whitespace-delimited field of every
# line in sources.list with the mirror URL; the two redirections truncate the
# listed configuration files to empty.
RUN sed -i "s#\S\+#http://apt.ksyun.cn/ubuntu/#2" /etc/apt/sources.list && \
    > /etc/apt/apt.conf.d/docker-clean && \
    > /etc/dpkg/dpkg.cfg.d/pkg-config-hook-config


##############################################################################
# Install basic utilities
##############################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    software-properties-common build-essential autotools-dev \
    nfs-common pdsh \
    curl wget vim tmux less unzip \
    htop iftop iotop ca-certificates openssh-client openssh-server \
    rsync iputils-ping net-tools sudo \
    tzdata psmisc screen libx11-dev llvm-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


##############################################################################
# Uninstall unnecessary packages and their dependencies
##############################################################################
RUN pip install --upgrade pip && pip install pip-autoremove && \
    pip-autoremove torch torchvision torchaudio torch-tensorrt transformer_engine \
    pytorch-quantization pytorch-triton \
    flash-attn tensorboard apex cudf dask-cudf \
    cugraph cugraph-dgl cugraph-pyg cugraph-service-server -y


##############################################################################
# Install PyTorch
##############################################################################
RUN pip install --upgrade pip \
    && pip install --no-cache-dir torch==2.5.1 torchvision torchaudio \
    -f https://download.pytorch.org/whl/cu124/torch_stable.html -v \
    || { echo 'PyTorch installation failed'; exit 1; }


##############################################################################
# Install, run, and test dependent environments and data
##############################################################################
# NOTE(review): the grouped_gemm reference was garbled in the captured text;
# it is restored here as the v1.1.4 tag - confirm against the repository.
RUN pip install pytest pytest-cov pytest_mock pytest-random-order \
    pre-commit black isort diff-cover \
    zarr tensorstore==0.1.45 wrapt tiktoken omegaconf setuptools_scm hydra-core Ray==2.40.0 numpy==1.26.4 pillow==10.4.0 \
    git+https://github.com/fanshiqing/grouped_gemm@v1.1.4 nltk==3.8.1 \
    && python -m nltk.downloader -d /root/nltk_data punkt


# apex (built from source with the C++ and CUDA extensions enabled)
RUN cd /workspace \
    && git clone https://github.com/NVIDIA/apex \
    && cd apex \
    && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
    --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./


# flash-attention
# Supported flash-attn versions are >= 2.1.1, <= 2.6.3.
# flash-attn==2.6.3
RUN cd /workspace \
    && git clone https://github.com/Dao-AILab/flash-attention.git \
    && cd flash-attention \
    && git checkout c1d146c \
    && git submodule update --init --recursive \
    && MAX_JOBS=96 python setup.py install


# TransformerEngine (built from the "stable" branch)
RUN cd /workspace \
    && git clone -b stable https://github.com/NVIDIA/TransformerEngine.git \
    && cd TransformerEngine \
    && git submodule update --init --recursive \
    && pip install .


# xformers
RUN cd /workspace \
    && git clone https://github.com/facebookresearch/xformers.git \
    && cd xformers \
    && git submodule update --init --recursive \
    && pip install -v -U .
2 changes: 1 addition & 1 deletion megatron/.coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
directory = coverage

[run]
data_file = .coverage_$LOCAL_RANK
data_file = /workspace/report/$COMMIT_ID/cov-temp-megatron/.coverage_$LOCAL_RANK
3 changes: 3 additions & 0 deletions megatron/megatron/core/parallel_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -2463,3 +2463,6 @@ def destroy_model_parallel():
_INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_USP_CP_GLOO = None
global _INTER_PARTIAL_DATA_PARALLEL_GROUP_WITH_USP_CP
_INTER_PARTIAL_DATA_PARALLEL_GROUP_WITH_USP_CP = None

global _LAST_RANK_WHEN_USING_PIPELINE
_LAST_RANK_WHEN_USING_PIPELINE = None
3 changes: 0 additions & 3 deletions megatron/tests/unit_tests/data/test_gpt_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ def test_mock_gpt_dataset():
tokenizer=tokenizer,
)

from tests.unit_tests.data import set_mock_args
set_mock_args()

datasets = BlendedMegatronDatasetBuilder(
MockGPTDataset, [100, 100, 100], lambda: True, config
).build()
Expand Down
3 changes: 0 additions & 3 deletions megatron/tests/unit_tests/data/test_multimodal_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ def test_mock_multimodal_dataset():
tokenizer=_NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE),
)

from tests.unit_tests.data import set_mock_args
set_mock_args()

datasets = BlendedMegatronDatasetBuilder(
MockMultimodalDataset, [100, 100, 100], lambda: True, config
).build()
Expand Down
11 changes: 10 additions & 1 deletion megatron/tests/unit_tests/export/trtllm/test_distributed_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,16 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):

return output_tensor, partial(loss_func, loss_mask)


"""
Author: phoenixdong
Date: 2024-12-17
Action: Add class-level skip decorator
Reason: Skip all tests in this class if the device does not support CUDA or if its compute capability is less than 8.9.
"""
@pytest.mark.skipif(
not torch.cuda.is_available() or torch.cuda.get_device_capability(0) < (8, 9),
reason="Device compute capability 8.9 or higher required for FP8 execution"
)
class TestTRTLLMSingleDeviceConverterFP8:
QUANTIZED_LAYERS = [
'transformer.layers.*.attention.dense.weight',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,16 @@ def _loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):

return output_tensor, partial(_loss_func, loss_mask)


"""
Author: phoenixdong
Date: 2024-12-17
Action: Add class-level skip decorator
Reason: Skip all tests in this class if the device does not support CUDA or if its compute capability is less than 8.9.
"""
@pytest.mark.skipif(
not torch.cuda.is_available() or torch.cuda.get_device_capability(0) < (8, 9),
reason="Device compute capability 8.9 or higher required for FP8 execution"
)
class TestTRTLLMSingleDeviceConverterFP8:
QUANTIZED_LAYERS = [
'transformer.layers.*.attention.dense.weight',
Expand Down
44 changes: 30 additions & 14 deletions megatron/tests/unit_tests/models/test_bert_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest
import torch
from packaging.version import Version as PkgVersion
from packaging.version import parse
from pytest_mock import mocker

from megatron.core.models.bert.bert_layer_specs import (
Expand All @@ -16,7 +17,7 @@
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import is_te_min_version
from megatron.core.utils import is_te_min_version, get_te_version
from tests.unit_tests.test_utilities import Utils


Expand Down Expand Up @@ -159,29 +160,44 @@ def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker):
attn_mask_dimensions == "b11s"
), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}"

"""
Author: phoenixdong
Date: 2024-12-17
Action: Modify the process, exceptions are only thrown between te 1.7 and 1.10.
Reason: The new version of TE has already addressed potential exceptions.
"""
@pytest.mark.internal
@pytest.mark.flaky_in_dev
def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker):
os.environ['NVTE_FLASH_ATTN'] = '0'
os.environ['NVTE_FUSED_ATTN'] = '0'

bert_layer_with_transformer_engine_spec.submodules.self_attention.params[
'attn_mask_type'
] == AttnMaskType.padding
mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8"))
with pytest.raises(Exception) as exc_info:
# Get the current version of Transformer Engine
te_version = f"{get_te_version().major}.{get_te_version().minor}"

# Check if the version is between 1.7 and 1.10
if parse("1.7") <= parse(te_version) <= parse("1.10"):
# Expect an exception during BertModel initialization
with pytest.raises(Exception) as exc_info:
self.bert_model = BertModel(
config=self.transformer_config,
num_tokentypes=0,
transformer_layer_spec=bert_layer_with_transformer_engine_spec,
vocab_size=100,
max_sequence_length=4,
)
# Verify the exception message matches the expected error
assert str(exc_info.value) == (
"Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when "
"instantiating TERowParallelLinear when instantiating SelfAttention when "
"instantiating TransformerLayer"
)
else:
# For versions outside the range, initialize the model without expecting an exception
self.bert_model = BertModel(
config=self.transformer_config,
num_tokentypes=0,
transformer_layer_spec=bert_layer_with_transformer_engine_spec,
vocab_size=100,
max_sequence_length=4,
)
assert str(exc_info.value) == (
"Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when "
"instantiating TERowParallelLinear when instantiating SelfAttention when "
"instantiating TransformerLayer"
)

@pytest.mark.internal
def test_transformer_engine_version_1_7_to_1_10_unfused_attention(self, mocker):
Expand Down
10 changes: 8 additions & 2 deletions megatron/tests/unit_tests/test_parallel_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,9 +510,15 @@ def golden_rank_result_from_past_code(
expert_model_parallel_size=ep,
ulysses_sp_parallel_size=usp,
)
rank_generator = ps.RankGenerator(tp=tp, ep=1, dp=dp, pp=pp, cp=cp, order="tp-cp-dp-pp")
"""
Author: phoenixdong
Date: 2024-12-17
Action: Add usp=1
Reason: FlagScale add usp in RankGenerator.
"""
rank_generator = ps.RankGenerator(tp=tp, ep=1, dp=dp, pp=pp, cp=cp, usp=1, order="tp-cp-dp-pp")
expert_rank_generator = ps.RankGenerator(
tp=tp, ep=ep, dp=expert_dp, pp=pp, cp=1, order="tp-ep-dp-pp"
tp=tp, ep=ep, dp=expert_dp, pp=pp, cp=1, usp=1, order="tp-ep-dp-pp"
)
assert dp_groups == rank_generator.get_ranks(
"dp"
Expand Down
Loading

0 comments on commit 2c880b3

Please sign in to comment.