Commit 34bd35b: merge upstream

lzy-dev committed Dec 26, 2024
2 parents 9ffba0f + f2bc020

Showing 1,594 changed files with 241,118 additions and 38,423 deletions.
2 changes: 1 addition & 1 deletion .coveragerc
@@ -2,4 +2,4 @@
directory = coverage

[run]
data_file = .coverage_$LOCAL_RANK
data_file = /workspace/report/$COMMIT_ID/cov-temp-flagscale/.coverage_$LOCAL_RANK
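This change scopes the per-rank coverage data files to a per-commit report directory, so parallel CI runs no longer clobber each other. As a hedged sketch of how such per-rank files are typically merged downstream (the working directory and compare branch are assumptions; `diff-cover` and the report filename do appear elsewhere in this commit):

```sh
# Sketch: combining per-rank coverage files like the ones this .coveragerc
# produces. COMMIT_ID and the compare branch are assumptions.
cd /workspace/report/$COMMIT_ID/cov-temp-flagscale
coverage combine .coverage_*    # merge .coverage_0, .coverage_1, ...
coverage xml -o coverage.xml    # one merged report
diff-cover coverage.xml --compare-branch=origin/main \
    --html-report diff-cover-report-flagscale.html
```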
17 changes: 17 additions & 0 deletions .github/workflows/all-tests.yml
@@ -11,19 +11,28 @@ concurrency:
  cancel-in-progress: true

jobs:
  # Megatron Report Clean
  megatron-report-clean:
    uses: ./.github/workflows/report-clean.yml
    with:
      backend: megatron

  # Megatron Unit Tests with Matrix
  megatron-unit-tests:
    needs: megatron-report-clean
    uses: ./.github/workflows/unit-tests.yml
    strategy:
      matrix:
        subset:
          - data
          - dist_checkpointing
          - distributed
          - export
          - fusions
          - inference
          - models
          - pipeline_parallel
          - ssm
          - tensor_parallel
          - transformer/moe
          - transformer
@@ -33,8 +42,15 @@ jobs:
      backend: megatron
      subset: ${{ matrix.subset }}

  # FlagScale Report Clean
  flagscale-report-clean:
    uses: ./.github/workflows/report-clean.yml
    with:
      backend: flagscale

  # Flagscale Unit Tests with Matrix
  flagscale-unit-tests:
    needs: flagscale-report-clean
    uses: ./.github/workflows/unit-tests.yml
    strategy:
      matrix:
@@ -57,6 +73,7 @@
        model:
          - aquila
          - mixtral
          # - llava_onevision
    name: "train-${{ matrix.model }}"
    with:
      model: ${{ matrix.model }}
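The matrix fans one reusable unit-tests workflow out across test subsets, and the new `needs:` edges make each run start from a freshly cleaned report directory. A sketch of roughly what one matrix entry corresponds to locally (the `tests/unit_tests/<subset>` layout and the torchrun invocation are assumptions based on the subset names, not something this diff shows):

```sh
# Sketch: the local equivalent of the subset=data matrix entry, assuming a
# Megatron-style tests/unit_tests/<subset> layout. torchrun sets LOCAL_RANK
# per worker, which the .coveragerc data_file pattern above consumes.
torchrun --nproc_per_node=1 -m pytest --cov --cov-append tests/unit_tests/data
```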
2 changes: 1 addition & 1 deletion .github/workflows/coverage-tests.yml
@@ -11,7 +11,7 @@ jobs:
  test-coverage:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v1.5
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
7 changes: 4 additions & 3 deletions .github/workflows/format.yml
@@ -20,7 +20,7 @@ env:
jobs:
  format:
    runs-on: ubuntu-latest
    runs-on: ubuntu-22.04

    steps:
      - name: Checkout Code
@@ -37,8 +37,9 @@ jobs:
      - name: Run Black
        run: |
          black --verbose --include $INCLUDE_FILES ./ --diff
          black --verbose --include "$INCLUDE_FILES" ./ --check || { echo "Code formatting does not comply with Black's rules. Please reformat the code according to Black and resubmit."; exit 1; }
      - name: Run Isort
        run: |
          isort --verbose --profile black $INCLUDE_FILES --diff --known-local-folder flagscale
          isort --verbose --profile black $INCLUDE_FILES --check-only --diff --known-local-folder flagscale || { echo "Import order does not comply with isort rules. Please fix the import order and resubmit."; exit 1; }
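With `--check`/`--check-only` added, the job now fails on violations instead of only printing a diff, so it may help to run the same gate locally before pushing. A sketch (the value of `INCLUDE_FILES` lives in the workflow's `env:` block, which this hunk does not show, so the pattern below is a placeholder):

```sh
# Local pre-push check mirroring the CI gate above.
INCLUDE_FILES="your-pattern-here"   # placeholder: copy the real value from format.yml
black --include "$INCLUDE_FILES" ./ --check
isort --profile black $INCLUDE_FILES --check-only --known-local-folder flagscale
```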
2 changes: 1 addition & 1 deletion .github/workflows/functional-tests.yml
@@ -15,7 +15,7 @@ jobs:
  functional-test:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v1.5
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
32 changes: 32 additions & 0 deletions .github/workflows/report-clean.yml
@@ -0,0 +1,32 @@
name: Clean Old Report

on:
  workflow_call:
    inputs:
      backend:
        required: true
        type: string

jobs:
  clean-report:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
        - /home/flagscale_cicd/flask/static:/workspace/report
        - /home/flagscale_cicd/flask/config:/workspace/config
      options: --hostname flagscale_cicd

    steps:
      - name: Clean Old Report
        run: |
          REPORT_ADDR=$(cat "/workspace/config/report_address")
          echo "Clean old report at http://${REPORT_ADDR}/${{ github.sha }}/cov-report-${{ inputs.backend }}/diff-cover-report-${{ inputs.backend }}.html"
          if [ -d "/workspace/report/${{ github.sha }}/cov-report-${{ inputs.backend }}" ]; then
            rm -r /workspace/report/${{ github.sha }}/cov-report-${{ inputs.backend }}
          fi
          if [ -d "/workspace/report/${{ github.sha }}/cov-temp-${{ inputs.backend }}" ]; then
            rm -r /workspace/report/${{ github.sha }}/cov-temp-${{ inputs.backend }}
          fi
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
@@ -15,7 +15,7 @@ jobs:
  unit-test:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v1.5
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
4 changes: 3 additions & 1 deletion .gitignore
@@ -8,4 +8,6 @@ slurm*
logs
.vscode
log_file/*
outputs
outputs
*.log
*.out
16 changes: 15 additions & 1 deletion README.md
@@ -53,7 +53,9 @@ We recommend using the latest release of [NGC's PyTorch container](https://catal

### Run a Task

FlagScale provides a unified runner for various tasks, including training and inference. Simply specify the configuration file to run the task with a single command. The runner will automatically load the configurations and execute the task. The following example demonstrates how to run a distributed training task.
FlagScale provides a unified runner for various tasks, including training, inference, and serve. Simply specify the configuration file to run the task with a single command. The runner will automatically load the configurations and execute the task. The following example demonstrates how to run a distributed training task.

#### Train

1. Start the distributed training job:
```sh
@@ -66,6 +68,18 @@ FlagScale provides a unified runner for various tasks, including training and in
python run.py --config-path ./examples/aquila/conf --config-name config action=stop
```

#### Serve

1. Start the server:
```sh
python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=run
```
2. Stop the server:
```sh
python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=stop
```
For more details, please refer to [Quick Start](./flagscale/serve/README.md).
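Once the server is up, a quick smoke test might look like the following (the port and the OpenAI-compatible route are assumptions about a typical serving setup, not something this README states; see the linked Quick Start for the actual endpoints):

```sh
# Hypothetical smoke test against the served model; endpoint is an assumption.
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "qwen2.5-7b", "prompt": "Hello,", "max_tokens": 16}'
```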

## License

This project is licensed under the [Apache License (Version 2.0)](https://github.com/FlagOpen/FlagScale/blob/main/LICENSE). This project also contains other third-party components under other open-source licenses. See the [LICENSE](https://github.com/FlagOpen/FlagScale/blob/main/LICENSE) file for more information.
100 changes: 100 additions & 0 deletions docker/Dockerfile.ci
@@ -0,0 +1,100 @@
FROM nvcr.io/nvidia/pytorch:24.05-py3

ENV DEBIAN_FRONTEND noninteractive
ENV TZ=Asia/Shanghai


##############################################################################
# To avoid "curl 92 HTTP/2 stream 0 was not closed cleanly: CANCEL (err 8)" or "fetch-pack: unexpected disconnect while reading sideband packet".
##############################################################################
# lowSpeedTime=300s lowSpeedLimit=100B
RUN git config --global http.lowSpeedTime 300 \
&& git config --global http.lowSpeedLimit 100 \
&& git config --global http.postBuffer 524288000


##############################################################################
# Change apt source to Ksyun
##############################################################################
RUN sed -i "s#\S\+#http://apt.ksyun.cn/ubuntu/#2" /etc/apt/sources.list && \
> /etc/apt/apt.conf.d/docker-clean && \
> /etc/dpkg/dpkg.cfg.d/pkg-config-hook-config


##############################################################################
# Install basic utilities
##############################################################################
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common build-essential autotools-dev \
nfs-common pdsh \
curl wget vim tmux less unzip \
htop iftop iotop ca-certificates openssh-client openssh-server \
rsync iputils-ping net-tools sudo \
tzdata psmisc screen libx11-dev llvm-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*


##############################################################################
# Uninstall unnecessary packages and their dependencies
##############################################################################
RUN pip install --upgrade pip && pip install pip-autoremove && \
pip-autoremove torch torchvision torchaudio torch-tensorrt transformer_engine \
pytorch-quantization pytorch-triton \
flash-attn tensorboard apex cudf dask-cudf \
cugraph cugraph-dgl cugraph-pyg cugraph-service-server -y


##############################################################################
# Install PyTorch
##############################################################################
RUN pip install --upgrade pip \
&& pip install --no-cache-dir torch==2.5.1 torchvision torchaudio \
-f https://download.pytorch.org/whl/cu124/torch_stable.html -v \
|| { echo 'PyTorch installation failed'; exit 1; }


##############################################################################
# Install, run, and test dependent environments and data
##############################################################################
RUN pip install pytest pytest-cov pytest_mock pytest-random-order \
pre-commit black isort diff-cover \
zarr tensorstore==0.1.45 wrapt tiktoken omegaconf setuptools_scm hydra-core Ray==2.40.0 numpy==1.26.4 pillow==10.4.0 \
git+https://github.com/fanshiqing/[email protected] nltk==3.8.1 \
&& python -m nltk.downloader -d /root/nltk_data punkt


# apex
RUN cd /workspace \
&& git clone https://github.com/NVIDIA/apex \
&& cd apex \
&& pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./


# flash-attention
# Supported flash-attn versions are >= 2.1.1, <= 2.6.3.
# flash-attn==2.6.3
RUN cd /workspace \
&& git clone https://github.com/Dao-AILab/flash-attention.git \
&& cd flash-attention \
&& git checkout c1d146c \
&& git submodule update --init --recursive \
&& MAX_JOBS=96 python setup.py install


# TransformerEngine
RUN cd /workspace \
&& git clone -b stable https://github.com/NVIDIA/TransformerEngine.git \
&& cd TransformerEngine \
&& git submodule update --init --recursive \
&& pip install .


# xformers
RUN cd /workspace \
&& git clone https://github.com/facebookresearch/xformers.git \
&& cd xformers \
&& git submodule update --init --recursive \
&& pip install -v -U .
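This Dockerfile produces the `flagscale_cicd` image that the workflows above pull from the local registry. A sketch of the build-and-publish step (the build context and push target are assumptions inferred from the image references in the workflow files, not commands shown in this commit):

```sh
# Build the CI image and push it to the self-hosted registry the workflows use.
docker build -f docker/Dockerfile.ci \
    -t localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05 .
docker push localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
```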
2 changes: 1 addition & 1 deletion examples/aquila/conf/train/demo_hetero.yaml
@@ -90,7 +90,7 @@ model:
  swiglu: true
  multiple_of: 256
  normalization: RMSNorm
  rotary_interleaved_patch: true
  # rotary_interleaved_patch: true
  untie_embeddings_and_output_weights: true
  init_method_std: 0.0165
  attention_dropout: 0.0
2 changes: 1 addition & 1 deletion examples/aquila/conf/train/train_aquila_7b.yaml
@@ -30,7 +30,7 @@ model:
  swiglu: true
  multiple_of: 256
  normalization: RMSNorm
  rotary_interleaved_patch: true
  # rotary_interleaved_patch: true
  untie_embeddings_and_output_weights: true
  init_method_std: 0.02
  attention_dropout: 0.0
2 changes: 1 addition & 1 deletion examples/llama/conf/train/train_llama2_7b.yaml
@@ -34,7 +34,7 @@ model:
  swiglu: True
  multiple_of: 256
  normalization: RMSNorm
  rotary_interleaved_patch: True
  # rotary_interleaved_patch: True
  untie_embeddings_and_output_weights: True
  init_method_std: 0.02
  attention_dropout: 0.0
2 changes: 1 addition & 1 deletion examples/llama/conf/train/train_llama3_8b.yaml
@@ -35,7 +35,7 @@ model:
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  rotary_interleaved_patch: False
  # rotary_interleaved_patch: False
  position_embedding_type: rope
  rotary_base: 500000
  untie_embeddings_and_output_weights: True
2 changes: 1 addition & 1 deletion examples/llama/conf/train/train_llama3_8b_hetero.yaml
@@ -36,7 +36,7 @@ model:
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  rotary_interleaved_patch: False
  # rotary_interleaved_patch: False
  position_embedding_type: rope
  rotary_base: 500000
  untie_embeddings_and_output_weights: True
@@ -44,6 +44,11 @@ model:
  hidden_dropout: 0.0
  clip_grad: 1.0
  train_iters: 10
  profile: False
  profile-step-start: 10
  profile-step-end: 20
  profile_ranks: 7
  use_pytorch_profiler: True
  eval_iters: 0
  micro_batch_size: 2
  global_batch_size: 512
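These options enable the built-in PyTorch profiler for rank 7 between steps 10 and 20. One common way to inspect the resulting traces is TensorBoard's profiler plugin; a sketch (the log directory is an assumption, since this hunk does not show where the runner writes traces):

```sh
# Sketch: viewing PyTorch profiler traces with the TensorBoard plugin.
pip install torch-tb-profiler
tensorboard --logdir ./outputs/tensorboard --port 6006
```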