Commit 34bd35b: merge upstream

lzy-dev committed Dec 26, 2024
2 parents 9ffba0f + f2bc020

Showing 1,594 changed files with 241,118 additions and 38,423 deletions.
2 changes: 1 addition & 1 deletion .coveragerc
@@ -2,4 +2,4 @@
directory = coverage

[run]
data_file = .coverage_$LOCAL_RANK
data_file = /workspace/report/$COMMIT_ID/cov-temp-flagscale/.coverage_$LOCAL_RANK
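This change scopes the per-rank coverage data files to a per-commit report directory, so parallel CI runs no longer clobber each other. As a hedged sketch of how such per-rank files are typically merged downstream (the working directory and compare branch are assumptions; `diff-cover` and the report filename do appear elsewhere in this commit):

```sh
# Sketch: combining per-rank coverage files like the ones this .coveragerc
# produces. COMMIT_ID and the compare branch are assumptions.
cd /workspace/report/$COMMIT_ID/cov-temp-flagscale
coverage combine .coverage_*    # merge .coverage_0, .coverage_1, ...
coverage xml -o coverage.xml    # one merged report
diff-cover coverage.xml --compare-branch=origin/main \
    --html-report diff-cover-report-flagscale.html
```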
17 changes: 17 additions & 0 deletions .github/workflows/all-tests.yml
@@ -11,19 +11,28 @@ concurrency:
  cancel-in-progress: true

jobs:
  # Megatron Report Clean
  megatron-report-clean:
    uses: ./.github/workflows/report-clean.yml
    with:
      backend: megatron

  # Megatron Unit Tests with Matrix
  megatron-unit-tests:
    needs: megatron-report-clean
    uses: ./.github/workflows/unit-tests.yml
    strategy:
      matrix:
        subset:
          - data
          - dist_checkpointing
          - distributed
          - export
          - fusions
          - inference
          - models
          - pipeline_parallel
          - ssm
          - tensor_parallel
          - transformer/moe
          - transformer
@@ -33,8 +42,15 @@ jobs:
      backend: megatron
      subset: ${{ matrix.subset }}

  # FlagScale Report Clean
  flagscale-report-clean:
    uses: ./.github/workflows/report-clean.yml
    with:
      backend: flagscale

  # Flagscale Unit Tests with Matrix
  flagscale-unit-tests:
    needs: flagscale-report-clean
    uses: ./.github/workflows/unit-tests.yml
    strategy:
      matrix:
@@ -57,6 +73,7 @@
        model:
          - aquila
          - mixtral
          # - llava_onevision
    name: "train-${{ matrix.model }}"
    with:
      model: ${{ matrix.model }}
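The matrix fans one reusable unit-tests workflow out across test subsets, and the new `needs:` edges make each run start from a freshly cleaned report directory. A sketch of roughly what one matrix entry corresponds to locally (the `tests/unit_tests/<subset>` layout and the torchrun invocation are assumptions based on the subset names, not something this diff shows):

```sh
# Sketch: the local equivalent of the subset=data matrix entry, assuming a
# Megatron-style tests/unit_tests/<subset> layout. torchrun sets LOCAL_RANK
# per worker, which the .coveragerc data_file pattern above consumes.
torchrun --nproc_per_node=1 -m pytest --cov --cov-append tests/unit_tests/data
```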
2 changes: 1 addition & 1 deletion .github/workflows/coverage-tests.yml
@@ -11,7 +11,7 @@ jobs:
  test-coverage:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v1.5
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
7 changes: 4 additions & 3 deletions .github/workflows/format.yml
@@ -20,7 +20,7 @@ env:
jobs:
  format:
    runs-on: ubuntu-latest
    runs-on: ubuntu-22.04

    steps:
      - name: Checkout Code
@@ -37,8 +37,9 @@ jobs:
      - name: Run Black
        run: |
          black --verbose --include $INCLUDE_FILES ./ --diff
          black --verbose --include "$INCLUDE_FILES" ./ --check || { echo "Code formatting does not comply with Black's rules. Please reformat the code according to Black and resubmit."; exit 1; }
      - name: Run Isort
        run: |
          isort --verbose --profile black $INCLUDE_FILES --diff --known-local-folder flagscale
          isort --verbose --profile black $INCLUDE_FILES --check-only --diff --known-local-folder flagscale || { echo "Import order does not comply with isort rules. Please fix the import order and resubmit."; exit 1; }
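With `--check`/`--check-only` added, the job now fails on violations instead of only printing a diff, so it may help to run the same gate locally before pushing. A sketch (the value of `INCLUDE_FILES` lives in the workflow's `env:` block, which this hunk does not show, so the pattern below is a placeholder):

```sh
# Local pre-push check mirroring the CI gate above.
INCLUDE_FILES="your-pattern-here"   # placeholder: copy the real value from format.yml
black --include "$INCLUDE_FILES" ./ --check
isort --profile black $INCLUDE_FILES --check-only --known-local-folder flagscale
```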
2 changes: 1 addition & 1 deletion .github/workflows/functional-tests.yml
@@ -15,7 +15,7 @@ jobs:
  functional-test:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v1.5
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
32 changes: 32 additions & 0 deletions .github/workflows/report-clean.yml
@@ -0,0 +1,32 @@
name: Clean Old Report

on:
  workflow_call:
    inputs:
      backend:
        required: true
        type: string

jobs:
  clean-report:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
        - /home/flagscale_cicd/flask/static:/workspace/report
        - /home/flagscale_cicd/flask/config:/workspace/config
      options: --hostname flagscale_cicd

    steps:
      - name: Clean Old Report
        run: |
          REPORT_ADDR=$(cat "/workspace/config/report_address")
          echo "Clean old report at http://${REPORT_ADDR}/${{ github.sha }}/cov-report-${{ inputs.backend }}/diff-cover-report-${{ inputs.backend }}.html"
          if [ -d "/workspace/report/${{ github.sha }}/cov-report-${{ inputs.backend }}" ]; then
            rm -r /workspace/report/${{ github.sha }}/cov-report-${{ inputs.backend }}
          fi
          if [ -d "/workspace/report/${{ github.sha }}/cov-temp-${{ inputs.backend }}" ]; then
            rm -r /workspace/report/${{ github.sha }}/cov-temp-${{ inputs.backend }}
          fi
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
@@ -15,7 +15,7 @@ jobs:
  unit-test:
    runs-on: self-hosted
    container:
      image: localhost:5000/flagscale_cicd:v1.5
      image: localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
      ports:
        - 80
      volumes:
4 changes: 3 additions & 1 deletion .gitignore
@@ -8,4 +8,6 @@ slurm*
logs
.vscode
log_file/*
outputs
outputs
*.log
*.out
16 changes: 15 additions & 1 deletion README.md
@@ -53,7 +53,9 @@ We recommend using the latest release of [NGC's PyTorch container](https://catal

### Run a Task

FlagScale provides a unified runner for various tasks, including training and inference. Simply specify the configuration file to run the task with a single command. The runner will automatically load the configurations and execute the task. The following example demonstrates how to run a distributed training task.
FlagScale provides a unified runner for various tasks, including training, inference, and serve. Simply specify the configuration file to run the task with a single command. The runner will automatically load the configurations and execute the task. The following example demonstrates how to run a distributed training task.

#### Train

1. Start the distributed training job:
```sh
@@ -66,6 +68,18 @@ FlagScale provides a unified runner for various tasks, including training and in
python run.py --config-path ./examples/aquila/conf --config-name config action=stop
```

#### Serve

1. Start the server:
```sh
python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=run
```
2. Stop the server:
```sh
python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=stop
```
For more details, please refer to [Quick Start](./flagscale/serve/README.md).
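Once the server is up, a quick smoke test might look like the following (the port and the OpenAI-compatible route are assumptions about a typical serving setup, not something this README states; see the linked Quick Start for the actual endpoints):

```sh
# Hypothetical smoke test against the served model; endpoint is an assumption.
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "qwen2.5-7b", "prompt": "Hello,", "max_tokens": 16}'
```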

## License

This project is licensed under the [Apache License (Version 2.0)](https://github.com/FlagOpen/FlagScale/blob/main/LICENSE). This project also contains other third-party components under other open-source licenses. See the [LICENSE](https://github.com/FlagOpen/FlagScale/blob/main/LICENSE) file for more information.
100 changes: 100 additions & 0 deletions docker/Dockerfile.ci
@@ -0,0 +1,100 @@
FROM nvcr.io/nvidia/pytorch:24.05-py3

ENV DEBIAN_FRONTEND noninteractive
ENV TZ=Asia/Shanghai


##############################################################################
# To avoid "curl 92 HTTP/2 stream 0 was not closed cleanly: CANCEL (err 8)" or "fetch-pack: unexpected disconnect while reading sideband packet".
##############################################################################
# lowSpeedTime=300s lowSpeedLimit=100B
RUN git config --global http.lowSpeedTime 300 \
&& git config --global http.lowSpeedLimit 100 \
&& git config --global http.postBuffer 524288000


##############################################################################
# Change apt source to Ksyun
##############################################################################
RUN sed -i "s#\S\+#http://apt.ksyun.cn/ubuntu/#2" /etc/apt/sources.list && \
> /etc/apt/apt.conf.d/docker-clean && \
> /etc/dpkg/dpkg.cfg.d/pkg-config-hook-config


##############################################################################
# Install basic utilities
##############################################################################
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common build-essential autotools-dev \
nfs-common pdsh \
curl wget vim tmux less unzip \
htop iftop iotop ca-certificates openssh-client openssh-server \
rsync iputils-ping net-tools sudo \
tzdata psmisc screen libx11-dev llvm-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*


##############################################################################
# Uninstall unnecessary packages and their dependencies
##############################################################################
RUN pip install --upgrade pip && pip install pip-autoremove && \
pip-autoremove torch torchvision torchaudio torch-tensorrt transformer_engine \
pytorch-quantization pytorch-triton \
flash-attn tensorboard apex cudf dask-cudf \
cugraph cugraph-dgl cugraph-pyg cugraph-service-server -y


##############################################################################
# Install PyTorch
##############################################################################
RUN pip install --upgrade pip \
&& pip install --no-cache-dir torch==2.5.1 torchvision torchaudio \
-f https://download.pytorch.org/whl/cu124/torch_stable.html -v \
|| { echo 'PyTorch installation failed'; exit 1; }


##############################################################################
# Install, run, and test dependent environments and data
##############################################################################
RUN pip install pytest pytest-cov pytest_mock pytest-random-order \
pre-commit black isort diff-cover \
zarr tensorstore==0.1.45 wrapt tiktoken omegaconf setuptools_scm hydra-core Ray==2.40.0 numpy==1.26.4 pillow==10.4.0 \
git+https://github.com/fanshiqing/[email protected] nltk==3.8.1 \
&& python -m nltk.downloader -d /root/nltk_data punkt


# apex
RUN cd /workspace \
&& git clone https://github.com/NVIDIA/apex \
&& cd apex \
&& pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./


# flash-attention
# Supported flash-attn versions are >= 2.1.1, <= 2.6.3.
# flash-attn==2.6.3
RUN cd /workspace \
&& git clone https://github.com/Dao-AILab/flash-attention.git \
&& cd flash-attention \
&& git checkout c1d146c \
&& git submodule update --init --recursive \
&& MAX_JOBS=96 python setup.py install


# TransformerEngine
RUN cd /workspace \
&& git clone -b stable https://github.com/NVIDIA/TransformerEngine.git \
&& cd TransformerEngine \
&& git submodule update --init --recursive \
&& pip install .


# xformers
RUN cd /workspace \
&& git clone https://github.com/facebookresearch/xformers.git \
&& cd xformers \
&& git submodule update --init --recursive \
&& pip install -v -U .
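This Dockerfile produces the `flagscale_cicd` image that the workflows above pull from the local registry. A sketch of the build-and-publish step (the build context and push target are assumptions inferred from the image references in the workflow files, not commands shown in this commit):

```sh
# Build the CI image and push it to the self-hosted registry the workflows use.
docker build -f docker/Dockerfile.ci \
    -t localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05 .
docker push localhost:5000/flagscale_cicd:v2.0-pytorch-2.5.1-cuda-12.4.131-ngc-24.05
```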
2 changes: 1 addition & 1 deletion examples/aquila/conf/train/demo_hetero.yaml
@@ -90,7 +90,7 @@ model:
  swiglu: true
  multiple_of: 256
  normalization: RMSNorm
  rotary_interleaved_patch: true
  # rotary_interleaved_patch: true
  untie_embeddings_and_output_weights: true
  init_method_std: 0.0165
  attention_dropout: 0.0
2 changes: 1 addition & 1 deletion examples/aquila/conf/train/train_aquila_7b.yaml
@@ -30,7 +30,7 @@ model:
  swiglu: true
  multiple_of: 256
  normalization: RMSNorm
  rotary_interleaved_patch: true
  # rotary_interleaved_patch: true
  untie_embeddings_and_output_weights: true
  init_method_std: 0.02
  attention_dropout: 0.0
2 changes: 1 addition & 1 deletion examples/llama/conf/train/train_llama2_7b.yaml
@@ -34,7 +34,7 @@ model:
  swiglu: True
  multiple_of: 256
  normalization: RMSNorm
  rotary_interleaved_patch: True
  # rotary_interleaved_patch: True
  untie_embeddings_and_output_weights: True
  init_method_std: 0.02
  attention_dropout: 0.0
2 changes: 1 addition & 1 deletion examples/llama/conf/train/train_llama3_8b.yaml
@@ -35,7 +35,7 @@ model:
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  rotary_interleaved_patch: False
  # rotary_interleaved_patch: False
  position_embedding_type: rope
  rotary_base: 500000
  untie_embeddings_and_output_weights: True
2 changes: 1 addition & 1 deletion examples/llama/conf/train/train_llama3_8b_hetero.yaml
@@ -36,7 +36,7 @@ model:
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  rotary_interleaved_patch: False
  # rotary_interleaved_patch: False
  position_embedding_type: rope
  rotary_base: 500000
  untie_embeddings_and_output_weights: True
@@ -44,6 +44,11 @@ model:
  hidden_dropout: 0.0
  clip_grad: 1.0
  train_iters: 10
  profile: False
  profile-step-start: 10
  profile-step-end: 20
  profile_ranks: 7
  use_pytorch_profiler: True
  eval_iters: 0
  micro_batch_size: 2
  global_batch_size: 512
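These options enable the built-in PyTorch profiler for rank 7 between steps 10 and 20. One common way to inspect the resulting traces is TensorBoard's profiler plugin; a sketch (the log directory is an assumption, since this hunk does not show where the runner writes traces):

```sh
# Sketch: viewing PyTorch profiler traces with the TensorBoard plugin.
pip install torch-tb-profiler
tensorboard --logdir ./outputs/tensorboard --port 6006
```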