Merge branch 'main' into add_video_llavaov
Caozhou1995 authored Dec 9, 2024
2 parents 24fdd6a + 26ada5b commit 13dbaa5
Showing 921 changed files with 64,852 additions and 23,149 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/coverage-tests.yml
@@ -38,14 +38,15 @@ jobs:
          submodules: false
          set-safe-directory: true

      - name: Coverage Online Report
        run: |
          REPORT_ADDR=$(cat "/workspace/config/report_address")
          echo "After 'Run Unit Tests and Check Coverage' is completed/stopped, you can access the test diff-coverage report at http://${REPORT_ADDR}/${{github.sha}}/cov-report-${{ inputs.backend }}/diff-cover-report-${{ inputs.backend }}.html"
      - name: Run Unit Tests and Check Coverage
        run: |
          # Execute the test coverage script with the appropriate backend and id (commit SHA)
          tests/scripts/unit_tests/test_coverage.sh --backend ${{ inputs.backend }} --id ${{ github.sha }} --status online
      - name: Coverage Online Report
        run: |
          REPORT_ADDR=$(cat "/workspace/config/report_address")
          echo "You can access the test diff-coverage report at http://${REPORT_ADDR}/${{github.sha}}/cov-report-${{ inputs.backend }}/diff-cover-report-${{ inputs.backend }}.html"


11 changes: 6 additions & 5 deletions .github/workflows/unit-tests.yml
@@ -40,11 +40,12 @@ jobs:
          lfs: false
          submodules: false
          set-safe-directory: true

      - name: Run Unit Test
        run: tests/scripts/unit_tests/test_subset.sh --backend ${{ inputs.backend }} --subset ${{ inputs.subset }} --id ${{ github.sha }}

      - name: Unit Test Coverage Online Report
        run: |
          REPORT_ADDR=$(cat "/workspace/config/report_address")
          echo "You can access the test coverage report at http://${REPORT_ADDR}/${{github.sha}}/cov-report-${{ inputs.backend }}/index.html"
          echo "After 'Run Unit Test' is completed/stopped, you can access the test coverage report at http://${REPORT_ADDR}/${{github.sha}}/cov-report-${{ inputs.backend }}/index.html"
      - name: Run Unit Test
        run: tests/scripts/unit_tests/test_subset.sh --backend ${{ inputs.backend }} --subset ${{ inputs.subset }} --id ${{ github.sha }}

4 changes: 3 additions & 1 deletion .gitignore
@@ -8,4 +8,6 @@ slurm*
logs
.vscode
log_file/*
outputs
outputs
*.log
*.out
26 changes: 22 additions & 4 deletions README.md
@@ -1,3 +1,5 @@
[<img src="flagopen.png">](https://flagopen.baai.ac.cn/)

## Latest News
- **[2024/11]** Released [v0.6.0](https://github.com/FlagOpen/FlagScale/tree/release/v0.6.0):
- Introduced general multi-dimensional heterogeneous parallelism and CPU-based communication between different chips.
@@ -12,7 +14,9 @@

[FlagScale](https://github.com/FlagOpen/FlagScale.git) is a comprehensive toolkit designed to support the entire lifecycle of large models, developed with the backing of the Beijing Academy of Artificial Intelligence (BAAI). It builds on the strengths of several prominent open-source projects, including [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) and [vllm](https://github.com/vllm-project/vllm), to provide a robust, end-to-end solution for managing and scaling large models.

The primary objective of FlagScale is to enable seamless scalability across diverse hardware architectures while maximizing computational resource efficiency and enhancing model performance. By offering essential components for model development, training, and deployment, FlagScale aims to serve as an indispensable toolkit for optimizing both the speed and effectiveness of large model workflows.
The primary objective of FlagScale is to enable seamless scalability across diverse hardware architectures while maximizing computational resource efficiency and enhancing model performance. By offering essential components for model development, training, and deployment, FlagScale seeks to establish itself as an indispensable toolkit for optimizing both the speed and effectiveness of large model workflows.

FlagScale is also a part of [FlagAI-Open](https://flagopen.baai.ac.cn/), an open-source initiative by BAAI that aims to foster an open-source ecosystem for AI technologies. It serves as a platform where developers, researchers, and AI enthusiasts can collaborate on various AI projects, contribute to the development of cutting-edge AI solutions, and share their work with the global community.

## Quick Start

@@ -43,13 +47,15 @@ We recommend using the latest release of [NGC's PyTorch container](https://catal
cd vllm
pip install .
cd megatron-energon
pip install .
pip install -e ./megatron-energon
cp -r megatron-energon/src/megatron/energon megatron/megatron
```
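A quick way to confirm the install steps resolved correctly (our suggestion, not part of the official instructions; assumes you run it in the same Python environment):

```python
# Sanity check: confirm both packages are importable after the steps above.
import importlib

for pkg in ("vllm", "megatron.energon"):
    try:
        importlib.import_module(pkg)
        print(f"{pkg}: OK")
    except ImportError as exc:
        print(f"{pkg}: FAILED ({exc})")
```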

### Run a Task

FlagScale provides a unified runner for various tasks, including training and inference. Simply specify the configuration file to run the task with a single command. The runner will automatically load the configurations and execute the task. The following example demonstrates how to run a distributed training task.
FlagScale provides a unified runner for various tasks, including training, inference, and serving. Simply specify the configuration file to run the task with a single command. The runner will automatically load the configurations and execute the task. The following examples demonstrate how to run a distributed training task and how to serve a model.

#### Train

1. Start the distributed training job:
```sh
@@ -62,6 +68,18 @@ FlagScale provides a unified runner for various tasks, including training and in
python run.py --config-path ./examples/aquila/conf --config-name config action=stop
```

#### Serve

1. Start the server:
```sh
python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=run
```
2. Stop the server:
```sh
python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=stop
```
For more details, please refer to [Quick Start](./flagscale/serve/README.md).
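Once the server is up, it can be queried like any vLLM-backed endpoint. A minimal client sketch, assuming the default OpenAI-compatible endpoint on `localhost:8000` and a served model name of `qwen2.5-7b` (both are assumptions; check your serve config for the actual values):

```python
# Hypothetical client call: host, port, and model name are assumptions,
# not values taken from the FlagScale configs shown here.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "qwen2.5-7b",  # assumed served model name
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 32,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```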

## License

This project is licensed under the [Apache License (Version 2.0)](https://github.com/FlagOpen/FlagScale/blob/main/LICENSE). This project also contains other third-party components under other open-source licenses. See the [LICENSE](https://github.com/FlagOpen/FlagScale/blob/main/LICENSE) file for more information.
67 changes: 0 additions & 67 deletions examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml

This file was deleted.

100 changes: 100 additions & 0 deletions examples/llama/conf/train/train_llama3_8b_hetero.yaml
@@ -0,0 +1,100 @@
system:
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 2
  disable_bias_linear: True
  use_flash_attn: True
  sequence_parallel: True
  use_distributed_optimizer: True
  precision:
    bf16: True
    attention_softmax_in_fp32: true
    accumulate_allreduce_grads_in_fp32: true
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-llama3-8B"
    wandb_exp_name: "train-test-8B"
  checkpoint:
    load: outputs_llama3/checkpoint_mc
    save_interval: 10
    finetune: True
    ckpt_format: "torch"

model:
  use_mcore_models: True
  transformer_impl: transformer_engine
  num_layers: 32
  hidden_size: 4096
  ffn_hidden_size: 14336
  num_attention_heads: 32
  seq_length: 4096
  group_query_attention: True
  num_query_groups: 8
  max_position_embeddings: 8192
  norm_epsilon: 1e-5
  use_rotary_position_embeddings: True
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  rotary_interleaved_patch: False
  position_embedding_type: rope
  rotary_base: 500000
  untie_embeddings_and_output_weights: True
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  clip_grad: 1.0
  train_samples: 200000
  eval_iters: 100
  eval_interval: 1000
  micro_batch_size: 1
  global_batch_size: 16

  hetero:
    enable_hetero: True
    hetero_use_cpu_communication: True
    # mesh format [tp1,cp1,ep1,dp1,pp1,(tp2,cp2...)]

    # 2 mesh, diff tp dp pp
    hetero_pipeline_layer_split: [18, 14]
    hetero_process_meshes: [2, 1, 1, 4, 1, 4, 1, 1, 2, 1]
    hetero_device_types: ["A800", "A100"]

    standalone_embedding_stage: False
    hetero_current_device_type: "A800"

  # recompute:
  #   recompute_granularity: "full"
  #   recompute_method: "uniform"
  #   recompute_num_layers: 1

  # ## pp 2 stages and num_micro_batches 4
  # recompute_granularity_per_stage_micro_batch:
  #   - [1, 3, 0, 1, 0]
  #   - [1, 3, 1, 1, 1]
  # recompute_method_per_stage_micro_batch:
  #   - [1, 3, 0, 1, 0]
  #   - [1, 3, 0, 1, 0]
  # recompute_num_layers_per_stage_micro_batch:
  #   - [1, 3, 2, 1, 2]
  #   - [1, 3, 1, 1, 1]

  optimizer:
    weight_decay: 1e-2
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 1.0e-5
      min_lr: 1.0e-6
      lr_warmup_fraction: .1
      lr_decay_style: cosine

data:
  data_path: examples/llama/pile-openwebtext_text_document/pile-openwebtext_text_document
  split: 1
  tokenizer:
    tokenizer_type: Llama3TokenizerFS
    tokenizer_path: meta-llama3/Meta-Llama-3-8B
    vocab_size: 128256
    make_vocab_size_divisible_by: 64
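The hetero mesh encoding above packs one `[tp, cp, ep, dp, pp]` tuple per device mesh into a flat list, per the in-file comment. Below is a sketch of our reading of that layout (an illustration, not FlagScale's own parser):

```python
# Decode hetero_process_meshes from the config above into per-device meshes.
# The 5-way grouping follows the in-file comment
# "[tp1,cp1,ep1,dp1,pp1,(tp2,cp2...)]"; this is an illustration only.
hetero_process_meshes = [2, 1, 1, 4, 1, 4, 1, 1, 2, 1]
hetero_device_types = ["A800", "A100"]
hetero_pipeline_layer_split = [18, 14]

DIMS = ("tp", "cp", "ep", "dp", "pp")
meshes = [
    dict(zip(DIMS, hetero_process_meshes[i : i + 5]))
    for i in range(0, len(hetero_process_meshes), 5)
]
for mesh, device, layers in zip(meshes, hetero_device_types, hetero_pipeline_layer_split):
    gpus = mesh["tp"] * mesh["cp"] * mesh["dp"] * mesh["pp"]  # ep folds into dp here
    print(f"{device}: {mesh} -> {gpus} GPUs, {layers} transformer layers")
# The layer split 18 + 14 matches num_layers: 32, and each mesh contributes one
# pipeline stage, consistent with pipeline_model_parallel_size: 2.
```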
@@ -44,6 +44,11 @@ model:
  hidden_dropout: 0.0
  clip_grad: 1.0
  train_iters: 10
  profile: False
  profile_step_start: 10
  profile_step_end: 20
  profile_ranks: 7
  use_pytorch_profiler: True
  eval_iters: 0
  micro_batch_size: 2
  global_batch_size: 512
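For orientation, the profile keys above express a step-windowed PyTorch profiler run. A rough standalone equivalent (a sketch of what the settings mean, not Megatron's actual wiring):

```python
# Profile steps 10-20 with torch.profiler, mirroring profile_step_start/end.
import torch

def train_step(step):
    pass  # placeholder for the real training step

with torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=10, warmup=0, active=10),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./profile"),
) as prof:
    for step in range(30):
        train_step(step)
        prof.step()  # advance the profiling schedule each step
```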
File renamed without changes.
30 changes: 30 additions & 0 deletions examples/qwen/conf/config_qwen2.5_1.5b.yaml
@@ -0,0 +1,30 @@
defaults:
  - _self_
  - train: train_qwen_2.5_1.5b
  # - train: train_mixtral_1.8b

experiment:
  exp_name: train_qwen_2.5_1.5b
  exp_dir: ./outputs # logs and checkpoints output path
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 2
    nproc_per_node: 8
    hostfile: torchrun # Please replace with your actual hostfile path
  envs:
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_SOCKET_IFNAME: eth0
    NCCL_IB_DISABLE: 0
    NCCL_IB_CUDA_SUPPORT: 1
    NCCL_IB_GID_INDEX: 0
    NCCL_DEBUG: INFO
    OMP_NUM_THREADS: 4
    GLOO_SOCKET_IFNAME: eth0
    NCCL_IB_HCA: mlx5_2,mlx5_5

action: run
22 changes: 22 additions & 0 deletions examples/qwen/conf/config_qwen2.5_72b_tp.yaml
@@ -0,0 +1,22 @@
defaults:
  - _self_
  - serve: serve_qwen2.5_72b

experiment:
  exp_name: qwen2.5_72b
  exp_dir: outputs/${experiment.exp_name}
  task:
    type: serve
    backend: vllm
    entrypoint: null
  runner:
    hostfile: null
  envs:
    CUDA_VISIBLE_DEVICES: 0,1,2,3
    CUDA_DEVICE_MAX_CONNECTIONS: 1

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
22 changes: 22 additions & 0 deletions examples/qwen/conf/config_qwen2.5_7b.yaml
@@ -0,0 +1,22 @@
defaults:
  - _self_
  - serve: serve_qwen2.5_7b

experiment:
  exp_name: qwen2.5_7b
  exp_dir: outputs/${experiment.exp_name}
  task:
    type: serve
    backend: vllm
    entrypoint: null
  runner:
    hostfile: null
  envs:
    CUDA_VISIBLE_DEVICES: 0
    CUDA_DEVICE_MAX_CONNECTIONS: 1

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
25 changes: 25 additions & 0 deletions examples/qwen/conf/config_ssh_qwen2.5_7b.yaml
@@ -0,0 +1,25 @@
defaults:
  - _self_
  - serve: serve_qwen2.5_7b

experiment:
  exp_name: qwen2.5_7b
  exp_dir: outputs/${experiment.exp_name}
  task:
    type: serve
    backend: vllm
    entrypoint: null
  runner:
    hostfile: /path/to/hostfile # format: {remote ip} slots={gpu num} type={gpu type} (e.g., x.x.x.x slots=8 type=A100)
    ssh_port: 22 # replace with your ssh port
  envs:
    CUDA_VISIBLE_DEVICES: 0
    CUDA_DEVICE_MAX_CONNECTIONS: 1
  cmds:
    before_start: source /root/miniconda3/bin/activate flagscale

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
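For reference, each hostfile line follows the pattern given in the inline comment above, `{remote ip} slots={gpu num} type={gpu type}`; the addresses below are placeholders:

```
10.0.0.1 slots=8 type=A100
10.0.0.2 slots=8 type=A100
```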
