Merge branch 'main' into feat/xgrammar
mgoin committed Dec 2, 2024
2 parents 8d3c671 + 9b14d97 commit 9f97093
Showing 119 changed files with 3,727 additions and 1,365 deletions.
6 changes: 4 additions & 2 deletions .buildkite/run-xpu-test.sh
@@ -12,5 +12,7 @@ remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
# Run the image and test offline inference/tensor parallel
docker run -it -d --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test /bin/bash
docker exec xpu-test bash -c "python3 examples/offline_inference.py"
docker exec xpu-test bash -c "python3 examples/offline_inference_cli.py -tp 2"
8 changes: 6 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -334,7 +334,6 @@ steps:
commands:
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- pytest -v -s models/embedding/vision_language -m core_model

- label: Language Models Test (Extended) # 50min
optional: true
@@ -346,7 +345,6 @@ steps:
commands:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'

- label: Multi-Modal Models Test (Standard) # 26min
#mirror_hardwares: [amd]
@@ -359,6 +357,7 @@ steps:
commands:
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model

@@ -376,6 +375,7 @@ steps:
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

@@ -430,6 +430,9 @@ steps:
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
commands:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
@@ -443,6 +446,7 @@ steps:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py

- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -522,7 +522,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG fdf6d72b48aea41f4ae6a89139a453dae554abc8
GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
65 changes: 65 additions & 0 deletions benchmarks/benchmark_serving.py
@@ -199,6 +199,56 @@ def sample_sonnet_requests(
    return sampled_requests


def sample_mmmu_pro_vision_requests(
    dataset,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
    sampled_requests: List[Tuple[str, int, int, Dict[str,
                                                     Collection[str]]]] = []
    for data in dataset:
        if len(sampled_requests) == num_requests:
            break

        # MMMU-Pro vision direct prompt
        # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
        prompt = (
            "Answer with the option letter from the given choices directly. "
            "The last line of your response should be of the following "
            "format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
            "options.")

        prompt_token_ids = tokenizer(prompt).input_ids
        if fixed_output_len is None:
            # Default max output len is set to 128
            print("--hf-output-len is not provided. Using default value 128.")
            fixed_output_len = 128

        prompt_len = len(prompt_token_ids)
        output_len = fixed_output_len

        assert isinstance(
            data["image"],
            Image), ("Input image format must be `PIL.Image.Image`, "
                     f"given {type(data['image'])}.")
        image: Image = data["image"]
        image = image.convert("RGB")
        image_data = io.BytesIO()
        image.save(image_data, format='JPEG')
        image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
        mm_content = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            },
        }

        sampled_requests.append((prompt, prompt_len, output_len, mm_content))

    return sampled_requests


def sample_hf_requests(
    dataset_path: str,
    dataset_subset: str,
@@ -208,6 +258,21 @@ def sample_hf_requests(
    random_seed: int,
    fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:

    # Special case for MMMU-Pro vision dataset
    if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
        assert dataset_split == "test"
        dataset = load_dataset(dataset_path,
                               name=dataset_subset,
                               split=dataset_split,
                               streaming=True)
        assert "image" in dataset.features, (
            "MMMU/MMMU_Pro vision dataset must have 'image' column.")
        filter_func = lambda x: isinstance(x["image"], Image)
        dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
        return sample_mmmu_pro_vision_requests(dataset, num_requests,
                                               tokenizer, fixed_output_len)

    dataset = load_dataset(dataset_path,
                           name=dataset_subset,
                           split=dataset_split,
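
For context, the mm_content entries built above use the OpenAI chat-completions image_url content format with a base64 data URL. Below is a minimal sketch of how one sampled (prompt, prompt_len, output_len, mm_content) tuple could be replayed against an OpenAI-compatible vLLM server; it is illustrative only and not part of this commit, and the endpoint URL, model name, and use of the requests library are assumptions.

# Illustrative sketch (not part of this commit): replay one sampled request
# against an OpenAI-compatible server. The base_url and model are assumptions.
import requests

def send_vision_request(prompt: str, output_len: int, mm_content: dict,
                        base_url: str = "http://localhost:8000/v1",
                        model: str = "Qwen/Qwen2-VL-7B-Instruct") -> str:
    payload = {
        "model": model,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                # mm_content is the {"type": "image_url", ...} dict built above
                mm_content,
            ],
        }],
        "max_tokens": output_len,
    }
    resp = requests.post(f"{base_url}/chat/completions", json=payload)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]
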
144 changes: 144 additions & 0 deletions benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -0,0 +1,144 @@
#!/bin/bash

# Benchmark the overhead of disaggregated prefill.
# Methodology:
# - Send all requests to the prefill vLLM instance, which buffers the KV cache.
# - Then send all requests to the decode instance.
# - The TTFT of the decode instance is the overhead.

set -ex

kill_gpu_processes() {
# kill all processes on GPU.
pkill -f pt_main_thread
sleep 10

# remove vllm config file
rm -rf ~/.config/vllm

# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}


benchmark() {

export VLLM_LOGGING_LEVEL=DEBUG
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

# compare chunked prefill with disaggregated prefill

results_folder="./results"
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
dataset_name="sonnet"
dataset_path="../sonnet_4x.txt"
num_prompts=10
qps=$1
prefix_len=50
input_len=2048
output_len=$2


CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3.1-8B-Instruct \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3.1-8B-Instruct \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

wait_for_server 8100
wait_for_server 8200

# let the prefill instance finish prefill
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_2xtp4.json \
--request-rate "inf"


# Send the requests to the decode instance.
# The TTFT of this command is the overhead of the disagg prefill implementation.
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_2xtp4.json \
--request-rate "$qps"
kill_gpu_processes

}


main() {

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)

pip install quart httpx

cd "$(dirname "$0")"

cd ..
# create sonnet_4x.txt
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
cd disagg_benchmarks

rm -rf results
mkdir results

default_qps=1
default_output_len=1
benchmark $default_qps $default_output_len

}


main "$@"