-
-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into feat/xgrammar
- Loading branch information
Showing
119 changed files
with
3,727 additions
and
1,365 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
144 changes: 144 additions & 0 deletions
144
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
#!/bin/bash

# Benchmark the overhead of disaggregated prefill.
# Methodology:
# - Send all requests to the prefill vLLM instance. It will buffer the KV cache.
# - Then send all requests to the decode instance.
# - The TTFT of the decode instance is the overhead.

# -e: abort on the first unhandled failing command.
# -x: trace every command before it is executed.
set -ex
|
||
#######################################
# Kill every process whose command line matches "pt_main_thread", remove the
# per-user vLLM config directory, and report the memory still used on GPU 0.
# Globals:   HOME (read, through ~ expansion)
# Arguments: none
# Outputs:   "GPU 0 Memory Usage: <n> MB" on stdout
# Returns:   0 on success; non-zero if pkill reports a real error or
#            nvidia-smi fails
#######################################
kill_gpu_processes() {
  local gpu_memory_usage

  # kill all processes on GPU.
  # pkill exits with status 1 when no process matched; that is not an error
  # here and must not abort the script under `set -e`. Any other non-zero
  # status (bad syntax, fatal error) is still propagated.
  pkill -f pt_main_thread || [[ $? -eq 1 ]]
  sleep 10

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Print the GPU memory usage
  # so that we know if all GPU processes are killed.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  # The memory usage should be 0 MB.
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
|
||
#######################################
# Wait for a vLLM server to start answering HTTP requests.
# Polls localhost:<port>/v1/completions with curl once per second until curl
# succeeds, giving up after 1200 seconds.
# Arguments: $1 - TCP port the server listens on
# Returns:   0 once curl succeeds; 1 if the 1200 s timeout expires or the
#            polling shell fails. A crashed server is not detected directly,
#            it only shows up as the timeout.
#######################################
wait_for_server() {
  local port=$1

  # The port is handed to the inner shell as a positional parameter ($1)
  # instead of being spliced into the script text.
  if timeout 1200 bash -c '
    until curl -s "localhost:${1}/v1/completions" > /dev/null; do
      sleep 1
    done' _ "${port}"; then
    return 0
  fi
  return 1
}
|
||
|
||
#######################################
# Run one overhead measurement of disaggregated prefill.
# Starts a KV-producer server on GPU 0 (port 8100) and a KV-consumer server on
# GPU 1 (port 8200), sends the workload to the producer at an infinite request
# rate, then sends it to the consumer at the requested rate, and finally
# kills the GPU processes.
# Globals:   VLLM_LOGGING_LEVEL, VLLM_HOST_IP (exported)
# Arguments: $1 - request rate (QPS) used for the decode-side run
#            $2 - output length passed as --sonnet-output-len
# Returns:   status of the last command (kill_gpu_processes)
#######################################
benchmark() {
  local results_folder="./results"
  local model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  local dataset_name="sonnet"
  local dataset_path="../sonnet_4x.txt"
  local num_prompts=10
  local qps=$1
  local prefix_len=50
  local input_len=2048
  local output_len=$2

  export VLLM_LOGGING_LEVEL=DEBUG
  # Assign first, export second, so the assignment's status is not hidden
  # behind the always-successful `export`.
  VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  export VLLM_HOST_IP

  # KV producer on GPU 0.
  CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "$model" \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  # KV consumer on GPU 1.
  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "$model" \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  wait_for_server 8100
  wait_for_server 8200

  # Arguments shared by both client runs; only the port and the request rate
  # differ between them.
  # NOTE(review): both runs use the same --result-filename, so the second run
  # presumably replaces the first run's file - confirm this is intended.
  local -a client_args=(
    --backend vllm
    --model "$model"
    --dataset-name "$dataset_name"
    --dataset-path "$dataset_path"
    --sonnet-input-len "$input_len"
    --sonnet-output-len "$output_len"
    --sonnet-prefix-len "$prefix_len"
    --num-prompts "$num_prompts"
    --save-result
    --result-dir "$results_folder"
    --result-filename disagg_prefill_2xtp4.json
  )

  # let the prefill instance finish prefill
  python3 ../benchmark_serving.py "${client_args[@]}" \
    --port 8100 \
    --request-rate "inf"

  # send the request to decode.
  # The TTFT of this command will be the overhead of disagg prefill impl.
  python3 ../benchmark_serving.py "${client_args[@]}" \
    --port 8200 \
    --request-rate "$qps"

  kill_gpu_processes
}
|
||
|
||
#######################################
# Entry point: install missing tools, build the 4x sonnet dataset next to the
# benchmark scripts, reset the results directory and run one benchmark with
# the default settings (QPS 1, output length 1).
# Arguments: none used
# Returns:   status of the benchmark run
#######################################
main() {
  local -a missing=()
  local tool
  local script_dir
  local bench_dir
  local default_qps=1
  local default_output_len=1

  # Collect the missing tools first so the package index is refreshed once,
  # and always before the first install.
  for tool in wget curl jq socat; do
    command -v "$tool" > /dev/null || missing+=("$tool")
  done
  if (( ${#missing[@]} > 0 )); then
    apt-get update
    apt-get install -y "${missing[@]}"
  fi

  pip install quart httpx

  # Resolve the script directory once instead of relying on its name when
  # changing back into it.
  script_dir=$(cd "$(dirname "$0")" && pwd)
  bench_dir="${script_dir}/.."

  # create sonnet-4x.txt: one empty line followed by four copies of sonnet.txt
  echo "" > "${bench_dir}/sonnet_4x.txt"
  for _ in {1..4}; do
    cat "${bench_dir}/sonnet.txt" >> "${bench_dir}/sonnet_4x.txt"
  done

  # The benchmark uses paths relative to the script directory
  # (./results, ../sonnet_4x.txt, ../benchmark_serving.py).
  cd "$script_dir"

  rm -rf results
  mkdir results

  benchmark "$default_qps" "$default_output_len"
}
|
||
|
||
# Run the benchmark, forwarding the script's command-line arguments.
main "$@"
Oops, something went wrong.