diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh index 1a88d038b4b52..f6e41fcfdd0be 100644 --- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh +++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh @@ -70,23 +70,13 @@ wait_for_server() { kill_gpu_processes() { # kill all processes on GPU. - pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) - if [ -z "$pids" ]; then - echo "No GPU processes found." - else - for pid in $pids; do - kill -9 "$pid" - echo "Killed process with PID: $pid" - done - echo "All GPU processes have been killed." - fi + ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9 + ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 - # waiting for GPU processes to be fully killed - # loop while nvidia-smi returns any processes - while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do + # wait until GPU memory usage smaller than 1GB + while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do sleep 1 - echo "Waiting for GPU processes to be killed" done # remove vllm config file diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md index 891e4917070d9..da32d1f073cea 100644 --- a/.buildkite/nightly-benchmarks/tests/descriptions.md +++ b/.buildkite/nightly-benchmarks/tests/descriptions.md @@ -1,47 +1,42 @@ ## Latency tests -This test suite aims to test vllm's end-to-end latency under a controlled setup. - - Input length: 32 tokens. - Output length: 128 tokens. - Batch size: fixed (8). -- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: end-to-end latency (mean, median, p99). -### Latency benchmarking results {latency_tests_markdown_table} -## Throughput tests -This test suite aims to test vllm's throughput. +## Throughput tests - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Output length: the corresponding output length of these 200 prompts. - Batch size: dynamically determined by vllm to achieve maximum throughput. -- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: throughput. -### Throughput benchmarking results {throughput_tests_markdown_table} -## Serving tests -This test suite aims to test vllm's real serving metrics. +## Serving tests - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Output length: the corresponding output length of these 200 prompts. - Batch size: dynamically determined by vllm and the arrival pattern of the requests. - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). -- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- We also added a speculative decoding test for llama-3 70B, under QPS 2 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). -### Serving benchmarking results {serving_tests_markdown_table} + ## json version of the benchmarking tables This section contains the data of the markdown tables above in JSON format. diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json index 06488cd79110a..1841186da158f 100644 --- a/.buildkite/nightly-benchmarks/tests/latency-tests.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -2,7 +2,7 @@ { "test_name": "latency_llama8B_tp1", "parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "num_iters_warmup": 5, @@ -12,7 +12,7 @@ { "test_name": "latency_llama70B_tp4", "parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "num-iters-warmup": 5, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index 300af0524d7c0..facb0eac749ca 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -3,7 +3,7 @@ "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], "server_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", @@ -11,7 +11,7 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -22,7 +22,7 @@ "test_name": "serving_llama70B_tp4_sharegpt", "qps_list": [1, 4, 16, "inf"], "server_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "swap_space": 16, "disable_log_stats": "", @@ -30,7 +30,7 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -60,7 +60,7 @@ "test_name": "serving_llama70B_tp4_sharegpt_specdecode", "qps_list": [2], "server_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "disable_log_requests": "", "tensor_parallel_size": 4, "swap_space": 16, @@ -70,7 +70,7 @@ "use_v2_block_manager": "" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json index 41ac135748704..91ef6d16be638 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -2,7 +2,7 @@ { "test_name": "throughput_llama8B_tp1", "parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -13,7 +13,7 @@ { "test_name": "throughput_llama70B_tp4", "parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",