Skip to content

Commit

Permalink
Merge branch 'vllm-project:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
lk-chen authored Nov 25, 2024
2 parents 9b002b0 + 1b583cf commit 436beb2
Show file tree
Hide file tree
Showing 246 changed files with 9,165 additions and 3,065 deletions.
39 changes: 21 additions & 18 deletions .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ steps:
- wait

- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
plugins:
Expand Down Expand Up @@ -45,6 +46,7 @@ steps:
medium: Memory

- label: "H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
plugins:
Expand All @@ -63,21 +65,22 @@ steps:
- VLLM_USAGE_SOURCE
- HF_TOKEN


# - label: "H100"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# command:
# - bash
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
# mount-buildkite-agent: true
# propagate-environment: true
# ipc: host
# gpus: all
# environment:
# - VLLM_USAGE_SOURCE
# - HF_TOKEN

- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,17 @@ def results_to_json(latency, throughput, serving):
throughput_results,
serving_results)

# Sort all dataframes by their respective "Test name" columns
for df in [latency_results, serving_results, throughput_results]:
if not df.empty:
df.sort_values(by="Test name", inplace=True)
if df.empty:
continue

# Sort all dataframes by their respective "Test name" columns
df.sort_values(by="Test name", inplace=True)

# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")

# get markdown tables
latency_md_table = tabulate(latency_results,
Expand Down
1 change: 0 additions & 1 deletion .buildkite/run-amd-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
Expand Down
44 changes: 3 additions & 41 deletions .buildkite/run-cpu-test-ppc64le.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,49 +4,11 @@
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test

function cpu_tests() {
set -e

# Run basic model test
docker exec cpu-test bash -c "
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# online inference
docker exec cpu-test bash -c "
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
}
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .

# All of CPU tests are expected to be finished less than 25 mins.
export -f cpu_tests
timeout 25m bash -c "cpu_tests"
15 changes: 11 additions & 4 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
- tests/test_lazy_torch_compile.py
commands:
- python3 test_lazy_torch_compile.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
Expand Down Expand Up @@ -228,7 +230,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py
parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
Expand Down Expand Up @@ -473,18 +475,23 @@ steps:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA Long Context (Distributed) # 11min
# This test runs llama 13B, so it is required to run on 4 GPUs.
- label: LoRA TP Test (Distributed)
num_gpus: 4
soft_fail: true
source_file_dependencies:
- vllm/lora
- tests/lora/test_long_context
- tests/lora
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py


- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests"
Expand Down
7 changes: 6 additions & 1 deletion .buildkite/upload-wheels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@ echo "Version: $version"

# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
new_version="1.0.0.dev"
suffix="${version##*.}"
if [[ $suffix == cu* ]]; then
new_version="1.0.0.dev+${suffix}"
else
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ set(VLLM_EXT_SRC
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
Expand Down Expand Up @@ -237,7 +238,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ Easy, fast, and cheap LLM serving for everyone
---

*Latest News* 🔥
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing).
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
Expand Down
1 change: 1 addition & 0 deletions benchmarks/backend_request_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ async def async_request_tgi(
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
# TGI does not accept ignore_eos flag.
}
payload = {
Expand Down
2 changes: 2 additions & 0 deletions csrc/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
int64_t thx, int64_t thy);

torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
#endif

torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
int64_t n);
Expand All @@ -138,6 +139,7 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
int64_t row);

#ifndef USE_ROCM
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);

void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
Expand Down
17 changes: 16 additions & 1 deletion csrc/quantization/gguf/ggml-common.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-common.h
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
#define WARP_SIZE 32
#define WARP_SIZE_GGUF 32
#define K_SCALE_SIZE 12
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#define CUDA_QUANTIZE_BLOCK_SIZE 256
Expand Down Expand Up @@ -1112,4 +1112,19 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#endif
return c;
}

static __device__ __forceinline__ uint32_t __vcmpeq4(const uint32_t a, const uint32_t b) {
uint32_t neq = a^b;
return !(neq & 0xff000000) * 0xff000000 |
!(neq & 0x00ff0000) * 0x00ff0000 |
!(neq & 0x0000ff00) * 0x0000ff00 |
!(neq & 0x000000ff) * 0x000000ff;
}

static __device__ __forceinline__ uint32_t __vsub4(const uint32_t a, const uint32_t b) {
return (static_cast<uint8_t>(((a & 0xff000000) >> 24) - ((b & 0xff000000) >> 24)) << 24) +
(static_cast<uint8_t>(((a & 0x00ff0000) >> 16) - ((b & 0x00ff0000) >> 16)) << 16) +
(static_cast<uint8_t>(((a & 0x0000ff00) >> 8) - ((b & 0x0000ff00) >> 8)) << 8) +
(static_cast<uint8_t>(((a & 0x000000ff) >> 0) - ((b & 0x000000ff) >> 0)) << 0);
}
#endif // defined(USE_ROCM)
6 changes: 4 additions & 2 deletions csrc/quantization/gguf/gguf_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <torch/all.h>
#include <c10/cuda/CUDAGuard.h>

#include "cuda_compat.h"

#include "ggml-common.h"
#include "vecdotq.cuh"
#include "dequantize.cuh"
Expand Down Expand Up @@ -32,8 +34,8 @@ static __global__ void quantize_q8_1(const half* __restrict__ x,

#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
amax = fmaxf(amax, VLLM_SHFL_XOR_SYNC_WIDTH(amax, mask, 32));
sum += VLLM_SHFL_XOR_SYNC_WIDTH(sum, mask, 32);
}

const float d = amax / 127;
Expand Down
Loading

0 comments on commit 436beb2

Please sign in to comment.