Merge branch 'vllm-project:main' into punica-kernel-fusion
jeejeelee authored Dec 23, 2024
2 parents b3ea6fc + 048fc57 commit eb01089
Showing 14 changed files with 99 additions and 25 deletions.
24 changes: 24 additions & 0 deletions .buildkite/generate_index.py
@@ -0,0 +1,24 @@
import argparse
import os

template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1/>
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
</body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}")
# cloudfront requires escaping the '+' character
f.write(
template.format(wheel=filename,
wheel_html_escaped=filename.replace("+", "%2B")))
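As a side note on the escaping above, here is a minimal sketch (with a hypothetical wheel filename) of the anchor this template produces: the '+' in a local version tag becomes %2B in the href, which is what the CloudFront comment refers to.

```python
# Minimal sketch of the index entry generate_index.py writes.
# The wheel filename below is hypothetical.
filename = "vllm-1.0.0.dev+cu118-cp38-abi3-manylinux1_x86_64.whl"

wheel_html_escaped = filename.replace("+", "%2B")  # CloudFront needs '+' escaped
print(f'<a href="../{wheel_html_escaped}">{filename}</a><br/>')
# <a href="../vllm-1.0.0.dev%2Bcu118-cp38-abi3-manylinux1_x86_64.whl">vllm-1.0.0.dev+cu118-cp38-abi3-manylinux1_x86_64.whl</a><br/>
```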
6 changes: 3 additions & 3 deletions .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -65,9 +65,9 @@ steps:
- VLLM_USAGE_SOURCE
- HF_TOKEN

- block: "Run H100 Benchmark"
key: block-h100
depends_on: ~
#- block: "Run H100 Benchmark"
#key: block-h100
#depends_on: ~

- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
30 changes: 29 additions & 1 deletion .buildkite/upload-wheels.sh
@@ -23,6 +23,8 @@ wheel="$new_wheel"
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"

normal_wheel="$wheel" # Save the original wheel filename

# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
suffix="${version##*.}"
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
mv -- "$wheel" "$new_wheel"
# use cp to keep both files in the artifacts directory
cp -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi

# Upload the wheel to S3
python3 .buildkite/generate_index.py --wheel "$normal_wheel"

# generate index for this commit
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi

# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
3 changes: 3 additions & 0 deletions tests/entrypoints/openai/test_audio.py
@@ -74,6 +74,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
messages=messages,
max_completion_tokens=10,
logprobs=True,
temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1

@@ -130,6 +131,7 @@ async def test_single_chat_session_audio_base64encoded(
messages=messages,
max_completion_tokens=10,
logprobs=True,
temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1

@@ -150,6 +152,7 @@ async def test_single_chat_session_audio_base64encoded(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
3 changes: 3 additions & 0 deletions tests/entrypoints/openai/test_video.py
@@ -82,6 +82,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
messages=messages,
max_completion_tokens=10,
logprobs=True,
temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1

@@ -174,6 +175,7 @@ async def test_single_chat_session_video_base64encoded(
messages=messages,
max_completion_tokens=10,
logprobs=True,
temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1

@@ -194,6 +196,7 @@ async def test_single_chat_session_video_base64encoded(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
3 changes: 3 additions & 0 deletions tests/entrypoints/openai/test_vision.py
@@ -83,6 +83,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
messages=messages,
max_completion_tokens=10,
logprobs=True,
temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1

@@ -175,6 +176,7 @@ async def test_single_chat_session_image_base64encoded(
messages=messages,
max_completion_tokens=10,
logprobs=True,
temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1

@@ -195,6 +197,7 @@ async def test_single_chat_session_image_base64encoded(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
4 changes: 3 additions & 1 deletion tests/lora/test_mixtral.py
@@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):


@pytest.mark.parametrize("tp_size", [4])
@pytest.mark.parametrize("fully_shard", [True, False])
def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
tp_size):
tp_size, fully_shard):
"""This LoRA model has all supported Mixtral target modules"""

if torch.cuda.device_count() < tp_size:
@@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
max_loras=4,
distributed_executor_backend="ray",
tensor_parallel_size=tp_size,
fully_sharded_loras=fully_shard,
max_lora_rank=32,
)

7 changes: 7 additions & 0 deletions vllm/engine/async_llm_engine.py
@@ -1256,3 +1256,10 @@ async def stop_profile(self) -> None:
self.engine.model_executor.stop_profile()
else:
self.engine.model_executor._run_workers("stop_profile")


# TODO(v1): Remove this class proxy when V1 goes default.
if envs.VLLM_USE_V1:
from vllm.v1.engine.async_llm import AsyncLLM

AsyncLLMEngine = AsyncLLM # type: ignore
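With this proxy in place, V0 call sites can keep importing AsyncLLMEngine and transparently get the V1 engine when VLLM_USE_V1 is enabled. A minimal sketch, assuming vLLM is installed and the environment variable is set before any vllm module is imported:

```python
# Sketch only: the V0 name resolves to the V1 class when VLLM_USE_V1 is set.
import os
os.environ["VLLM_USE_V1"] = "1"  # must be set before vllm modules are imported

from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.v1.engine.async_llm import AsyncLLM

assert AsyncLLMEngine is AsyncLLM
```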
6 changes: 1 addition & 5 deletions vllm/entrypoints/openai/api_server.py
@@ -27,6 +27,7 @@
import vllm.envs as envs
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore
from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.engine.multiprocessing.engine import run_mp_engine
from vllm.engine.protocol import EngineClient
@@ -66,11 +67,6 @@
is_valid_ipv6_address)
from vllm.version import __version__ as VLLM_VERSION

if envs.VLLM_USE_V1:
from vllm.v1.engine.async_llm import AsyncLLMEngine # type: ignore
else:
from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore

TIMEOUT_KEEP_ALIVE = 5 # seconds

prometheus_multiproc_dir: tempfile.TemporaryDirectory
3 changes: 2 additions & 1 deletion vllm/lora/layers.py
@@ -425,8 +425,9 @@ def forward(self, input_):
if self.base_layer.skip_bias_add else None)
return output, output_bias

# ReplicatedLinear should always be replaced, regardless of the fully
# sharded LoRAs setting, because it is, by definition, copied per GPU.
@classmethod
@_not_fully_sharded_can_replace
def can_replace_layer(
cls,
source_layer: nn.Module,
2 changes: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ def should_ignore_layer(layer_name: Optional[str],
# in the safetensors checkpoint. So, we convert the name
# from the fused version to unfused + check to make sure that
# each shard of the fused layer has the same scheme.
if proj_name in FUSED_LAYER_NAME_MAPPING:
if proj_name in FUSED_LAYER_NAME_MAPPING and layer_name not in ignore:
shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]

# Convert fused_name --> [shard_names]
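The added `layer_name not in ignore` clause means a fused layer that is itself listed in the ignore list is no longer expanded into its shards first. A hypothetical sketch of the condition (the mapping shown is illustrative, not the module's actual table):

```python
# Hypothetical sketch of the updated condition in should_ignore_layer.
FUSED_LAYER_NAME_MAPPING = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}  # illustrative

def expands_to_shards(proj_name: str, layer_name: str, ignore: list) -> bool:
    # Only expand fused names that are not already ignored verbatim.
    return proj_name in FUSED_LAYER_NAME_MAPPING and layer_name not in ignore

ignore = ["model.layers.0.self_attn.qkv_proj"]
print(expands_to_shards("qkv_proj", "model.layers.0.self_attn.qkv_proj", ignore))  # False
print(expands_to_shards("qkv_proj", "model.layers.1.self_attn.qkv_proj", ignore))  # True
```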
23 changes: 15 additions & 8 deletions vllm/model_executor/models/pixtral.py
@@ -45,8 +45,12 @@
except ImportError:
USE_XFORMERS_OPS = False

PIXTRAL_IMAGE_BREAK_ID = 12
PIXTRAL_IMAGE_END_ID = 13
# These token ids cannot be retrieved from model config
# so we hardcode them here.
PIXTRAL_12B_IMAGE_BREAK_ID = 12
PIXTRAL_12B_IMAGE_END_ID = 13
PIXTRAL_LARGE_IMAGE_BREAK_ID = 14
PIXTRAL_LARGE_IMAGE_END_ID = 15


def get_max_pixtral_image_tokens(ctx: InputContext):
@@ -118,8 +122,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
for image_data in data_list:
image = ImageChunk(image=image_data)
encoding = tokenizer.instruct.mm_encoder(image)
image = torch.from_numpy(encoding.image).to(device="cuda",
dtype=torch.float16)
image = torch.from_numpy(encoding.image).to(dtype=torch.float16)
images.append(image)
image_tokens_list.append(encoding.tokens)

@@ -237,8 +240,9 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:

# NOTE: Image embeddings are split into separate tensors for each image
# by the indices of `[IMG_END]` token.
split_indices = torch.where(
image_tokens == PIXTRAL_IMAGE_END_ID)[0] + 1
image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | (
image_tokens == PIXTRAL_LARGE_IMAGE_END_ID)
split_indices = torch.where(image_end_condition)[0] + 1
if len(split_indices) <= 1:
# Do not split, return as tensor of shape [1, fs, hs]
return image_embeds.unsqueeze(0)
@@ -260,8 +264,11 @@ def get_input_embeddings(
if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings, [
self.vision_args.image_token_id, PIXTRAL_IMAGE_END_ID,
PIXTRAL_IMAGE_BREAK_ID
self.vision_args.image_token_id,
PIXTRAL_12B_IMAGE_END_ID,
PIXTRAL_12B_IMAGE_BREAK_ID,
PIXTRAL_LARGE_IMAGE_BREAK_ID,
PIXTRAL_LARGE_IMAGE_END_ID,
])
return inputs_embeds

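For the split-by-[IMG_END] logic above, here is a toy sketch (synthetic token ids and embeddings, not the real model flow) showing how the OR condition yields one chunk per image for both the 12B and Large end-token ids:

```python
import torch

PIXTRAL_12B_IMAGE_END_ID = 13
PIXTRAL_LARGE_IMAGE_END_ID = 15

# Two images: three tokens ending in id 13, then four tokens ending in id 15.
image_tokens = torch.tensor([10, 10, 13, 10, 10, 10, 15])
image_embeds = torch.randn(len(image_tokens), 4)  # [total_tokens, hidden_size]

image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | (
    image_tokens == PIXTRAL_LARGE_IMAGE_END_ID)
split_indices = torch.where(image_end_condition)[0] + 1  # tensor([3, 7])

# Cut right after each [IMG_END]; drop the empty tail chunk that appears when
# the sequence ends exactly on an end token.
chunks = [c for c in torch.tensor_split(image_embeds, split_indices.tolist()) if len(c)]
print([tuple(c.shape) for c in chunks])  # [(3, 4), (4, 4)]
```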
6 changes: 1 addition & 5 deletions vllm/v1/engine/async_llm.py
@@ -98,7 +98,7 @@ def from_engine_args(
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
) -> "AsyncLLMEngine":
) -> "AsyncLLM":
"""Create an AsyncLLM from the EngineArgs."""

# Create the engine configs.
@@ -386,7 +386,3 @@ def errored(self) -> bool:
@property
def dead_error(self) -> BaseException:
return Exception() # TODO: implement


# Retain V0 name for backwards compatibility.
AsyncLLMEngine = AsyncLLM
4 changes: 4 additions & 0 deletions vllm/v1/engine/mm_input_mapper.py
@@ -180,6 +180,10 @@ def hash_prompt_mm_data(self, prompt: PromptType) -> Optional[List[str]]:
return None

mm_data = prompt["multi_modal_data"]
if not mm_data:
# mm_data can be None or an empty dict.
return None

image_inputs = mm_data["image"]

return self.hash_images(image_inputs)
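A quick sketch of why the new guard covers both cases the comment mentions: `not mm_data` is truthiness-based, so None and an empty dict both short-circuit to returning no hashes.

```python
# Truthiness check behind `if not mm_data: return None`.
for mm_data in (None, {}, {"image": ["<image placeholder>"]}):
    print(repr(mm_data), "->", not mm_data)
# None -> True
# {} -> True
# {'image': ['<image placeholder>']} -> False
```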
