diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 41ab9ed146627..1bc8d32d2d161 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -34,11 +34,6 @@ We currently support the following OpenAI APIs: - *Note: `suffix` parameter is not supported.* - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template). - - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst). - - *Note: `image_url.detail` parameter is not supported.* - - We support two audio content types. - - Support `input_audio` content type as defined [here](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in). - - Support `audio_url` content type for audio files. Refer to [here](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py#L51) for the exact schema. - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](#embeddings-api) (`/v1/embeddings`) - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`). @@ -209,6 +204,11 @@ The following extra parameters are supported: Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details. +We support both [Vision](https://platform.openai.com/docs/guides/vision)- and +[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; +see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information. +- *Note: `image_url.detail` parameter is not supported.* + #### Extra parameters The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst index 08a0536646754..417c86aad9dcf 100644 --- a/docs/source/usage/multimodal_inputs.rst +++ b/docs/source/usage/multimodal_inputs.rst @@ -376,6 +376,10 @@ Then, you can use the OpenAI client as follows: result = chat_completion_from_base64.choices[0].message.content print("Chat completion output from input audio:", result) +Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input: + +.. code-block:: python + chat_completion_from_url = client.chat.completions.create( messages=[{ "role": "user", diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py index eea429d86d917..6a160fd70423f 100644 --- a/examples/openai_chat_completion_client_for_multimodal.py +++ b/examples/openai_chat_completion_client_for_multimodal.py @@ -153,11 +153,11 @@ def run_multi_image() -> None: # Audio input inference def run_audio() -> None: - # Any format supported by librosa is supported audio_url = AudioAsset("winning_call").url + audio_base64 = encode_base64_content_from_url(audio_url) - # Use audio url in the payload - chat_completion_from_url = client.chat.completions.create( + # OpenAI-compatible schema (`input_audio`) + chat_completion_from_base64 = client.chat.completions.create( messages=[{ "role": "user", @@ -167,9 +167,11 @@ def run_audio() -> None: "text": "What's in this audio?" }, { - "type": "audio_url", - "audio_url": { - "url": audio_url + "type": "input_audio", + "input_audio": { + # Any format supported by librosa is supported + "data": audio_base64, + "format": "wav" }, }, ], @@ -178,11 +180,11 @@ def run_audio() -> None: max_completion_tokens=64, ) - result = chat_completion_from_url.choices[0].message.content - print("Chat completion output from audio url:", result) + result = chat_completion_from_base64.choices[0].message.content + print("Chat completion output from input audio:", result) - audio_base64 = encode_base64_content_from_url(audio_url) - chat_completion_from_base64 = client.chat.completions.create( + # HTTP URL + chat_completion_from_url = client.chat.completions.create( messages=[{ "role": "user", @@ -195,7 +197,7 @@ def run_audio() -> None: "type": "audio_url", "audio_url": { # Any format supported by librosa is supported - "url": f"data:audio/ogg;base64,{audio_base64}" + "url": audio_url }, }, ], @@ -204,9 +206,10 @@ def run_audio() -> None: max_completion_tokens=64, ) - result = chat_completion_from_base64.choices[0].message.content - print("Chat completion output from base64 encoded audio:", result) + result = chat_completion_from_url.choices[0].message.content + print("Chat completion output from audio url:", result) + # base64 URL chat_completion_from_base64 = client.chat.completions.create( messages=[{ "role": @@ -217,11 +220,10 @@ def run_audio() -> None: "text": "What's in this audio?" }, { - "type": "input_audio", - "input_audio": { + "type": "audio_url", + "audio_url": { # Any format supported by librosa is supported - "data": audio_base64, - "format": "wav" + "url": f"data:audio/ogg;base64,{audio_base64}" }, }, ], @@ -231,7 +233,7 @@ def run_audio() -> None: ) result = chat_completion_from_base64.choices[0].message.content - print("Chat completion output from input audio:", result) + print("Chat completion output from base64 encoded audio:", result) example_function_map = {