From b4e5eb944809abbda01606e00081b1f47dc946bb Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 16 Dec 2024 15:10:02 +0000
Subject: [PATCH] Update docs

Signed-off-by: DarkLight1337
---
 .../serving/openai_compatible_server.md       | 10 ++---
 docs/source/usage/multimodal_inputs.rst       |  4 ++
 ...i_chat_completion_client_for_multimodal.py | 38 ++++++++++---------
 3 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 41ab9ed146627..1bc8d32d2d161 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -34,11 +34,6 @@ We currently support the following OpenAI APIs:
   - *Note: `suffix` parameter is not supported.*
 - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
   - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template).
-  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
-    - *Note: `image_url.detail` parameter is not supported.*
-  - We support two audio content types.
-    - Support `input_audio` content type as defined [here](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
-    - Support `audio_url` content type for audio files. Refer to [here](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py#L51) for the exact schema.
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](#embeddings-api) (`/v1/embeddings`)
   - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`).
@@ -209,6 +204,11 @@ The following extra parameters are supported:
 
 Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details.
 
+We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
+[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
+see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information.
+- *Note: `image_url.detail` parameter is not supported.*
+
 #### Extra parameters
 
 The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index 08a0536646754..417c86aad9dcf 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -376,6 +376,10 @@ Then, you can use the OpenAI client as follows:
     result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from input audio:", result)
 
+Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input:
+
+.. code-block:: python
+
     chat_completion_from_url = client.chat.completions.create(
         messages=[{
             "role": "user",
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index eea429d86d917..6a160fd70423f 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -153,11 +153,11 @@ def run_multi_image() -> None:
 
 # Audio input inference
 def run_audio() -> None:
-    # Any format supported by librosa is supported
     audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
 
-    # Use audio url in the payload
-    chat_completion_from_url = client.chat.completions.create(
+    # OpenAI-compatible schema (`input_audio`)
+    chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role":
             "user",
@@ -167,9 +167,11 @@ def run_audio() -> None:
                     "text": "What's in this audio?"
                 },
                 {
-                    "type": "audio_url",
-                    "audio_url": {
-                        "url": audio_url
+                    "type": "input_audio",
+                    "input_audio": {
+                        # Any format supported by librosa is supported
+                        "data": audio_base64,
+                        "format": "wav"
                     },
                 },
             ],
@@ -178,11 +180,11 @@ def run_audio() -> None:
         max_completion_tokens=64,
     )
 
-    result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from audio url:", result)
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
 
-    audio_base64 = encode_base64_content_from_url(audio_url)
-    chat_completion_from_base64 = client.chat.completions.create(
+    # HTTP URL
+    chat_completion_from_url = client.chat.completions.create(
         messages=[{
             "role":
             "user",
@@ -195,7 +197,7 @@ def run_audio() -> None:
                     "type": "audio_url",
                     "audio_url": {
                         # Any format supported by librosa is supported
-                        "url": f"data:audio/ogg;base64,{audio_base64}"
+                        "url": audio_url
                     },
                 },
             ],
@@ -204,9 +206,10 @@ def run_audio() -> None:
         max_completion_tokens=64,
     )
 
-    result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from base64 encoded audio:", result)
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from audio url:", result)
 
+    # base64 URL
    chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role":
@@ -217,11 +220,10 @@ def run_audio() -> None:
                     "text": "What's in this audio?"
                 },
                 {
-                    "type": "input_audio",
-                    "input_audio": {
+                    "type": "audio_url",
+                    "audio_url": {
                         # Any format supported by librosa is supported
-                        "data": audio_base64,
-                        "format": "wav"
+                        "url": f"data:audio/ogg;base64,{audio_base64}"
                     },
                 },
             ],
@@ -231,7 +233,7 @@ def run_audio() -> None:
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from input audio:", result)
+    print("Chat completion output from base64 encoded audio:", result)
 
 
 example_function_map = {
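
Note for reviewers: `encode_base64_content_from_url`, which `run_audio` now calls up front, is defined earlier in `examples/openai_chat_completion_client_for_multimodal.py` and is untouched by this patch, so it does not appear in the diff. The sketch below illustrates the pattern it implements (fetch the bytes over HTTP, then base64-encode them); the body is illustrative and assumes the `requests` library, not copied from the repo.

import base64

import requests


def encode_base64_content_from_url(content_url: str) -> str:
    """Encode content retrieved from a remote URL to base64 format.

    Sketch only: the real helper lives in the example file and is
    unchanged by this patch.
    """
    with requests.get(content_url) as response:
        response.raise_for_status()
        # The raw bytes are base64-encoded so they can be sent either as
        # `input_audio.data` or embedded in a `data:audio/...;base64,` URL.
        return base64.b64encode(response.content).decode("utf-8")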