From 85cb4684a49a775a5e086ffc74bfb17a9d6ac919 Mon Sep 17 00:00:00 2001 From: SameepPanigrahi Date: Thu, 22 Feb 2024 16:00:52 +0530 Subject: [PATCH 1/6] Refactor common.py file --- src/llmperf/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llmperf/common.py b/src/llmperf/common.py index 3efefa1..621c843 100644 --- a/src/llmperf/common.py +++ b/src/llmperf/common.py @@ -5,6 +5,7 @@ ) from llmperf.ray_clients.sagemaker_client import SageMakerClient from llmperf.ray_clients.vertexai_client import VertexAIClient +from llmperf.ray_clients.azureai_chat_completion import AzureAIChatCompletionsClient from llmperf.ray_llm_client import LLMClient @@ -28,6 +29,8 @@ def construct_clients(llm_api: str, num_clients: int) -> List[LLMClient]: clients = [SageMakerClient.remote() for _ in range(num_clients)] elif llm_api == "vertexai": clients = [VertexAIClient.remote() for _ in range(num_clients)] + elif llm_api == "azureai": + clients = [AzureAIChatCompletionsClient.remote() for _ in range(num_clients)] elif llm_api in SUPPORTED_APIS: clients = [LiteLLMClient.remote() for _ in range(num_clients)] else: From 3b6ada1cc2995bc4e66ff76ee42ed1d4f944d73e Mon Sep 17 00:00:00 2001 From: SameepPanigrahi Date: Thu, 22 Feb 2024 16:01:27 +0530 Subject: [PATCH 2/6] Create AzureAI client file for the Azure deployed API --- .../ray_clients/azureai_chat_completion.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 src/llmperf/ray_clients/azureai_chat_completion.py diff --git a/src/llmperf/ray_clients/azureai_chat_completion.py b/src/llmperf/ray_clients/azureai_chat_completion.py new file mode 100644 index 0000000..49dd151 --- /dev/null +++ b/src/llmperf/ray_clients/azureai_chat_completion.py @@ -0,0 +1,119 @@ +import json +import os +import time +from typing import Any, Dict + +import ray +import requests + +from llmperf.ray_llm_client import LLMClient +from llmperf.models import RequestConfig +from llmperf import common_metrics + +@ray.remote 
+class AzureAIChatCompletionsClient(LLMClient): + """Client for AzureAI Chat Completions API.""" + + def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: + prompt = request_config.prompt + prompt, prompt_len = prompt + + message = [ + {"role": "system", "content": ""}, + {"role": "user", "content": prompt}, + ] + model = request_config.model + body = { + "model": model, + "messages": message, + "stream": True, + } + sampling_params = request_config.sampling_params + body.update(sampling_params or {}) + time_to_next_token = [] + tokens_received = 0 + ttft = 0 + error_response_code = -1 + generated_text = "" + error_msg = "" + output_throughput = 0 + total_request_time = 0 + + metrics = {} + + metrics[common_metrics.ERROR_CODE] = None + metrics[common_metrics.ERROR_MSG] = "" + + start_time = time.monotonic() + most_recent_received_token_time = time.monotonic() + address = os.environ.get("AZUREAI_API_BASE") + if not address: + raise ValueError("the environment variable OPENAI_API_BASE must be set.") + key = os.environ.get("AZUREAI_API_KEY") + if not key: + raise ValueError("the environment variable OPENAI_API_KEY must be set.") + headers = {"Authorization": f"Bearer {key}"} + if not address: + raise ValueError("No host provided.") + if not address.endswith("/"): + address = address + "/" + address += "chat/completions" + try: + with requests.post( + address, + json=body, + stream=True, + timeout=180, + headers=headers, + ) as response: + if response.status_code != 200: + error_msg = response.text + error_response_code = response.status_code + response.raise_for_status() + for chunk in response.iter_lines(chunk_size=None): + chunk = chunk.strip() + + if not chunk: + continue + stem = "data: " + chunk = chunk[len(stem) :] + if chunk == b"[DONE]": + continue + tokens_received += 1 + data = json.loads(chunk) + + if "error" in data: + error_msg = data["error"]["message"] + error_response_code = data["error"]["code"] + raise 
RuntimeError(data["error"]["message"]) + + delta = data["choices"][0]["delta"] + if delta.get("content", None): + if not ttft: + ttft = time.monotonic() - start_time + time_to_next_token.append(ttft) + else: + time_to_next_token.append( + time.monotonic() - most_recent_received_token_time + ) + most_recent_received_token_time = time.monotonic() + generated_text += delta["content"] + + total_request_time = time.monotonic() - start_time + output_throughput = tokens_received / total_request_time + + except Exception as e: + metrics[common_metrics.ERROR_MSG] = error_msg + metrics[common_metrics.ERROR_CODE] = error_response_code + print(f"Warning Or Error: {e}") + print(error_response_code) + + metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) #This should be same as metrics[common_metrics.E2E_LAT]. Leave it here for now + metrics[common_metrics.TTFT] = ttft + metrics[common_metrics.E2E_LAT] = total_request_time + metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput + metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len + metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received + metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len + + return metrics, generated_text, request_config From 06a08331a21f8c5c346ce6172de7290c438c124f Mon Sep 17 00:00:00 2001 From: SameepPanigrahi Date: Thu, 22 Feb 2024 16:01:38 +0530 Subject: [PATCH 3/6] Add readme file --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 066f8ac..13e3efc 100644 --- a/README.md +++ b/README.md @@ -338,6 +338,27 @@ python llm_correctness.py \ ``` + +### AzureAI Compatible APIs +```bash +export AZUREAI_API_KEY=secret_abcdefg +export AZUREAI_API_BASE="https://api.endpoints.anyscale.com/v1" + +python token_benchmark_ray.py \ +--model "Llama-2-70b-chat" \ +--mean-input-tokens 550 \ +--stddev-input-tokens 150 \ +--mean-output-tokens 150 \ +--stddev-output-tokens 10 \ +--max-num-completed-requests 2 \ 
+--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ +--llm-api azureai \ +--additional-sampling-params '{}' + +``` + ## Saving Results The results of the load test and correctness test are saved in the results directory specified by the `--results-dir` argument. The results are saved in 2 files, one with the summary metrics of the test, and one with metrics from each individual request that is returned. From 9ed5695541afca30a1dda88f55426a18e0c0c8fe Mon Sep 17 00:00:00 2001 From: SameepPanigrahi Date: Fri, 23 Feb 2024 12:13:03 +0530 Subject: [PATCH 4/6] Change the readme.md file --- README.md | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 13e3efc..4903d91 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,28 @@ python llm_correctness.py \ ``` + +### AzureAI Compatible APIs +```bash +export AZUREAI_API_KEY=secret_abcdefg +export AZUREAI_API_BASE="https://api.endpoints.anyscale.com/v1" + +python token_benchmark_ray.py \ +--model "Llama-2-70b-chat" \ +--mean-input-tokens 550 \ +--stddev-input-tokens 150 \ +--mean-output-tokens 150 \ +--stddev-output-tokens 10 \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ +--llm-api azureai \ +--additional-sampling-params '{}' + +``` + + see `python token_benchmark_ray.py --help` for more details on the arguments. 
## Correctness Test @@ -339,26 +361,6 @@ python llm_correctness.py \ ``` -### AzureAI Compatible APIs -```bash -export AZUREAI_API_KEY=secret_abcdefg -export AZUREAI_API_BASE="https://api.endpoints.anyscale.com/v1" - -python token_benchmark_ray.py \ ---model "Llama-2-70b-chat" \ ---mean-input-tokens 550 \ ---stddev-input-tokens 150 \ ---mean-output-tokens 150 \ ---stddev-output-tokens 10 \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ ---llm-api azureai \ ---additional-sampling-params '{}' - -``` - ## Saving Results The results of the load test and correctness test are saved in the results directory specified by the `--results-dir` argument. The results are saved in 2 files, one with the summary metrics of the test, and one with metrics from each individual request that is returned. From 036ff0a9820a55cf22214758bb3fdd06592fe0c3 Mon Sep 17 00:00:00 2001 From: VindyaKonjarla <157361884+VindyaKonjarla@users.noreply.github.com> Date: Fri, 8 Mar 2024 18:55:21 +0530 Subject: [PATCH 5/6] Update README.md Signed-off-by: VindyaKonjarla <157361884+VindyaKonjarla@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4903d91..80d0c1d 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ python llm_correctness.py \ ### AzureAI Compatible APIs ```bash export AZUREAI_API_KEY=secret_abcdefg -export AZUREAI_API_BASE="https://api.endpoints.anyscale.com/v1" +export AZUREAI_API_BASE="https://api.endpoints.ai.azure.com/v1" python token_benchmark_ray.py \ --model "Llama-2-70b-chat" \ From 1ba6d22844971e2ad7fbdc2fc71593211ca1d98f Mon Sep 17 00:00:00 2001 From: Sameep Kumar Panigrahi <59465094+SameepPanigrahi@users.noreply.github.com> Date: Thu, 14 Mar 2024 11:20:00 +0530 Subject: [PATCH 6/6] Update azureai_chat_completion.py Signed-off-by: Sameep Kumar Panigrahi <59465094+SameepPanigrahi@users.noreply.github.com> --- 
src/llmperf/ray_clients/azureai_chat_completion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmperf/ray_clients/azureai_chat_completion.py b/src/llmperf/ray_clients/azureai_chat_completion.py index 49dd151..04e5d2e 100644 --- a/src/llmperf/ray_clients/azureai_chat_completion.py +++ b/src/llmperf/ray_clients/azureai_chat_completion.py @@ -48,10 +48,10 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: most_recent_received_token_time = time.monotonic() address = os.environ.get("AZUREAI_API_BASE") if not address: - raise ValueError("the environment variable OPENAI_API_BASE must be set.") + raise ValueError("the environment variable AZUREAI_API_BASE must be set.") key = os.environ.get("AZUREAI_API_KEY") if not key: - raise ValueError("the environment variable OPENAI_API_KEY must be set.") + raise ValueError("the environment variable AZUREAI_API_KEY must be set.") headers = {"Authorization": f"Bearer {key}"} if not address: raise ValueError("No host provided.")