From d507f5caa920bda1a8895cca4c8be9256ae415b1 Mon Sep 17 00:00:00 2001 From: philschmid Date: Thu, 28 Mar 2024 12:30:14 +0000 Subject: [PATCH 01/18] add hugging face client --- README.md | 27 ++-- src/llmperf/common.py | 3 + src/llmperf/ray_clients/huggingface_client.py | 134 ++++++++++++++++++ 3 files changed, 154 insertions(+), 10 deletions(-) create mode 100644 src/llmperf/ray_clients/huggingface_client.py diff --git a/README.md b/README.md index 5abc91d..08f67e5 100644 --- a/README.md +++ b/README.md @@ -99,11 +99,15 @@ python token_benchmark_ray.py \ ### Hugging Face ```bash -export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" -export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT" +export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" # only for IE and API +# local testing "http://localhost:8000" +# serverless hosted models "https://api-inference.huggingface.co" +# Inference endpoints, e.g. "https://ptrlmejh4tjmcb4t.us-east-1.aws.endpoints.huggingface.cloud" +export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_URL" +export MODEL_ID="meta-llama/Llama-2-7b-chat-hf" python token_benchmark_ray.py \ ---model "huggingface/meta-llama/Llama-2-7b-chat-hf" \ +--model $MODEL_ID \ --mean-input-tokens 550 \ --stddev-input-tokens 150 \ --mean-output-tokens 150 \ @@ -112,9 +116,8 @@ python token_benchmark_ray.py \ --timeout 600 \ --num-concurrent-requests 1 \ --results-dir "result_outputs" \ ---llm-api "litellm" \ +--llm-api huggingface \ --additional-sampling-params '{}' - ``` ### LiteLLM @@ -255,16 +258,20 @@ python llm_correctness.py \ ### Hugging Face ```bash -export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" -export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT" +export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" # only for IE and API +# local testing "http://localhost:8000" +# serverless hosted models "https://api-inference.huggingface.co" +# Inference endpoints, e.g. 
"https://ptrlmejh4tjmcb4t.us-east-1.aws.endpoints.huggingface.cloud" +export HUGGINGFACE_API_BASE="http://localhost:8080" +export MODEL_ID="HuggingFaceH4/zephyr-7b-beta" python llm_correctness.py \ ---model "huggingface/meta-llama/Llama-2-7b-chat-hf" \ ---llm-api "litellm" \ +--model $MODEL_ID \ +--llm-api huggingface \ --max-num-completed-requests 2 \ --timeout 600 \ --num-concurrent-requests 1 \ ---results-dir "result_outputs" \ +--results-dir "result_outputs" ``` diff --git a/src/llmperf/common.py b/src/llmperf/common.py index 3efefa1..f29dad4 100644 --- a/src/llmperf/common.py +++ b/src/llmperf/common.py @@ -1,4 +1,5 @@ from typing import List +from llmperf.ray_clients.huggingface_client import HuggingFaceTgiClient from llmperf.ray_clients.litellm_client import LiteLLMClient from llmperf.ray_clients.openai_chat_completions_client import ( OpenAIChatCompletionsClient, @@ -30,6 +31,8 @@ def construct_clients(llm_api: str, num_clients: int) -> List[LLMClient]: clients = [VertexAIClient.remote() for _ in range(num_clients)] elif llm_api in SUPPORTED_APIS: clients = [LiteLLMClient.remote() for _ in range(num_clients)] + elif llm_api == "huggingface": + clients = [HuggingFaceTgiClient.remote() for _ in range(num_clients)] else: raise ValueError( f"llm_api must be one of the supported LLM APIs: {SUPPORTED_APIS}" diff --git a/src/llmperf/ray_clients/huggingface_client.py b/src/llmperf/ray_clients/huggingface_client.py new file mode 100644 index 0000000..39dab71 --- /dev/null +++ b/src/llmperf/ray_clients/huggingface_client.py @@ -0,0 +1,134 @@ +import json +import os +import time +from typing import Any, Dict + +import ray +import requests + +from llmperf.ray_llm_client import LLMClient +from llmperf.models import RequestConfig +from llmperf import common_metrics +from transformers import AutoTokenizer + + +@ray.remote +class HuggingFaceTgiClient(LLMClient): + """Client for Hugging Face TGI""" + + def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: 
+ prompt = request_config.prompt + prompt, prompt_len = prompt + + tokenizer = AutoTokenizer.from_pretrained(request_config.model) + # try to apply chat template with system message if error retry without system message + try: + prompt = tokenizer.apply_chat_template( + [ + {"role": "system", "content": ""}, + {"role": "user", "content": prompt}, + ], + tokenize=False, + add_generation_prompt=True, + ) + except: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + tokenize=False, + add_generation_prompt=True, + ) + + # update prompt_len to match include special tokens + prompt_len = len(tokenizer(prompt).input_ids) + + sampling_params = request_config.sampling_params + + if "max_tokens" in sampling_params: + sampling_params["max_new_tokens"] = sampling_params["max_tokens"] + del sampling_params["max_tokens"] + + body = { + "inputs": prompt, + "parameters": { + **request_config.sampling_params, + }, + "stream": True, + } + address = os.environ.get("HUGGINGFACE_API_BASE", "https://api-inference.huggingface.co") + # Adds the model name to the address if it is not "local" or "inference endpoint" + if address == "https://api-inference.huggingface.co": + address = f"{address}/models/{request_config.model}" + headers = { + "Authorization": f"Bearer {os.environ.get('HUGGINGFACE_API_TOKEN', '')}" + } + + time_to_next_token = [] + tokens_received = 0 + ttft = None + error_response_code = None + generated_text = "" + error_msg = "" + output_throughput = 0 + total_request_time = 0 + metrics = {} + + metrics[common_metrics.ERROR_CODE] = None + metrics[common_metrics.ERROR_MSG] = "" + + start_time = time.monotonic() + most_recent_received_token_time = time.monotonic() + try: + with requests.post( + address, + json=body, + stream=True, + timeout=180, + headers=headers, + ) as response: + if response.status_code != 200: + error_msg = response.text + error_response_code = response.status_code + response.raise_for_status() + # ADAPTED FROM: 
https://github.com/huggingface/text-generation-inference/blob/6c4496a1a30f119cebd3afbfedd847039325dbc9/clients/python/text_generation/client.py#L767 + for byte_payload in response.iter_lines(): + # Skip line + if byte_payload == b"\n": + continue + payload = byte_payload.decode("utf-8") + # Event data + if payload.startswith("data:"): + # Decode payload + tokens_received += 1 + chunk = json.loads(payload.lstrip("data:").rstrip("/n")) + + if chunk.get("token", None): + if not ttft: + ttft = time.monotonic() - start_time + time_to_next_token.append(ttft) + else: + time_to_next_token.append( + time.monotonic() - most_recent_received_token_time + ) + most_recent_received_token_time = time.monotonic() + generated_text += chunk["token"]["text"] + + total_request_time = time.monotonic() - start_time + output_throughput = tokens_received / total_request_time + + except Exception as e: + metrics[common_metrics.ERROR_MSG] = error_msg + metrics[common_metrics.ERROR_CODE] = error_response_code + print(f"Warning Or Error: {e}") + print(error_response_code) + + metrics[common_metrics.INTER_TOKEN_LAT] = sum( + time_to_next_token + ) # This should be same as metrics[common_metrics.E2E_LAT]. 
Leave it here for now + metrics[common_metrics.TTFT] = ttft + metrics[common_metrics.E2E_LAT] = total_request_time + metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput + metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len + metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received + metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len + + return metrics, generated_text, request_config From febbcb1e22c89dc366f0daaf6f84d1af758b2a9a Mon Sep 17 00:00:00 2001 From: philschmid Date: Thu, 28 Mar 2024 12:34:50 +0000 Subject: [PATCH 02/18] add token for gated and private models --- README.md | 2 +- src/llmperf/ray_clients/huggingface_client.py | 23 ++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 08f67e5..42be4d4 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,7 @@ export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" # only for IE and API # local testing "http://localhost:8000" # serverless hosted models "https://api-inference.huggingface.co" # Inference endpoints, e.g. 
"https://ptrlmejh4tjmcb4t.us-east-1.aws.endpoints.huggingface.cloud" -export HUGGINGFACE_API_BASE="http://localhost:8080" +export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_URL" export MODEL_ID="HuggingFaceH4/zephyr-7b-beta" python llm_correctness.py \ diff --git a/src/llmperf/ray_clients/huggingface_client.py b/src/llmperf/ray_clients/huggingface_client.py index 39dab71..822f724 100644 --- a/src/llmperf/ray_clients/huggingface_client.py +++ b/src/llmperf/ray_clients/huggingface_client.py @@ -17,10 +17,24 @@ class HuggingFaceTgiClient(LLMClient): """Client for Hugging Face TGI""" def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: + address = os.environ.get( + "HUGGINGFACE_API_BASE", "https://api-inference.huggingface.co" + ) + token = os.environ.get("HUGGINGFACE_API_TOKEN", "") + + # Adds the model name to the address if it is not "local" or "inference endpoint" + if address == "https://api-inference.huggingface.co": + address = f"{address}/models/{request_config.model}" + headers = { + "Authorization": f"Bearer {os.environ.get('HUGGINGFACE_API_TOKEN', '')}" + } + prompt = request_config.prompt prompt, prompt_len = prompt - tokenizer = AutoTokenizer.from_pretrained(request_config.model) + tokenizer = AutoTokenizer.from_pretrained( + request_config.model, use_auth_token=token if token else None + ) # try to apply chat template with system message if error retry without system message try: prompt = tokenizer.apply_chat_template( @@ -54,13 +68,6 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: }, "stream": True, } - address = os.environ.get("HUGGINGFACE_API_BASE", "https://api-inference.huggingface.co") - # Adds the model name to the address if it is not "local" or "inference endpoint" - if address == "https://api-inference.huggingface.co": - address = f"{address}/models/{request_config.model}" - headers = { - "Authorization": f"Bearer {os.environ.get('HUGGINGFACE_API_TOKEN', '')}" - } time_to_next_token = [] tokens_received 
= 0 From 73e6f59499f33560374374efc0376a62abf03364 Mon Sep 17 00:00:00 2001 From: philschmid Date: Thu, 28 Mar 2024 16:47:15 +0000 Subject: [PATCH 03/18] fix to make sure base models work as well --- src/llmperf/ray_clients/huggingface_client.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/llmperf/ray_clients/huggingface_client.py b/src/llmperf/ray_clients/huggingface_client.py index 822f724..8b88ee2 100644 --- a/src/llmperf/ray_clients/huggingface_client.py +++ b/src/llmperf/ray_clients/huggingface_client.py @@ -36,22 +36,25 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: request_config.model, use_auth_token=token if token else None ) # try to apply chat template with system message if error retry without system message - try: - prompt = tokenizer.apply_chat_template( - [ - {"role": "system", "content": ""}, - {"role": "user", "content": prompt}, - ], - tokenize=False, - add_generation_prompt=True, - ) - except: - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - tokenize=False, - add_generation_prompt=True, - ) - + if getattr(tokenizer,"chat_template", None) is None: + print("Chat template not found in tokenizer. 
Using default prompt") + prompt = prompt + else: + try: + prompt = tokenizer.apply_chat_template( + [ + {"role": "system", "content": ""}, + {"role": "user", "content": prompt}, + ], + tokenize=False, + add_generation_prompt=True, + ) + except: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + tokenize=False, + add_generation_prompt=True, + ) # update prompt_len to match include special tokens prompt_len = len(tokenizer(prompt).input_ids) From 75187d2479302cc5071197a119f183f32862ec17 Mon Sep 17 00:00:00 2001 From: philschmid Date: Thu, 28 Mar 2024 16:49:40 +0000 Subject: [PATCH 04/18] removed unnecessary res --- src/llmperf/ray_clients/huggingface_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llmperf/ray_clients/huggingface_client.py b/src/llmperf/ray_clients/huggingface_client.py index 8b88ee2..c09dc87 100644 --- a/src/llmperf/ray_clients/huggingface_client.py +++ b/src/llmperf/ray_clients/huggingface_client.py @@ -38,7 +38,6 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: # try to apply chat template with system message if error retry without system message if getattr(tokenizer,"chat_template", None) is None: print("Chat template not found in tokenizer. Using default prompt") - prompt = prompt else: try: prompt = tokenizer.apply_chat_template( From 7ad763f107cdb84579faecbf524e31414a33fcee Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:11:31 +0200 Subject: [PATCH 05/18] Update README.md Signed-off-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 42be4d4..1e88873 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A Tool for evaulation the performance of LLM APIs. 
# Installation ```bash -git clone https://github.com/ray-project/llmperf.git +git clone https://github.com/philschmid/llmperf.git cd llmperf pip install -e . ``` From 6c2ae619afa985e30e63f3babe00bcd9cbee0733 Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:12:20 +0200 Subject: [PATCH 06/18] Update README.md Signed-off-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 1e88873..b0cfc48 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,7 @@ A Tool for evaulation the performance of LLM APIs. # Installation ```bash -git clone https://github.com/philschmid/llmperf.git -cd llmperf -pip install -e . +pip install git+https://github.com/philschmid/llmperf.git ``` # Basic Usage From 0411dc28f7ec64c4d9c52b4b39c47371fec61c04 Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Tue, 16 Apr 2024 17:01:39 +0200 Subject: [PATCH 07/18] Update README.md Signed-off-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b0cfc48..8e38116 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,8 @@ A Tool for evaulation the performance of LLM APIs. 
# Installation ```bash -pip install git+https://github.com/philschmid/llmperf.git +git clone https://github.com/philschmid/llmperf.git +pip install -e llmperf/ ``` # Basic Usage From 2cbacf9501a417aee0f429d8b6518f368655913a Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Tue, 16 Apr 2024 15:31:47 +0000 Subject: [PATCH 08/18] add hf endpoint to sm client --- src/llmperf/ray_clients/sagemaker_client.py | 63 +++++++++++++-------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/src/llmperf/ray_clients/sagemaker_client.py b/src/llmperf/ray_clients/sagemaker_client.py index ce15964..98b06fa 100644 --- a/src/llmperf/ray_clients/sagemaker_client.py +++ b/src/llmperf/ray_clients/sagemaker_client.py @@ -26,23 +26,35 @@ def __init__(self): def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: if not os.environ.get("AWS_ACCESS_KEY_ID"): - raise ValueError("AWS_ACCESS_KEY_ID must be set.") + print( + f"No AWS_ACCESS_KEY_ID found in the environment. Use the default AWS credentials." + ) if not os.environ.get("AWS_SECRET_ACCESS_KEY"): - raise ValueError("AWS_SECRET_ACCESS_KEY must be set.") - if not os.environ.get("AWS_REGION_NAME"): - raise ValueError("AWS_REGION_NAME must be set.") + print( + f"No AWS_SECRET_ACCESS_KEY found in the environment. Use the default AWS credentials." + ) + region = os.environ.get("AWS_REGION", None) + if not region: + print( + f"No AWS_REGION found in the environment. Use the default AWS credentials." 
+ ) + + is_messages_api = os.environ.get("MESSAGES_API", "false").lower() == "true" + is_jumpstart = os.environ.get("JUMPSTART", "false").lower() == "true" prompt = request_config.prompt prompt, prompt_len = prompt - message = [ - {"role": "system", "content": ""}, - {"role": "user", "content": prompt}, - ] + if is_jumpstart or is_messages_api: + message = [ + {"role": "system", "content": ""}, + {"role": "user", "content": prompt}, + ] + else: + message = prompt + model = request_config.model - sm_runtime = boto3.client( - "sagemaker-runtime", region_name=os.environ.get("AWS_REGION_NAME") - ) + sm_runtime = boto3.client("sagemaker-runtime", region_name=region) sampling_params = request_config.sampling_params @@ -50,16 +62,12 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: sampling_params["max_new_tokens"] = sampling_params["max_tokens"] del sampling_params["max_tokens"] - message = { - "inputs": [ - [ - {"role": "system", "content": ""}, - {"role": "user", "content": prompt}, - ] - ], + payload = { + "inputs": message, "parameters": { **request_config.sampling_params, }, + "stream": True, } time_to_next_token = [] @@ -79,23 +87,28 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: response = sm_runtime.invoke_endpoint_with_response_stream( EndpointName=model, ContentType="application/json", - Body=json.dumps(message), - CustomAttributes="accept_eula=true", + Body=json.dumps(payload), + CustomAttributes="accept_eula=true" if is_jumpstart else "", ) event_stream = response["Body"] json_byte = b"" + generated_text = prompt + start_json = b"{" + for line, ttft, _ in LineIterator(event_stream): - json_byte += line time_to_next_token.append( time.monotonic() - most_recent_received_token_time ) most_recent_received_token_time = time.monotonic() + if line != b"" and start_json in line: + data = json.loads(line[line.find(start_json) :].decode("utf-8")) + generated_text += data["token"]["text"] ttft = ttft - start_time - 
resp = json.loads(json_byte) total_request_time = time.monotonic() - start_time - generated_text = resp[0]["generation"]["content"] - tokens_received = len(self.tokenizer.encode(generated_text)) + if is_jumpstart: + raise NotImplementedError("No tests for Jumpstart yet") + tokens_received = len(self.tokenizer(generated_text).input_ids) output_throughput = tokens_received / total_request_time except Exception as e: @@ -106,7 +119,7 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: metrics[common_metrics.ERROR_MSG] = error_msg metrics[common_metrics.ERROR_CODE] = error_response_code - metrics[common_metrics.INTER_TOKEN_LAT] = time_to_next_token + metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) metrics[common_metrics.TTFT] = ttft metrics[common_metrics.E2E_LAT] = total_request_time metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput From 9c844e02a57d12b20b5846a1a620b62d1f051a95 Mon Sep 17 00:00:00 2001 From: Philipp Schmid Date: Wed, 17 Apr 2024 15:56:32 +0000 Subject: [PATCH 09/18] add messages client --- src/llmperf/ray_clients/sagemaker_client.py | 27 +++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/llmperf/ray_clients/sagemaker_client.py b/src/llmperf/ray_clients/sagemaker_client.py index 98b06fa..a700b22 100644 --- a/src/llmperf/ray_clients/sagemaker_client.py +++ b/src/llmperf/ray_clients/sagemaker_client.py @@ -58,17 +58,25 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: sampling_params = request_config.sampling_params - if "max_tokens" in sampling_params: + if "max_tokens" in sampling_params and not is_messages_api: sampling_params["max_new_tokens"] = sampling_params["max_tokens"] del sampling_params["max_tokens"] - payload = { - "inputs": message, - "parameters": { + if is_messages_api: + payload = { + "messages": message, + "model": model, **request_config.sampling_params, - }, - "stream": True, - } + "stream": True, + } + else: + 
payload = { + "inputs": message, + "parameters": { + **request_config.sampling_params, + }, + "stream": True, + } time_to_next_token = [] tokens_received = 0 @@ -103,7 +111,10 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: most_recent_received_token_time = time.monotonic() if line != b"" and start_json in line: data = json.loads(line[line.find(start_json) :].decode("utf-8")) - generated_text += data["token"]["text"] + if is_messages_api: + generated_text += data["choices"][0]["delta"]["content"] + else: + generated_text += data["token"]["text"] ttft = ttft - start_time total_request_time = time.monotonic() - start_time if is_jumpstart: From c6abd529ecf7f170913cf9c5b570dd7381a6eaae Mon Sep 17 00:00:00 2001 From: philschmid Date: Fri, 17 May 2024 08:45:37 +0000 Subject: [PATCH 10/18] clean up readme --- README.md | 374 ++++++++++++----------------------------------- parse_results.py | 25 ++++ 2 files changed, 115 insertions(+), 284 deletions(-) create mode 100644 parse_results.py diff --git a/README.md b/README.md index 8e38116..2db3cf7 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,20 @@ # LLMPerf -A Tool for evaulation the performance of LLM APIs. +Fork of LLMPerf optimized for open LLM usage. + +## Installation -# Installation ```bash git clone https://github.com/philschmid/llmperf.git pip install -e llmperf/ ``` -# Basic Usage +## Basic Usage We implement 2 tests for evaluating LLMs: a load test to check for performance and a correctness test to check for correctness. -## Load test - -The load test spawns a number of concurrent requests to the LLM API and measures the inter-token latency and generation throughput per request and across concurrent requests. The prompt that is sent with each request is of the format: - -``` -Randomly stream lines from the following text. Don't generate eos tokens: -LINE 1, -LINE 2, -LINE 3, -... -``` - -Where the lines are randomly sampled from a collection of lines from Shakespeare sonnets. 
Tokens are counted using the `LlamaTokenizer` regardless of which LLM API is being tested. This is to ensure that the prompts are consistent across different LLM APIs. - -To run the most basic load test you can the token_benchmark_ray script. - - -### Caveats and Disclaimers - -- The endpoints provider backend might vary widely, so this is not a reflection on how the software runs on a particular hardware. -- The results may vary with time of day. -- The results may vary with the load. -- The results may not correlate with users’ workloads. - ### OpenAI Compatible APIs + ```bash export OPENAI_API_KEY=secret_abcdefg export OPENAI_API_BASE="https://api.endpoints.anyscale.com/v1" @@ -53,49 +31,9 @@ python token_benchmark_ray.py \ --results-dir "result_outputs" \ --llm-api openai \ --additional-sampling-params '{}' - -``` - -### Anthropic -```bash -export ANTHROPIC_API_KEY=secret_abcdefg - -python token_benchmark_ray.py \ ---model "claude-2" \ ---mean-input-tokens 550 \ ---stddev-input-tokens 150 \ ---mean-output-tokens 150 \ ---stddev-output-tokens 10 \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ ---llm-api anthropic \ ---additional-sampling-params '{}' - -``` - -### TogetherAI - -```bash -export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY" - -python token_benchmark_ray.py \ ---model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \ ---mean-input-tokens 550 \ ---stddev-input-tokens 150 \ ---mean-output-tokens 150 \ ---stddev-output-tokens 10 \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ ---llm-api "litellm" \ ---additional-sampling-params '{}' - ``` -### Hugging Face +### Hugging Face (TGI) ```bash export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" # only for IE and API @@ -119,26 +57,18 @@ python token_benchmark_ray.py \ --additional-sampling-params '{}' ``` -### LiteLLM - -LLMPerf can use LiteLLM to send prompts to LLM 
APIs. To see the environment variables to set for the provider and arguments that one should set for model and additional-sampling-params. +### SageMaker (TGI) -see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers). +SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. ```bash -python token_benchmark_ray.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ ---mean-input-tokens 550 \ ---stddev-input-tokens 150 \ ---mean-output-tokens 150 \ ---stddev-output-tokens 10 \ ---max-num-completed-requests 2 \ +MESSAGES_API=true python llmperf/token_benchmark_ray.py \ +--model {endpoint_name} \ +--llm-api "sagemaker" \ +--max-num-completed-requests 500 \ --timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ ---llm-api "litellm" \ ---additional-sampling-params '{}' - +--num-concurrent-requests 25 \ +--results-dir "results" ``` ### Vertex AI @@ -171,90 +101,13 @@ python token_benchmark_ray.py \ --results-dir "result_outputs" \ --llm-api "vertexai" \ --additional-sampling-params '{}' - -``` - -### SageMaker - -SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. - -```bash - -export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID" -export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY"s -export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN" -export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME" - -python llm_correctness.py \ ---model "llama-2-7b" \ ---llm-api "sagemaker" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - ``` see `python token_benchmark_ray.py --help` for more details on the arguments. 
-## Correctness Test - -The correctness test spawns a number of concurrent requests to the LLM API with the following format: - -``` -Convert the following sequence of words into a number: {random_number_in_word_format}. Output just your final answer. -``` - -where random_number_in_word_format could be for example "one hundred and twenty three". The test then checks that the response contains that number in digit format which in this case would be 123. +## Use Hugging Face Dataset -The test does this for a number of randomly generated numbers and reports the number of responses that contain a mismatch. - -To run the most basic correctness test you can run the the llm_correctness.py script. - -### OpenAI Compatible APIs - -```bash -export OPENAI_API_KEY=secret_abcdefg -export OPENAI_API_BASE=https://console.endpoints.anyscale.com/m/v1 - -python llm_correctness.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ ---max-num-completed-requests 150 \ ---timeout 600 \ ---num-concurrent-requests 10 \ ---results-dir "result_outputs" -``` - -### Anthropic - -```bash -export ANTHROPIC_API_KEY=secret_abcdefg - -python llm_correctness.py \ ---model "claude-2" \ ---llm-api "anthropic" \ ---max-num-completed-requests 5 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" -``` - -### TogetherAI - -```bash -export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY" - -python llm_correctness.py \ ---model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \ ---llm-api "litellm" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - -``` - -### Hugging Face +In this fork we added support to used datasets from Hugging Face to generate the input for the LLM. Dataset should either have a `prompt` column or use the `messages` format from openai, where then the first `user` message will be used as input. 
```bash export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" # only for IE and API @@ -262,160 +115,113 @@ export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" # only for IE and API # serverless hosted models "https://api-inference.huggingface.co" # Inference endpoints, e.g. "https://ptrlmejh4tjmcb4t.us-east-1.aws.endpoints.huggingface.cloud" export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_URL" -export MODEL_ID="HuggingFaceH4/zephyr-7b-beta" +export MODEL_ID="meta-llama/Llama-2-7b-chat-hf" -python llm_correctness.py \ +python token_benchmark_ray.py \ --model $MODEL_ID \ ---llm-api huggingface \ +--dataset \ --max-num-completed-requests 2 \ ---timeout 600 \ --num-concurrent-requests 1 \ ---results-dir "result_outputs" - +--results-dir "result_outputs" \ +--llm-api huggingface ``` -### LiteLLM +## Implementing New LLM Clients -LLMPerf can use LiteLLM to send prompts to LLM APIs. To see the environment variables to set for the provider and arguments that one should set for model and additional-sampling-params. +To implement a new LLM client, you need to implement the base class `llmperf.ray_llm_client.LLMClient` and decorate it as a ray actor. -see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers). +```python -```bash -python llm_correctness.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ ---llm-api "litellm" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ +from llmperf.ray_llm_client import LLMClient +import ray -``` -see `python llm_correctness.py --help` for more details on the arguments. +@ray.remote +class CustomLLMClient(LLMClient): + def llm_request(self, request_config: RequestConfig) -> Tuple[Metrics, str, RequestConfig]: + """Make a single completion request to a LLM API -### Vertex AI + Returns: + Metrics about the performance charateristics of the request. + The text generated by the request to the LLM API. + The request_config used to make the request. 
This is mainly for logging purposes. -Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID. + """ + ... -The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so. +``` -Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. +## End to End Test for llama 3 8b instruct +First we need to start TGI: ```bash +model=meta-llama/Meta-Llama-3-8B-Instruct +token=$(cat ~/.cache/huggingface/token) +num_shard=2 +max_input_length=5000 +max_total_tokens=6000 +max_batch_prefill_tokens=10000 +docker run --gpus $num_shard -ti -p 8080:80 \ + -e MODEL_ID=$model \ + -e HF_TOKEN=$token \ + -e NUM_SHARD=$num_shard \ + -e MAX_INPUT_LENGTH=$max_input_length \ + -e MAX_TOTAL_TOKENS=$max_total_tokens \ + -e MAX_BATCH_PREFILL_TOKENS=$max_batch_prefill_tokens \ + ghcr.io/huggingface/text-generation-inference:2.0.3 +``` + +Test the TGI: -gcloud auth application-default login -gcloud config set project YOUR_PROJECT_ID +```bash +curl http://localhost:8080 \ + -X POST \ + -d '{"inputs":"nWhat is 10+10?","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \ + -H 'Content-Type: application/json' +``` -export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token) -export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID -export GCLOUD_REGION=YOUR_REGION -export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID +Then we can run the benchmark: -python llm_correctness.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ ---llm-api "vertexai" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ +```bash +HUGGINGFACE_API_BASE="http://localhost:8080" +MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" +python token_benchmark_ray.py \ +--model $MODEL_ID \ +--max-num-completed-requests 100 \ +--num-concurrent-requests 10 \ --results-dir "result_outputs" \ - 
+--llm-api huggingface ``` -### SageMaker - -SageMaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. +Parse results ```bash - -export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID" -export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY"s -export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN" -export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME" - -python llm_correctness.py \ ---model "llama-2-7b" \ ---llm-api "sagemaker" \ ---max-num-completed-requests 2 \ ---timeout 600 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ - +python parse_results.py --results-dir "result_outputs" ``` -## Saving Results - -The results of the load test and correctness test are saved in the results directory specified by the `--results-dir` argument. The results are saved in 2 files, one with the summary metrics of the test, and one with metrics from each individual request that is returned. - -# Advanced Usage - -The correctness tests were implemented with the following workflow in mind: - -```python -import ray -from transformers import LlamaTokenizerFast - -from llmperf.ray_clients.openai_chat_completions_client import ( - OpenAIChatCompletionsClient, -) -from llmperf.models import RequestConfig -from llmperf.requests_launcher import RequestsLauncher - - -# Copying the environment variables and passing them to ray.init() is necessary -# For making any clients work. 
-ray.init(runtime_env={"env_vars": {"OPENAI_API_BASE" : "https://api.endpoints.anyscale.com/v1", - "OPENAI_API_KEY" : "YOUR_API_KEY"}}) - -base_prompt = "hello_world" -tokenizer = LlamaTokenizerFast.from_pretrained( - "hf-internal-testing/llama-tokenizer" -) -base_prompt_len = len(tokenizer.encode(base_prompt)) -prompt = (base_prompt, base_prompt_len) - -# Create a client for spawning requests -clients = [OpenAIChatCompletionsClient.remote()] - -req_launcher = RequestsLauncher(clients) - -req_config = RequestConfig( - model="meta-llama/Llama-2-7b-chat-hf", - prompt=prompt - ) - -req_launcher.launch_requests(req_config) -result = req_launcher.get_next_ready(block=True) -print(result) +Results on a 1x A10G GPU: +```bash +Avg. Input token length: 550 +Avg. Output token length: 150 +Avg. First-Time-To-Token: 375.99ms +Avg. Thorughput: 163.23 tokens/sec +Avg. Latency: 38.22ms/token ``` -# Implementing New LLM Clients - -To implement a new LLM client, you need to implement the base class `llmperf.ray_llm_client.LLMClient` and decorate it as a ray actor. +Results on a 4x A10G GPU with (max_batch_prefill_tokens=16182) -```python - -from llmperf.ray_llm_client import LLMClient -import ray - - -@ray.remote -class CustomLLMClient(LLMClient): - - def llm_request(self, request_config: RequestConfig) -> Tuple[Metrics, str, RequestConfig]: - """Make a single completion request to a LLM API - - Returns: - Metrics about the performance charateristics of the request. - The text generated by the request to the LLM API. - The request_config used to make the request. This is mainly for logging purposes. +```bash +Avg. Input token length: 550 +Avg. Output token length: 150 +Avg. First-Time-To-Token: 375.99ms +Avg. Thorughput: 163.23 tokens/sec +Avg. Latency: 38.22ms/token +``` - """ - ... -``` +### Speculative Test -# Legacy Codebase -The old LLMPerf code base can be found in the [llmperf-legacy](https://github.com/ray-project/llmval-legacy) repo. 
+```bash diff --git a/parse_results.py b/parse_results.py new file mode 100644 index 0000000..506fe55 --- /dev/null +++ b/parse_results.py @@ -0,0 +1,25 @@ +import argparse +import glob +import json + +# python parse_results.py --results-dir "result_outputs" + +# Parse command line arguments +parser = argparse.ArgumentParser() +parser.add_argument("--results-dir", help="Directory containing the result files") +args = parser.parse_args() + +# Check if --results-dir argument is provided +if not args.results_dir: + print("Please provide the --results-dir argument.") + exit(1) + +# Reads the summary.json file and prints the results +with open(glob.glob(f'{args.results_dir}/*summary.json')[0], 'r') as file: + data = json.load(file) + +print(f"Avg. Input token length: {data['mean_input_tokens']}") +print(f"Avg. Output token length: {data['mean_output_tokens']}") +print(f"Avg. First-Time-To-Token: {data['results_ttft_s_mean']*1000:.2f}ms") +print(f"Avg. Thorughput: {data['results_mean_output_throughput_token_per_s']:.2f} tokens/sec") +print(f"Avg. Latency: {data['results_inter_token_latency_s_mean']*1000:.2f}ms/token") \ No newline at end of file From 5be9eee14299863b64b91b0e863d4dbb50b2eedc Mon Sep 17 00:00:00 2001 From: philschmid Date: Fri, 17 May 2024 08:47:57 +0000 Subject: [PATCH 11/18] update --- README.md | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 2db3cf7..712319a 100644 --- a/README.md +++ b/README.md @@ -159,10 +159,10 @@ First we need to start TGI: ```bash model=meta-llama/Meta-Llama-3-8B-Instruct token=$(cat ~/.cache/huggingface/token) -num_shard=2 +num_shard=1 max_input_length=5000 max_total_tokens=6000 -max_batch_prefill_tokens=10000 +max_batch_prefill_tokens=6144 docker run --gpus $num_shard -ti -p 8080:80 \ -e MODEL_ID=$model \ -e HF_TOKEN=$token \ @@ -211,17 +211,26 @@ Avg. Thorughput: 163.23 tokens/sec Avg. 
Latency: 38.22ms/token ``` -Results on a 4x A10G GPU with (max_batch_prefill_tokens=16182) +Results on a 1x H100 GPU with (max_batch_prefill_tokens=16182) ```bash -Avg. Input token length: 550 -Avg. Output token length: 150 -Avg. First-Time-To-Token: 375.99ms -Avg. Thorughput: 163.23 tokens/sec -Avg. Latency: 38.22ms/token ``` ### Speculative Test ```bash +model=ibm-fms/llama3-8b-accelerator +token=$(cat ~/.cache/huggingface/token) +num_shard=1 +max_input_length=5000 +max_total_tokens=6000 +max_batch_prefill_tokens=6144 +docker run --gpus $num_shard -ti -p 8080:80 \ + -e MODEL_ID=$model \ + -e HF_TOKEN=$token \ + -e NUM_SHARD=$num_shard \ + -e MAX_INPUT_LENGTH=$max_input_length \ + -e MAX_TOTAL_TOKENS=$max_total_tokens \ + -e MAX_BATCH_PREFILL_TOKENS=$max_batch_prefill_tokens \ + ghcr.io/huggingface/text-generation-inference:2.0.3 \ No newline at end of file From acbee87e7f22d3dab7fea94332c0bc24c02e3e7a Mon Sep 17 00:00:00 2001 From: philschmid Date: Wed, 5 Jun 2024 12:17:49 +0000 Subject: [PATCH 12/18] init --- README.md | 113 ++++++--------- pyproject.toml | 31 ++-- scripts/benchmark.py | 137 ++++++++++++++++++ .../openai_chat_completions_client.py | 10 +- 4 files changed, 206 insertions(+), 85 deletions(-) create mode 100644 scripts/benchmark.py diff --git a/README.md b/README.md index 712319a..9c680b1 100644 --- a/README.md +++ b/README.md @@ -9,18 +9,26 @@ git clone https://github.com/philschmid/llmperf.git pip install -e llmperf/ ``` +## Benchmarks + +This fork of LLMPerf was used to generated the following benchmarks: +* [Llama 3 8B Instruct on NVIDIA A10G: Hugging Face TGI, vLLM, NVIDIA NIM](./benchmarks/llama_3_8b_instruct_a10g.md) + + ## Basic Usage We implement 2 tests for evaluating LLMs: a load test to check for performance and a correctness test to check for correctness. 
### OpenAI Compatible APIs +_Note: This includes `vllm`, `Tgi` or NVIDIA NIM Containers._ + ```bash export OPENAI_API_KEY=secret_abcdefg -export OPENAI_API_BASE="https://api.endpoints.anyscale.com/v1" +export OPENAI_API_BASE="https://api.endpoints.anyscale.com/v1" # or "http://localhost:8000/v1" python token_benchmark_ray.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ +--model "meta-llama/Meta-Llama-3-8B-Instruct" \ --mean-input-tokens 550 \ --stddev-input-tokens 150 \ --mean-output-tokens 150 \ @@ -73,6 +81,9 @@ MESSAGES_API=true python llmperf/token_benchmark_ray.py \ ### Vertex AI +_NOTE: WIP, not yet tested._ + + Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID. The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so. @@ -90,7 +101,7 @@ export GCLOUD_REGION=YOUR_REGION export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID python token_benchmark_ray.py \ ---model "meta-llama/Llama-2-7b-chat-hf" \ +--model "meta-llama/Meta-Llama-3-8B-Instruct" \ --mean-input-tokens 550 \ --stddev-input-tokens 150 \ --mean-output-tokens 150 \ @@ -105,54 +116,9 @@ python token_benchmark_ray.py \ see `python token_benchmark_ray.py --help` for more details on the arguments. -## Use Hugging Face Dataset - -In this fork we added support to used datasets from Hugging Face to generate the input for the LLM. Dataset should either have a `prompt` column or use the `messages` format from openai, where then the first `user` message will be used as input. - -```bash -export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" # only for IE and API -# local testing "http://localhost:8000" -# serverless hosted models "https://api-inference.huggingface.co" -# Inference endpoints, e.g. 
"https://ptrlmejh4tjmcb4t.us-east-1.aws.endpoints.huggingface.cloud" -export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_URL" -export MODEL_ID="meta-llama/Llama-2-7b-chat-hf" - -python token_benchmark_ray.py \ ---model $MODEL_ID \ ---dataset \ ---max-num-completed-requests 2 \ ---num-concurrent-requests 1 \ ---results-dir "result_outputs" \ ---llm-api huggingface -``` - -## Implementing New LLM Clients - -To implement a new LLM client, you need to implement the base class `llmperf.ray_llm_client.LLMClient` and decorate it as a ray actor. - -```python +## Examples and other use cases -from llmperf.ray_llm_client import LLMClient -import ray - - -@ray.remote -class CustomLLMClient(LLMClient): - - def llm_request(self, request_config: RequestConfig) -> Tuple[Metrics, str, RequestConfig]: - """Make a single completion request to a LLM API - - Returns: - Metrics about the performance charateristics of the request. - The text generated by the request to the LLM API. - The request_config used to make the request. This is mainly for logging purposes. - - """ - ... 
- -``` - -## End to End Test for llama 3 8b instruct +### End to End Test for llama 3 8b instruct First we need to start TGI: @@ -178,7 +144,7 @@ Test the TGI: ```bash curl http://localhost:8080 \ -X POST \ - -d '{"inputs":"nWhat is 10+10?","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \ + -d '{"inputs":"What is 10+10?","parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}' \ -H 'Content-Type: application/json' ``` @@ -217,20 +183,33 @@ Results on a 1x H100 GPU with (max_batch_prefill_tokens=16182) ``` -### Speculative Test +### Speculative Decoding -```bash -model=ibm-fms/llama3-8b-accelerator -token=$(cat ~/.cache/huggingface/token) -num_shard=1 -max_input_length=5000 -max_total_tokens=6000 -max_batch_prefill_tokens=6144 -docker run --gpus $num_shard -ti -p 8080:80 \ - -e MODEL_ID=$model \ - -e HF_TOKEN=$token \ - -e NUM_SHARD=$num_shard \ - -e MAX_INPUT_LENGTH=$max_input_length \ - -e MAX_TOTAL_TOKENS=$max_total_tokens \ - -e MAX_BATCH_PREFILL_TOKENS=$max_batch_prefill_tokens \ - ghcr.io/huggingface/text-generation-inference:2.0.3 \ No newline at end of file +_Note: WIP_ + + +### Use Hugging Face Dataset + +In this fork we added support to used datasets from Hugging Face to generate the input for the LLM. Dataset should either have a `prompt` column or use the `messages` format from openai, where then the first `user` message will be used as input. + +_Note: WIP._ + + + +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" 
+ } + ], + "stream": true + }' diff --git a/pyproject.toml b/pyproject.toml index 7687fb2..142e52c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,18 +6,21 @@ build-backend = "setuptools.build_meta" name = "LLMPerf" version = "0.1.0" description = "A framework for load testing LLM APIs" -authors = [{name="Avnish Narayan", email="avnish@anyscale.com"}] -license = {text= "Apache-2.0"} +authors = [{ name = "Avnish Narayan", email = "avnish@anyscale.com" }] +license = { text = "Apache-2.0" } requires-python = ">=3.8, <3.11" -dependencies = ["pydantic<2.5", - "ray", - "pytest>=6.0", - "seaborn>=0.11", - "awscli>=1.22", - "typer>=0.4", - "litellm>=0.1.738", - "num2words", - "transformers", - "tqdm", - "boto3", - "google-cloud-aiplatform"] +dependencies = [ + "pydantic<2.5", + "ray", + "pytest>=6.0", + "seaborn>=0.11", + "awscli>=1.22", + "typer>=0.4", + "litellm>=0.1.738", + "num2words", + "transformers", + "tqdm", + "boto3", + "google-cloud-aiplatform", + "pandas", +] diff --git a/scripts/benchmark.py b/scripts/benchmark.py new file mode 100644 index 0000000..5ff4857 --- /dev/null +++ b/scripts/benchmark.py @@ -0,0 +1,137 @@ +import argparse +from dataclasses import dataclass, field +import os +import subprocess +import json +import glob +import pandas as pd + + +@dataclass +class Config: + model_id: str + concurrency: list = field(default_factory=list) + num_requests: int = 100 # Default value if not specified + input_token_length: int = 500 # Default value if not specified + output_token_length: int = 200 # Default value if not specified + + +def benchmark(config): + """Run the performance script for each concurrency level.""" + results = {} + detailed_results = {} + # get script file path its ../token_benchmark_ray.py from the current benchmark.py + script_file_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "../token_benchmark_ray.py" + ) + for concurrency in config.concurrency: + print(f"Running test with concurrency: {concurrency}") + 
os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1" + os.environ["OPENAI_API_KEY"] = "none" + output_dir = f"result_outputs_{concurrency}" + cmd = [ + "python", + script_file_path, + "--model", + config.model_id, + "--mean-input-tokens", + str(config.input_token_length), + "--stddev-input-tokens", + "0", + "--mean-output-tokens", + str(config.output_token_length), + "--stddev-output-tokens", + "0", + "--max-num-completed-requests", + str(config.num_requests), + "--timeout", + "600", + "--num-concurrent-requests", + str(concurrency), + "--results-dir", + output_dir, + "--llm-api", + "openai", + "--additional-sampling-params", + "{}", + ] + subprocess.run(cmd) + with open(glob.glob(f"{output_dir}/*summary.json")[0], "r") as file: + data = json.load(file) + c_detailed_results = { + "concurrency": concurrency, + "mean_input_token_length": data["results_number_input_tokens_mean"], + "mean_output_token_length": data["results_number_output_tokens_mean"], + "first-time-to-token_mean_in_ms_(ttft)": data["results_ttft_s_mean"] * 1000, + "throughput_token_per_s_(token/sec)": data[ + "results_mean_output_throughput_token_per_s" + ], + "latency_ms_per_token_(inter_token_latency)": data[ + "results_inter_token_latency_s_mean" + ] + * 1000, + "requests_per_minute_(qpm)": data["results_num_completed_requests_per_min"], + } + # append results + results[concurrency] = data + detailed_results[concurrency] = c_detailed_results + with open( + f'{config.model_id.replace("/","_")}_cur_{concurrency}.json', "w" + ) as file: + json.dump(detailed_results[concurrency], file, indent=2) + # remove the output directory + subprocess.run(["rm", "-rf", output_dir]) + return results, detailed_results + + +def main(): + parser = argparse.ArgumentParser( + description="Manage Docker, run tests, and process results." 
+ ) + parser.add_argument("--model-id", type=str, help="The model ID to benchmark.") + parser.add_argument( + "--concurrency", + type=int, + nargs="+", + help="The concurrency levels to test. Add via space separated list.", + default=[1, 2, 4, 8, 16, 32, 64], + ) + parser.add_argument( + "--num-requests", type=int, help="The number of requests to make.", default=100 + ) + parser.add_argument( + "--input-token-length", + type=int, + help="The length of the input tokens.", + default=550, + ) + parser.add_argument( + "--output-token-length", + type=int, + help="The length of the output tokens.", + default=150, + ) + args = parser.parse_args() + + # convert args to config + config = Config( + model_id=args.model_id, + concurrency=args.concurrency, + num_requests=args.num_requests, + input_token_length=args.input_token_length, + output_token_length=args.output_token_length, + ) + # run the benchmark + results, detailed_results = benchmark(config) + # print the results in a nice markdown table using pandas + df = pd.DataFrame(detailed_results) + print(df.to_markdown()) + # write to csv + df.to_csv(f"{config.model_id.replace('/','_')}.csv") + + +if __name__ == "__main__": + main() + +# example usage +# python scripts/benchmark.py --model-id "openai/chatgpt" --concurrency 1 2 --num-requests 100 --input-token-length 550 --output-token-length 150 diff --git a/src/llmperf/ray_clients/openai_chat_completions_client.py b/src/llmperf/ray_clients/openai_chat_completions_client.py index f2e0a91..be7465f 100644 --- a/src/llmperf/ray_clients/openai_chat_completions_client.py +++ b/src/llmperf/ray_clients/openai_chat_completions_client.py @@ -76,8 +76,8 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: if not chunk: continue - stem = "data: " - chunk = chunk[len(stem) :] + stem = "data:" + chunk = chunk[len(stem) :].lstrip() if chunk == b"[DONE]": continue tokens_received += 1 @@ -87,7 +87,7 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, 
Any]: error_msg = data["error"]["message"] error_response_code = data["error"]["code"] raise RuntimeError(data["error"]["message"]) - + delta = data["choices"][0]["delta"] if delta.get("content", None): if not ttft: @@ -109,7 +109,9 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: print(f"Warning Or Error: {e}") print(error_response_code) - metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) #This should be same as metrics[common_metrics.E2E_LAT]. Leave it here for now + metrics[common_metrics.INTER_TOKEN_LAT] = sum( + time_to_next_token + ) # This should be same as metrics[common_metrics.E2E_LAT]. Leave it here for now metrics[common_metrics.TTFT] = ttft metrics[common_metrics.E2E_LAT] = total_request_time metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput From e22c83038431e4e20783648ac344114e8f5cfd32 Mon Sep 17 00:00:00 2001 From: philschmid Date: Wed, 5 Jun 2024 13:29:09 +0000 Subject: [PATCH 13/18] first results --- benchmarks/llama_3_8b_instruct_a10g.md | 139 ++++++++++++++++++ .../a10g/nim-meta_llama3-8b-instruct.csv | 8 + ...gi-meta-llama_Meta-Llama-3-8B-Instruct.csv | 8 + ...lm-meta-llama_Meta-Llama-3-8B-Instruct.csv | 8 + 4 files changed, 163 insertions(+) create mode 100644 benchmarks/llama_3_8b_instruct_a10g.md create mode 100644 benchmarks/results/a10g/nim-meta_llama3-8b-instruct.csv create mode 100644 benchmarks/results/a10g/tgi-meta-llama_Meta-Llama-3-8B-Instruct.csv create mode 100644 benchmarks/results/a10g/vllm-meta-llama_Meta-Llama-3-8B-Instruct.csv diff --git a/benchmarks/llama_3_8b_instruct_a10g.md b/benchmarks/llama_3_8b_instruct_a10g.md new file mode 100644 index 0000000..d370a78 --- /dev/null +++ b/benchmarks/llama_3_8b_instruct_a10g.md @@ -0,0 +1,139 @@ +# Benchmark: Llama 3 8b Instruct on NVIDIA A10G + +Benchmarking the performance of LLMs on the Llama 3 8b Instruct model using the NVIDIA A10G GPU using `llmperf`. 
The engines tested include vLLM, Hugging Face TGI, and NVIDIA NIM, all measured via HTTP and their OpenAI API implementations. The tests were run on an Amazon EC2 g5.2xlarge instance equipped with an NVIDIA A10G GPU. + +## Test Environment +- **Instance Type**: Amazon EC2 g5.2xlarge +- **GPU**: NVIDIA A10G +- **Setup**: Requests and containers were run on the same machine via localhost. +- **Engines Tested**: + - [vLLM](https://docs.vllm.ai/en/stable/) + - [Hugging Face TGI](https://huggingface.co/docs/text-generation-inference/en/index) + - [NVIDIA NIM](https://build.nvidia.com/) +- **Model**: [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) +- **Scenario**: + - Expected Input: 550 tokens (mean) + - Expected Output: 150 tokens (mean) + - Concurrent Requests: 2, 4, 8, 16, 32, 64 +- **metrics**: + - Throughput: Measures how many tokens can be processed in a given time frame. + - First Time to Token: Tracks the time taken to generate the first token in response to a request. + - Latency (Inter-Token Latency): Measures the time elapsed between generating successive tokens. + - Requests Per Second: Evaluates the number of requests that can be handled by the model per second. + +The benchmarking was performed using `llmperf`, a tool designed to evaluate the performance of LLMs across different frameworks and hardware configurations.
+ +## Benchmark Results + +### Concurrency User 1 + +| Engine | vLLM | TGI | NVIDIA NIM | +| ------------------------------ | ----------- | ----------- | ----------- | +| First Time To Token (ms) | 137.2919661 | 138.9137787 | 135.4107646 | +| Througput (token/sec) | 31.92462559 | 32.78526142 | 32.2123514 | +| Inter Token Latency (ms/token) | 30.65149844 | 29.86407376 | 30.3319248 | + + +### Concurrency User 4 + +| Engine | vLLM | TGI | NVIDIA NIM | +| ------------------------------ | ----------- | ----------- | ----------- | +| First Time To Token (ms) | 171.3956358 | 212.6501531 | 173.6120437 | +| Througput (token/sec) | 110.9478713 | 110.7551778 | 115.3847403 | +| Inter Token Latency (ms/token) | 33.88657168 | 33.60044702 | 31.56057292 | + +### Concurrency User 16 + +| Engine | vLLM | TGI | NVIDIA NIM | +| ------------------------------ | ----------- | ----------- | ----------- | +| First Time To Token (ms) | 302.7480913 | 475.7047288 | 336.220663 | +| Througput (token/sec) | 289.873427 | 277.873219 | 298.7441355 | +| Inter Token Latency (ms/token) | 42.66842311 | 42.95979633 | 38.68509632 | + +### Concurrency User 64 + +| Engine | vLLM | TGI | NVIDIA NIM | +| ------------------------------ | ----------- | ----------- | ----------- | +| First Time To Token (ms) | 1080.420167 | 2371.579404 | 1814.533666 | +| Througput (token/sec) | 301.1851391 | 304.3837829 | 310.8465793 | +| Inter Token Latency (ms/token) | 61.72701229 | 60.59072025 | 52.95298819 | + + +## Steps to Run Each Benchmark + +Make sure to login into huggingface to have access to Llama 3 8B Instruct model with `huggingface-cli login`. We are going to use the [benchmark.py](../scripts/benchmark.py) script to run the benchmarks. The script will run the benchmark for 2, 4, 8, 16, 32, 64, and 128 concurrent requests using the same configuration for each engine. + +### vLLM + +1. 
Start the vLLM Container: +```bash +docker run --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -e "HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token)" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:v0.4.3 \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +2. Run the benchmark: + +```bash +# pwd +# >/home/ubuntu/llmperf +python scripts/benchmark.py --model-id "meta-llama/Meta-Llama-3-8B-Instruct" +``` + +### Hugging Face TGI + +1. Start the TGI Container: + +```bash +docker run --gpus all -ti -p 8000:80 \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -e MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" \ + -e HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token) \ + -e MAX_INPUT_LENGTH=6000 \ + -e MAX_TOTAL_TOKENS=6144 \ + -e MAX_BATCH_PREFILL_TOKENS=8192 \ + ghcr.io/huggingface/text-generation-inference:2.0.4 +``` + +1. Run the benchmark: + +```bash +# pwd +# >/home/ubuntu/llmperf +python scripts/benchmark.py --model-id "meta-llama/Meta-Llama-3-8B-Instruct" +``` + +### NVIDIA NIM (llm_engine: tensorrt_llm) + +NIM Config: +```bash +Profile metadata: feat_lora: false +Profile metadata: precision: fp16 +Profile metadata: tp: 1 +Profile metadata: llm_engine: tensorrt_llm +Profile metadata: pp: 1 +Profile metadata: profile: throughput +Profile metadata: gpu: A10G +``` +_Note: NVIDIA NIM requires a valid license and an NVIDIA API key. Make sure to replace `NGC_API_KEY`._ + +1. Start the NVIDIA NIM Container: + +```bash +docker run --gpus all -ti -p 8000:8000 \ + -e NGC_API_KEY=nvapi-xxxx \ + nvcr.io/nim/meta/llama3-8b-instruct:1.0.0 +``` + +1.
Run the benchmark: +_Note: NVIDIA changed the name from the official model id_ + +```bash +# pwd +# >/home/ubuntu/llmperf +python scripts/benchmark.py --model-id "meta/llama3-8b-instruct" +``` \ No newline at end of file diff --git a/benchmarks/results/a10g/nim-meta_llama3-8b-instruct.csv b/benchmarks/results/a10g/nim-meta_llama3-8b-instruct.csv new file mode 100644 index 0000000..b5ce5db --- /dev/null +++ b/benchmarks/results/a10g/nim-meta_llama3-8b-instruct.csv @@ -0,0 +1,8 @@ +,1,2,4,8,16,32,64 +concurrency,1.0,2.0,4.0,8.0,16.0,32.0,64.0 +mean_input_token_length,550.0,550.0,550.0,550.0,550.0,550.0,550.0 +mean_output_token_length,166.94,167.8,169.31,168.94230769230768,168.53571428571428,167.0859375,168.8828125 +first-time-to-token_mean_in_ms_(ttft),135.41076456991505,147.75785183010157,173.61204367998653,231.09643700977009,336.22066303579294,325.3755184217937,1814.5336663984608 +throughput_token_per_s_(token/sec),32.21235140322287,61.536425104467924,115.384740281783,200.4407025179132,298.74413554760974,374.5205318175369,310.84657934372564 +latency_ms_per_token_(inter_token_latency),30.331924801253578,30.776021783836935,31.56057291689576,33.894895297254756,38.68509631915052,44.48178251480613,52.95298818714245 +requests_per_minute_(qpm),11.577459471626767,22.003489310298423,40.88999124036962,71.18668091700502,106.35519129476253,134.48906739414986,110.43631074431295 diff --git a/benchmarks/results/a10g/tgi-meta-llama_Meta-Llama-3-8B-Instruct.csv b/benchmarks/results/a10g/tgi-meta-llama_Meta-Llama-3-8B-Instruct.csv new file mode 100644 index 0000000..da0ce1d --- /dev/null +++ b/benchmarks/results/a10g/tgi-meta-llama_Meta-Llama-3-8B-Instruct.csv @@ -0,0 +1,8 @@ +,1,2,4,8,16,32,64 +concurrency,1.0,2.0,4.0,8.0,16.0,32.0,64.0 +mean_input_token_length,550.0,550.0,550.0,550.0,550.0,550.0,550.0 +mean_output_token_length,175.47,174.22,174.91,174.90384615384616,174.11607142857142,176.1328125,176.8984375 
+first-time-to-token_mean_in_ms_(ttft),138.9137787099753,153.3188243099903,212.65015314993434,325.2174015288591,475.70472879473465,707.5841560625094,2371.5794044920954 +throughput_token_per_s_(token/sec),32.785261417985254,59.73921702382513,110.75517782814165,187.20210799221525,277.8732190091075,346.0933269929774,304.3837828657419 +latency_ms_per_token_(inter_token_latency),29.864073755361563,31.903856806699547,33.60044701626547,36.461300106203126,42.95979633481386,50.71961143107315,60.59072025021147 +requests_per_minute_(qpm),11.21055271601479,20.57371726225179,37.99274295173803,64.21886497369012,95.75447575720233,117.89739415861905,103.24018250271158 diff --git a/benchmarks/results/a10g/vllm-meta-llama_Meta-Llama-3-8B-Instruct.csv b/benchmarks/results/a10g/vllm-meta-llama_Meta-Llama-3-8B-Instruct.csv new file mode 100644 index 0000000..b59aff3 --- /dev/null +++ b/benchmarks/results/a10g/vllm-meta-llama_Meta-Llama-3-8B-Instruct.csv @@ -0,0 +1,8 @@ +,1,2,4,8,16,32,64 +concurrency,1.0,2.0,4.0,8.0,16.0,32.0,64.0 +mean_input_token_length,550.0,550.0,550.0,550.0,550.0,550.0,550.0 +mean_output_token_length,177.43,176.69,177.03,176.42307692307693,177.09821428571428,177.3046875,176.0234375 +first-time-to-token_mean_in_ms_(ttft),137.2919660700245,150.65406004003307,171.3956357799725,249.26975120194163,302.7480912588609,319.6774275156855,1080.4201672266204 +throughput_token_per_s_(token/sec),31.92462558547657,59.16626358621019,110.94787126384446,191.83859595565588,289.87342701157286,363.9864466332658,301.18513911051025 +latency_ms_per_token_(inter_token_latency),30.651498440077052,32.754774205866035,33.88657167794647,36.69785138358354,42.668423107195636,50.154911587173835,61.72701229143614 +requests_per_minute_(qpm),10.79568018445919,20.091549126564104,37.60307448359413,65.24268796399022,98.20768487611645,123.17320599883152,102.6630805720438 From 61362d936860809b438262a7b00e4977219fa49b Mon Sep 17 00:00:00 2001 From: philschmid Date: Wed, 5 Jun 2024 14:16:07 +0000 Subject: 
[PATCH 14/18] x --- benchmarks/llama_3_8b_instruct_a10g.md | 33 +++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/benchmarks/llama_3_8b_instruct_a10g.md b/benchmarks/llama_3_8b_instruct_a10g.md index d370a78..aa4bbc2 100644 --- a/benchmarks/llama_3_8b_instruct_a10g.md +++ b/benchmarks/llama_3_8b_instruct_a10g.md @@ -19,12 +19,24 @@ Benchmarking the performance of LLMs on the Llama 3 8b Instruct model using the - Throughput: Measures how many tokens can be processed in a given time frame. - First Time to Token: Tracks the time taken to generate the first token in response to a request. - Latency (Inter-Token Latency): Measures the time elapsed between generating successive tokens. - - Requests Per Second: Evaluates the number of requests that can be handled by the model per second. The benchmarking was performed using `llmperf`, a tool designed to evaluate the performance of LLMs across different frameworks and hardware configurations. ## Benchmark Results +The benchmark tested the Llama 3 8b Instruct model on an NVIDIA A10G GPU using llmperf, comparing vLLM, Hugging Face TGI, and NVIDIA NIM on an Amazon EC2 g5.2xlarge instance. Metrics included throughput, first time to token, and inter-token latency under varying levels of concurrency. + +NVIDIA NIM offers overall best engine, #1 in inter-token latency and maintaining competitive throughput. For example, at concurrency 64, NIM had the lowest inter-token latency (52.95 ms/token) and the highest throughput (310.85 tokens/sec). vLLM showed the fastest first token generation across all tests, such as 137.29 ms at concurrency 1, making it ideal for latency-sensitive applications. + +Hugging Face TGI maintained high throughput, like 304.38 tokens/sec at concurrency 64, making it suitable for high-load scenarios. However, TGI often lagged behind in first token generation and inter-token latency. 
For instance, at concurrency 16, TGI’s first token generation time was 475.70 ms, 57.13% slower than vLLM (302.75 ms) and 41.54% slower than NIM (336.22 ms). + +TGI's performance metrics evolved noticeably. At 1 concurrent user, TGI had a first time to token of 138.91 ms and throughput of 32.79 tokens/sec. At 64 users, TGI's first time to token ( 2371.58 ms) is dramatically higher compared to vLLM (1080.42 ms) and NVIDIA NIM (1814.53 ms) + +Despite this increase, TGI managed to keep a high throughput (304.38 tokens/sec) compared to vLLM's 301.19 tokens/sec and NVIDIA NIM's 310.85 tokens/sec. TGI's inter-token latency is also stays competitive, though it is still outperformed by NVIDIA NIM, especially at 64 users where TGI had 60.59 ms/token compared to NVIDIA NIM's 52.95 ms/token. +As concurrency increased from 1 to 64 users, TGI's inter-token latency remains close but slightly better to vLLM, being 2.57% faster at 1 user and 1.84% faster at 64 users. + + + ### Concurrency User 1 | Engine | vLLM | TGI | NVIDIA NIM | @@ -33,6 +45,10 @@ The benchmarking was performed using `llmperf`, a tool designed to evaluate the | Througput (token/sec) | 31.92462559 | 32.78526142 | 32.2123514 | | Inter Token Latency (ms/token) | 30.65149844 | 29.86407376 | 30.3319248 | +- For First Time To Token (ms), TGI is 1.18% slower than vLLM and NVIDIA NIM is 1.37% faster than vLLM. Compared to TGI, NVIDIA NIM is 2.52% faster. +- For Throughput (token/sec), TGI is 2.70% higher than vLLM and NVIDIA NIM is 0.90% higher than vLLM. Compared to TGI, NVIDIA NIM is 1.75% lower. +- For Inter Token Latency (ms/token), TGI is 2.57% faster than vLLM and NVIDIA NIM is 1.04% faster than vLLM. Compared to TGI, NVIDIA NIM is 1.57% slower.
+ ### Concurrency User 4 @@ -42,6 +58,11 @@ The benchmarking was performed using `llmperf`, a tool designed to evaluate the | Througput (token/sec) | 110.9478713 | 110.7551778 | 115.3847403 | | Inter Token Latency (ms/token) | 33.88657168 | 33.60044702 | 31.56057292 | +- For First Time To Token (ms), TGI is 24.07% slower than vLLM and NVIDIA NIM is 1.29% slower than vLLM. Compared to TGI, NVIDIA NIM is 18.36% faster. +- For Throughput (token/sec), TGI is 0.17% lower than vLLM and NVIDIA NIM is 4.00% higher than vLLM. Compared to TGI, NVIDIA NIM is 4.18% higher. +- For Inter Token Latency (ms/token), TGI is 0.84% faster than vLLM and NVIDIA NIM is 6.86% faster than vLLM. Compared to TGI, NVIDIA NIM is 6.07% faster. + + ### Concurrency User 16 | Engine | vLLM | TGI | NVIDIA NIM | @@ -50,6 +71,11 @@ The benchmarking was performed using `llmperf`, a tool designed to evaluate the | Througput (token/sec) | 289.873427 | 277.873219 | 298.7441355 | | Inter Token Latency (ms/token) | 42.66842311 | 42.95979633 | 38.68509632 | +- For First Time To Token (ms), TGI is 57.13% slower than vLLM and NVIDIA NIM is 11.06% slower than vLLM. Compared to TGI, NVIDIA NIM is 29.32% faster. +- For Throughput (token/sec), TGI is 4.14% lower than vLLM and NVIDIA NIM is 3.06% higher than vLLM. Compared to TGI, NVIDIA NIM is 7.51% higher. +- For Inter Token Latency (ms/token), TGI is 0.68% slower than vLLM and NVIDIA NIM is 9.34% faster than vLLM. Compared to TGI, NVIDIA NIM is 9.95% faster. + + ### Concurrency User 64 | Engine | vLLM | TGI | NVIDIA NIM | @@ -58,6 +84,11 @@ The benchmarking was performed using `llmperf`, a tool designed to evaluate the | Througput (token/sec) | 301.1851391 | 304.3837829 | 310.8465793 | | Inter Token Latency (ms/token) | 61.72701229 | 60.59072025 | 52.95298819 | +- For First Time To Token (ms), TGI is 119.51% slower than vLLM and NVIDIA NIM is 67.95% slower than vLLM. Compared to TGI, NVIDIA NIM is 23.49% faster.
+- For Throughput (token/sec), TGI is 1.06% higher than vLLM and NVIDIA NIM is 3.21% higher than vLLM. Compared to TGI, NVIDIA NIM is 2.12% higher. +- For Inter Token Latency (ms/token), TGI is 1.84% faster than vLLM and NVIDIA NIM is 14.21% faster than vLLM. Compared to TGI, NVIDIA NIM is 12.61% faster. + + ## Steps to Run Each Benchmark From eb77e6376a0937a7454fd17803abe7dcfd7a65a7 Mon Sep 17 00:00:00 2001 From: philschmid Date: Wed, 5 Jun 2024 14:34:54 +0000 Subject: [PATCH 15/18] updatae --- benchmarks/llama_3_8b_instruct_a10g.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/llama_3_8b_instruct_a10g.md b/benchmarks/llama_3_8b_instruct_a10g.md index aa4bbc2..6590808 100644 --- a/benchmarks/llama_3_8b_instruct_a10g.md +++ b/benchmarks/llama_3_8b_instruct_a10g.md @@ -33,8 +33,7 @@ Hugging Face TGI maintained high throughput, like 304.38 tokens/sec at concurren TGI's performance metrics evolved noticeably. At 1 concurrent user, TGI had a first time to token of 138.91 ms and throughput of 32.79 tokens/sec. At 64 users, TGI's first time to token ( 2371.58 ms) is dramatically higher compared to vLLM (1080.42 ms) and NVIDIA NIM (1814.53 ms) Despite this increase, TGI managed to keep a high throughput (304.38 tokens/sec) compared to vLLM's 301.19 tokens/sec and NVIDIA NIM's 310.85 tokens/sec. TGI's inter-token latency is also stays competitive, though it is still outperformed by NVIDIA NIM, especially at 64 users where TGI had 60.59 ms/token compared to NVIDIA NIM's 52.95 ms/token. -As concurrency increased from 1 to 64 users, TGI's inter-token latency remains close but slightly better to vLLM, being 2.57% faster at 1 user and 1.84% faster at 64 users. - +As concurrency increased from 1 to 64 users, TGI's and vLLM's inter-token latencies remained close to each other, ranging from vLLM being ~1.12% faster at 32 users to TGI being ~2.57% faster at 1 user. 
### Concurrency User 1 From e5833c0de47e7c4d397357d91cca4db79f6fa33b Mon Sep 17 00:00:00 2001 From: philschmid Date: Wed, 12 Jun 2024 16:33:42 +0000 Subject: [PATCH 16/18] wip --- .gitignore | 2 - .../yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv | 16 + .../tgi/01-ai_Yi-1.5-34B-Chat_cur_1.json | 17 + .../tgi/01-ai_Yi-1.5-34B-Chat_cur_16.json | 17 + .../tgi/01-ai_Yi-1.5-34B-Chat_cur_2.json | 17 + .../tgi/01-ai_Yi-1.5-34B-Chat_cur_32.json | 17 + .../tgi/01-ai_Yi-1.5-34B-Chat_cur_4.json | 17 + .../tgi/01-ai_Yi-1.5-34B-Chat_cur_64.json | 17 + .../tgi/01-ai_Yi-1.5-34B-Chat_cur_8.json | 17 + ...34B-Chat_550_150_individual_responses.json | 915 +++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1234 +++++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1102 +++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1410 +++++++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1102 +++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1410 +++++++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1146 ++++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + .../yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv | 16 + .../vllm/01-ai_Yi-1.5-34B-Chat_cur_1.json | 17 + .../vllm/01-ai_Yi-1.5-34B-Chat_cur_16.json | 17 + .../vllm/01-ai_Yi-1.5-34B-Chat_cur_2.json | 17 + .../vllm/01-ai_Yi-1.5-34B-Chat_cur_32.json | 17 + .../vllm/01-ai_Yi-1.5-34B-Chat_cur_4.json | 17 + .../vllm/01-ai_Yi-1.5-34B-Chat_cur_64.json | 17 + .../vllm/01-ai_Yi-1.5-34B-Chat_cur_8.json | 17 + ...34B-Chat_550_150_individual_responses.json | 882 +++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 
1234 +++++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1102 +++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1410 +++++++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1102 +++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1410 +++++++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + ...34B-Chat_550_150_individual_responses.json | 1146 ++++++++++++++ ...01-ai-Yi-1-5-34B-Chat_550_150_summary.json | 78 + benchmarks/yi_1_5_34b_chat_4xa10g.md | 103 ++ scripts/benchmark.py | 35 +- 47 files changed, 18098 insertions(+), 9 deletions(-) create mode 100644 benchmarks/results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv create mode 100644 benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_1.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_16.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_2.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_32.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_4.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_64.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_8.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 
benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv create mode 100644 benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_1.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_16.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_2.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_32.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_4.json create mode 100644 
benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_64.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_8.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json create mode 100644 
benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_summary.json create mode 100644 benchmarks/yi_1_5_34b_chat_4xa10g.md diff --git a/.gitignore b/.gitignore index 54047ad..6590954 100644 --- a/.gitignore +++ b/.gitignore @@ -240,8 +240,6 @@ dist/ # results output/ -*.json -result_outputs/ __pycache__ **/__pycache__/ \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv b/benchmarks/results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv new file mode 100644 index 0000000..a9e40dd --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv @@ -0,0 +1,16 @@ +,1,2,4,8,16,32,64 +concurrency,1.0,2.0,4.0,8.0,16.0,32.0,64.0 +mean_input_token_length,550.0,550.0,550.0,550.0,550.0,550.0,550.0 +mean_output_token_length,158.33734939759037,157.22,158.22,156.83653846153845,158.78571428571428,158.3452380952381,159.04761904761904 +time_to_first_token_in_ms_(ttft)_p50,589.2667170001005,644.3396354998185,2026.0217614995781,2842.750992999754,7700.104450000254,10071.607194499848,9436.927823000133 +time_to_first_token_in_ms_(ttft)_p75,591.8130939999173,1067.9737397492772,2167.6567652496033,3851.6287092497805,8107.615061500383,10712.515758000109,10294.223657249859 +time_to_first_token_in_ms_(ttft)_p95,635.9745054999618,1089.6510657003091,2224.4235754002148,4269.30308755027,8504.484299800288,11336.454061500126,11146.214323150343 +throughput_token_per_s_(token/sec)_p50,20.867749646233072,18.6632313122328,16.015117757491083,12.370819152432396,8.492894098096965,6.621992168964429,6.698087590201483 +throughput_token_per_s_(token/sec)_p75,20.88080751297862,18.816694929061082,16.238827688643383,12.6543303317669,8.698153944788167,6.767838652716483,6.908107654299118 +throughput_token_per_s_(token/sec)_p95,20.89667551037948,18.850257292145642,16.41324026560619,12.906849934852001,8.860771653750774,6.893210560071129,7.155502747864208 
+latency_ms_per_token_(inter_token_latency)_p50,45.290974628885394,50.47298943664333,58.695386015622304,75.76445488334618,111.29316081666768,141.89353758391655,141.45381929530785 +latency_ms_per_token_(inter_token_latency)_p75,45.79195392042707,51.184840881309285,59.91128747742228,78.08761739927169,113.83001986670706,146.0550590310391,144.78203158558753 +latency_ms_per_token_(inter_token_latency)_p95,47.012874357545535,52.761722370638076,61.40358282577323,80.49464227586313,117.23993523632585,151.81597529057638,148.8703841504284 +requests_per_minute_(qpm),8.211894857783436,14.57747226052002,24.663603308027646,36.74115036623516,47.693122662953904,42.71346832611164,27.129583372411076 +results_number_errors,0.0,0.0,0.0,0.0,0.0,44.0,86.0 +results_num_completed_requests,83.0,100.0,100.0,104.0,112.0,84.0,42.0 diff --git a/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_1.json b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_1.json new file mode 100644 index 0000000..ffe2b4a --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_1.json @@ -0,0 +1,17 @@ +{ + "concurrency": 1, + "mean_input_token_length": 550.0, + "mean_output_token_length": 158.33734939759037, + "time_to_first_token_in_ms_(ttft)_p50": 589.2667170001005, + "time_to_first_token_in_ms_(ttft)_p75": 591.8130939999173, + "time_to_first_token_in_ms_(ttft)_p95": 635.9745054999618, + "throughput_token_per_s_(token/sec)_p50": 20.867749646233072, + "throughput_token_per_s_(token/sec)_p75": 20.88080751297862, + "throughput_token_per_s_(token/sec)_p95": 20.89667551037948, + "latency_ms_per_token_(inter_token_latency)_p50": 45.290974628885394, + "latency_ms_per_token_(inter_token_latency)_p75": 45.79195392042707, + "latency_ms_per_token_(inter_token_latency)_p95": 47.012874357545535, + "requests_per_minute_(qpm)": 8.211894857783436, + "results_number_errors": 0, + "results_num_completed_requests": 83 +} \ No newline at end of file diff --git 
a/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_16.json b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_16.json new file mode 100644 index 0000000..0c15b41 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_16.json @@ -0,0 +1,17 @@ +{ + "concurrency": 16, + "mean_input_token_length": 550.0, + "mean_output_token_length": 158.78571428571428, + "time_to_first_token_in_ms_(ttft)_p50": 7700.104450000254, + "time_to_first_token_in_ms_(ttft)_p75": 8107.615061500383, + "time_to_first_token_in_ms_(ttft)_p95": 8504.484299800288, + "throughput_token_per_s_(token/sec)_p50": 8.492894098096965, + "throughput_token_per_s_(token/sec)_p75": 8.698153944788167, + "throughput_token_per_s_(token/sec)_p95": 8.860771653750774, + "latency_ms_per_token_(inter_token_latency)_p50": 111.29316081666768, + "latency_ms_per_token_(inter_token_latency)_p75": 113.83001986670706, + "latency_ms_per_token_(inter_token_latency)_p95": 117.23993523632585, + "requests_per_minute_(qpm)": 47.693122662953904, + "results_number_errors": 0, + "results_num_completed_requests": 112 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_2.json b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_2.json new file mode 100644 index 0000000..b3bb661 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_2.json @@ -0,0 +1,17 @@ +{ + "concurrency": 2, + "mean_input_token_length": 550.0, + "mean_output_token_length": 157.22, + "time_to_first_token_in_ms_(ttft)_p50": 644.3396354998185, + "time_to_first_token_in_ms_(ttft)_p75": 1067.9737397492772, + "time_to_first_token_in_ms_(ttft)_p95": 1089.6510657003091, + "throughput_token_per_s_(token/sec)_p50": 18.6632313122328, + "throughput_token_per_s_(token/sec)_p75": 18.816694929061082, + "throughput_token_per_s_(token/sec)_p95": 18.850257292145642, + "latency_ms_per_token_(inter_token_latency)_p50": 50.47298943664333, + 
"latency_ms_per_token_(inter_token_latency)_p75": 51.184840881309285, + "latency_ms_per_token_(inter_token_latency)_p95": 52.761722370638076, + "requests_per_minute_(qpm)": 14.57747226052002, + "results_number_errors": 0, + "results_num_completed_requests": 100 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_32.json b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_32.json new file mode 100644 index 0000000..00d7468 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_32.json @@ -0,0 +1,17 @@ +{ + "concurrency": 32, + "mean_input_token_length": 550.0, + "mean_output_token_length": 158.3452380952381, + "time_to_first_token_in_ms_(ttft)_p50": 10071.607194499848, + "time_to_first_token_in_ms_(ttft)_p75": 10712.515758000109, + "time_to_first_token_in_ms_(ttft)_p95": 11336.454061500126, + "throughput_token_per_s_(token/sec)_p50": 6.621992168964429, + "throughput_token_per_s_(token/sec)_p75": 6.767838652716483, + "throughput_token_per_s_(token/sec)_p95": 6.893210560071129, + "latency_ms_per_token_(inter_token_latency)_p50": 141.89353758391655, + "latency_ms_per_token_(inter_token_latency)_p75": 146.0550590310391, + "latency_ms_per_token_(inter_token_latency)_p95": 151.81597529057638, + "requests_per_minute_(qpm)": 42.71346832611164, + "results_number_errors": 44, + "results_num_completed_requests": 84 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_4.json b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_4.json new file mode 100644 index 0000000..256f094 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_4.json @@ -0,0 +1,17 @@ +{ + "concurrency": 4, + "mean_input_token_length": 550.0, + "mean_output_token_length": 158.22, + "time_to_first_token_in_ms_(ttft)_p50": 2026.0217614995781, + "time_to_first_token_in_ms_(ttft)_p75": 2167.6567652496033, + "time_to_first_token_in_ms_(ttft)_p95": 
2224.4235754002148, + "throughput_token_per_s_(token/sec)_p50": 16.015117757491083, + "throughput_token_per_s_(token/sec)_p75": 16.238827688643383, + "throughput_token_per_s_(token/sec)_p95": 16.41324026560619, + "latency_ms_per_token_(inter_token_latency)_p50": 58.695386015622304, + "latency_ms_per_token_(inter_token_latency)_p75": 59.91128747742228, + "latency_ms_per_token_(inter_token_latency)_p95": 61.40358282577323, + "requests_per_minute_(qpm)": 24.663603308027646, + "results_number_errors": 0, + "results_num_completed_requests": 100 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_64.json b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_64.json new file mode 100644 index 0000000..e070d7e --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_64.json @@ -0,0 +1,17 @@ +{ + "concurrency": 64, + "mean_input_token_length": 550.0, + "mean_output_token_length": 159.04761904761904, + "time_to_first_token_in_ms_(ttft)_p50": 9436.927823000133, + "time_to_first_token_in_ms_(ttft)_p75": 10294.223657249859, + "time_to_first_token_in_ms_(ttft)_p95": 11146.214323150343, + "throughput_token_per_s_(token/sec)_p50": 6.698087590201483, + "throughput_token_per_s_(token/sec)_p75": 6.908107654299118, + "throughput_token_per_s_(token/sec)_p95": 7.155502747864208, + "latency_ms_per_token_(inter_token_latency)_p50": 141.45381929530785, + "latency_ms_per_token_(inter_token_latency)_p75": 144.78203158558753, + "latency_ms_per_token_(inter_token_latency)_p95": 148.8703841504284, + "requests_per_minute_(qpm)": 27.129583372411076, + "results_number_errors": 86, + "results_num_completed_requests": 42 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_8.json b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_8.json new file mode 100644 index 0000000..3d7a77b --- /dev/null +++ 
b/benchmarks/results/yi/4xa10g/tgi/01-ai_Yi-1.5-34B-Chat_cur_8.json @@ -0,0 +1,17 @@ +{ + "concurrency": 8, + "mean_input_token_length": 550.0, + "mean_output_token_length": 156.83653846153845, + "time_to_first_token_in_ms_(ttft)_p50": 2842.750992999754, + "time_to_first_token_in_ms_(ttft)_p75": 3851.6287092497805, + "time_to_first_token_in_ms_(ttft)_p95": 4269.30308755027, + "throughput_token_per_s_(token/sec)_p50": 12.370819152432396, + "throughput_token_per_s_(token/sec)_p75": 12.6543303317669, + "throughput_token_per_s_(token/sec)_p95": 12.906849934852001, + "latency_ms_per_token_(inter_token_latency)_p50": 75.76445488334618, + "latency_ms_per_token_(inter_token_latency)_p75": 78.08761739927169, + "latency_ms_per_token_(inter_token_latency)_p95": 80.49464227586313, + "requests_per_minute_(qpm)": 36.74115036623516, + "results_number_errors": 0, + "results_num_completed_requests": 104 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..3c68b6a --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,915 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04529097462888539, + "ttft_s": 0.6050821279995944, + "end_to_end_latency_s": 7.201438346999566, + "request_output_throughput_token_per_s": 20.82917228090921, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044956166437503955, + "ttft_s": 0.5933951199995136, + "end_to_end_latency_s": 7.193185252999683, + "request_output_throughput_token_per_s": 20.85307061116595, + "number_total_tokens": 710, + 
"number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04575531782803369, + "ttft_s": 0.5855188590003308, + "end_to_end_latency_s": 7.183814074000111, + "request_output_throughput_token_per_s": 20.880273132747796, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044470938117346696, + "ttft_s": 0.6033534460002556, + "end_to_end_latency_s": 7.204464265999377, + "request_output_throughput_token_per_s": 20.820423901317323, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04552856763294136, + "ttft_s": 0.5904318220000278, + "end_to_end_latency_s": 7.1937204400001065, + "request_output_throughput_token_per_s": 20.851519217502116, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04366467350299706, + "ttft_s": 0.6029422399997202, + "end_to_end_latency_s": 7.2049909229999685, + "request_output_throughput_token_per_s": 20.818902008768106, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046394794748372194, + "ttft_s": 0.5916011859999344, + "end_to_end_latency_s": 7.191412569999557, + "request_output_throughput_token_per_s": 20.858210892496363, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044632695099404734, + "ttft_s": 0.5886632299998382, + "end_to_end_latency_s": 7.186052052000377, + "request_output_throughput_token_per_s": 20.87377031429164, + "number_total_tokens": 711, + "number_output_tokens": 
161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04404696135583313, + "ttft_s": 0.5846693290004623, + "end_to_end_latency_s": 7.179832224000165, + "request_output_throughput_token_per_s": 20.89185308517267, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04492448283752424, + "ttft_s": 0.5903442399994674, + "end_to_end_latency_s": 7.188084677999541, + "request_output_throughput_token_per_s": 20.86786768930292, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04363886110906716, + "ttft_s": 0.6031195730001855, + "end_to_end_latency_s": 7.2005831970000145, + "request_output_throughput_token_per_s": 20.83164597868887, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04522946171068696, + "ttft_s": 0.5912670919997254, + "end_to_end_latency_s": 7.191679092999948, + "request_output_throughput_token_per_s": 20.857437889018595, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047613692311257325, + "ttft_s": 0.5892953570000827, + "end_to_end_latency_s": 7.189855401000386, + "request_output_throughput_token_per_s": 20.86272833513859, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04603352442311986, + "ttft_s": 0.5879461699996682, + "end_to_end_latency_s": 7.181404302999908, + "request_output_throughput_token_per_s": 20.88727965606115, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 
+ }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04546011815817778, + "ttft_s": 0.5822074379993865, + "end_to_end_latency_s": 7.182888431999345, + "request_output_throughput_token_per_s": 20.88296392461824, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0458057902038247, + "ttft_s": 0.5901408480003738, + "end_to_end_latency_s": 7.191712724000354, + "request_output_throughput_token_per_s": 20.85734035223855, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04578646664972699, + "ttft_s": 0.5903974120001294, + "end_to_end_latency_s": 7.18867016600052, + "request_output_throughput_token_per_s": 20.86616808619748, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04547048822785402, + "ttft_s": 0.5881423230002838, + "end_to_end_latency_s": 7.184537513000578, + "request_output_throughput_token_per_s": 20.87817061690773, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04588801889804017, + "ttft_s": 0.6039758140004778, + "end_to_end_latency_s": 7.204587848000301, + "request_output_throughput_token_per_s": 20.820066763657252, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.043798682408517316, + "ttft_s": 0.5873079920002056, + "end_to_end_latency_s": 7.183191795000312, + "request_output_throughput_token_per_s": 20.882081988177443, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + 
"error_msg": "", + "inter_token_latency_s": 0.04437175537040511, + "ttft_s": 0.5893763680005577, + "end_to_end_latency_s": 7.188416332000088, + "request_output_throughput_token_per_s": 20.86690490258017, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04641448056124993, + "ttft_s": 0.5920604330003698, + "end_to_end_latency_s": 7.194462368999666, + "request_output_throughput_token_per_s": 20.849368904386434, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046094964576917474, + "ttft_s": 0.5900966580002205, + "end_to_end_latency_s": 7.191020535000462, + "request_output_throughput_token_per_s": 20.859348025765353, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04576143798724489, + "ttft_s": 0.5863534410000284, + "end_to_end_latency_s": 7.184729825999966, + "request_output_throughput_token_per_s": 20.877611772843956, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04489605198126014, + "ttft_s": 0.5881667539997579, + "end_to_end_latency_s": 7.183559821000017, + "request_output_throughput_token_per_s": 20.881012163565256, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04490257043749466, + "ttft_s": 0.5875129340001877, + "end_to_end_latency_s": 7.18459464300031, + "request_output_throughput_token_per_s": 20.878004599207216, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.04520158996854421, + "ttft_s": 0.5889338229999339, + "end_to_end_latency_s": 7.187215216999903, + "request_output_throughput_token_per_s": 20.87039214370614, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044925732437513945, + "ttft_s": 0.5868854560003456, + "end_to_end_latency_s": 7.188327431000289, + "request_output_throughput_token_per_s": 20.867162972169563, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.045196937836459804, + "ttft_s": 0.5889797130002989, + "end_to_end_latency_s": 7.186527789000138, + "request_output_throughput_token_per_s": 20.872388503053365, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047922298686668606, + "ttft_s": 0.5893888489999881, + "end_to_end_latency_s": 7.188530704000186, + "request_output_throughput_token_per_s": 20.866572902934088, + "number_total_tokens": 700, + "number_output_tokens": 150, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04495173293749986, + "ttft_s": 0.5917859080000198, + "end_to_end_latency_s": 7.192484684000192, + "request_output_throughput_token_per_s": 20.855101761103175, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04409370409204176, + "ttft_s": 0.58660157399936, + "end_to_end_latency_s": 7.187464889999319, + "request_output_throughput_token_per_s": 20.869667162995242, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.045504465930385964, + "ttft_s": 0.5897568929995032, + "end_to_end_latency_s": 7.18988187099967, + "request_output_throughput_token_per_s": 20.862651527700862, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.045270877006271346, + "ttft_s": 0.6389919379998901, + "end_to_end_latency_s": 7.243530851999822, + "request_output_throughput_token_per_s": 20.708132962336652, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046657293701309166, + "ttft_s": 0.584379834000174, + "end_to_end_latency_s": 7.185404554000343, + "request_output_throughput_token_per_s": 20.875651311308594, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04553725705661562, + "ttft_s": 0.6391354900006263, + "end_to_end_latency_s": 7.240695806000076, + "request_output_throughput_token_per_s": 20.716241093252528, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04549969827846176, + "ttft_s": 0.588321535000432, + "end_to_end_latency_s": 7.189145260999794, + "request_output_throughput_token_per_s": 20.864789144508052, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04434797279012024, + "ttft_s": 0.586374350999904, + "end_to_end_latency_s": 7.184536333000324, + "request_output_throughput_token_per_s": 20.878174045973363, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04607622896151625, + "ttft_s": 
0.5898388449995764, + "end_to_end_latency_s": 7.18812533900018, + "request_output_throughput_token_per_s": 20.867749646233072, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044643078962756856, + "ttft_s": 0.590296841000054, + "end_to_end_latency_s": 7.187709963999623, + "request_output_throughput_token_per_s": 20.86895558547719, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04586894859235984, + "ttft_s": 0.6037096110003404, + "end_to_end_latency_s": 7.201593310000135, + "request_output_throughput_token_per_s": 20.82872408133766, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04468959041614032, + "ttft_s": 0.592798672999379, + "end_to_end_latency_s": 7.195189258999562, + "request_output_throughput_token_per_s": 20.84726260846909, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04527638763752293, + "ttft_s": 0.641617831000076, + "end_to_end_latency_s": 7.244400403000327, + "request_output_throughput_token_per_s": 20.70564734907202, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04701574787585284, + "ttft_s": 0.5923701570000048, + "end_to_end_latency_s": 7.193616278000263, + "request_output_throughput_token_per_s": 20.851821142967353, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04520041505031213, + "ttft_s": 0.5856291610007247, + 
"end_to_end_latency_s": 7.187054665000687, + "request_output_throughput_token_per_s": 20.870858368514394, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04762657300003014, + "ttft_s": 0.5895769609996933, + "end_to_end_latency_s": 7.19181481600026, + "request_output_throughput_token_per_s": 20.8570442701447, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04609848083326795, + "ttft_s": 0.5899588459997176, + "end_to_end_latency_s": 7.191533662000438, + "request_output_throughput_token_per_s": 20.85785967916545, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.043811145115890814, + "ttft_s": 0.5852543159999186, + "end_to_end_latency_s": 7.185245182000472, + "request_output_throughput_token_per_s": 20.87611434273116, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046318397200006835, + "ttft_s": 0.5840115709997917, + "end_to_end_latency_s": 7.1795677609998165, + "request_output_throughput_token_per_s": 20.892622647120362, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04667981394150992, + "ttft_s": 0.5918402799998148, + "end_to_end_latency_s": 7.188893829000335, + "request_output_throughput_token_per_s": 20.865518891779004, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04514052888050099, + "ttft_s": 0.5835700150000775, + "end_to_end_latency_s": 
7.177546783999787, + "request_output_throughput_token_per_s": 20.898505368767577, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04570752717832747, + "ttft_s": 0.5841266809993613, + "end_to_end_latency_s": 7.176291277999553, + "request_output_throughput_token_per_s": 20.902161602589473, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04739048737400904, + "ttft_s": 0.5892667170001005, + "end_to_end_latency_s": 6.2535524499999156, + "request_output_throughput_token_per_s": 20.628275053486078, + "number_total_tokens": 681, + "number_output_tokens": 131, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04542991417719914, + "ttft_s": 0.5847814109993124, + "end_to_end_latency_s": 7.178170053000031, + "request_output_throughput_token_per_s": 20.89669078504337, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04544445256960553, + "ttft_s": 0.5880035820000558, + "end_to_end_latency_s": 7.180389660999936, + "request_output_throughput_token_per_s": 20.89023118267806, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.045509691553431236, + "ttft_s": 0.6419801560004998, + "end_to_end_latency_s": 7.236277009000332, + "request_output_throughput_token_per_s": 20.728891364085854, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04429483877780557, + "ttft_s": 0.582233298000574, + "end_to_end_latency_s": 7.175938444000167, + 
"request_output_throughput_token_per_s": 20.90318934179482, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044344149956773264, + "ttft_s": 0.5887142499996116, + "end_to_end_latency_s": 7.183971814999495, + "request_output_throughput_token_per_s": 20.87981465723645, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04405625203070279, + "ttft_s": 0.586557852999249, + "end_to_end_latency_s": 7.181355432999226, + "request_output_throughput_token_per_s": 20.88742179654989, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04633298356130258, + "ttft_s": 0.5875102970003354, + "end_to_end_latency_s": 7.181802813000104, + "request_output_throughput_token_per_s": 20.886120644871824, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04515047366673554, + "ttft_s": 0.5850997409997944, + "end_to_end_latency_s": 7.179132493999532, + "request_output_throughput_token_per_s": 20.893889355764518, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04406156879748739, + "ttft_s": 0.5844622430004165, + "end_to_end_latency_s": 7.182254145000115, + "request_output_throughput_token_per_s": 20.88480816352365, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04461809789438505, + "ttft_s": 0.5885619059999954, + "end_to_end_latency_s": 7.183700633000626, + 
"request_output_throughput_token_per_s": 20.880602862391985, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04525590971248335, + "ttft_s": 0.6420152070004406, + "end_to_end_latency_s": 7.241117634999682, + "request_output_throughput_token_per_s": 20.715034275231268, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04522818395599191, + "ttft_s": 0.5913990810004179, + "end_to_end_latency_s": 7.191454300999794, + "request_output_throughput_token_per_s": 20.858089855225277, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04698701269277984, + "ttft_s": 0.588849468999797, + "end_to_end_latency_s": 7.189230973000122, + "request_output_throughput_token_per_s": 20.864540388720304, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04530672269812047, + "ttft_s": 0.601648111999566, + "end_to_end_latency_s": 7.203949290999844, + "request_output_throughput_token_per_s": 20.821912251298112, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.045771533019120444, + "ttft_s": 0.5857024289998662, + "end_to_end_latency_s": 7.186365735999971, + "request_output_throughput_token_per_s": 20.8728591767293, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04489491948126556, + "ttft_s": 0.5825316479995308, + "end_to_end_latency_s": 7.183385878999616, + 
"request_output_throughput_token_per_s": 20.881517786552426, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04553652104425653, + "ttft_s": 0.5933170660000542, + "end_to_end_latency_s": 7.194950966999386, + "request_output_throughput_token_per_s": 20.847953055968727, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04486628798121615, + "ttft_s": 0.5805830939998486, + "end_to_end_latency_s": 7.178777029999765, + "request_output_throughput_token_per_s": 20.89492393664787, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04549971075946906, + "ttft_s": 0.5912796299999172, + "end_to_end_latency_s": 7.189121112000066, + "request_output_throughput_token_per_s": 20.86485923148802, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04352601890305144, + "ttft_s": 0.5851026609998371, + "end_to_end_latency_s": 7.181962431000102, + "request_output_throughput_token_per_s": 20.88565645408315, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046021630173060823, + "ttft_s": 0.5821544339996763, + "end_to_end_latency_s": 7.179564749999372, + "request_output_throughput_token_per_s": 20.892631409169077, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04559084039877384, + "ttft_s": 0.6088176130006104, + "end_to_end_latency_s": 7.2035214560000895, + 
"request_output_throughput_token_per_s": 20.823148916292773, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04518158318238283, + "ttft_s": 0.5837422939994212, + "end_to_end_latency_s": 7.184067067999422, + "request_output_throughput_token_per_s": 20.879537813358855, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04457967286957372, + "ttft_s": 0.5802961800000048, + "end_to_end_latency_s": 7.17749845399976, + "request_output_throughput_token_per_s": 20.898646089768285, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04549291131643028, + "ttft_s": 0.5888415590006844, + "end_to_end_latency_s": 7.1880728989999625, + "request_output_throughput_token_per_s": 20.86790188520051, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.045267242553432285, + "ttft_s": 0.6005570890001763, + "end_to_end_latency_s": 7.197662741000386, + "request_output_throughput_token_per_s": 20.840098431612798, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044901611743785, + "ttft_s": 0.5911427279997952, + "end_to_end_latency_s": 7.1845061229996645, + "request_output_throughput_token_per_s": 20.878261836232134, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0449582074687271, + "ttft_s": 0.6013069170003291, + "end_to_end_latency_s": 7.193506417000208, + 
"request_output_throughput_token_per_s": 20.852139597110707, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04571990254775718, + "ttft_s": 0.5831807269996716, + "end_to_end_latency_s": 7.178222522999931, + "request_output_throughput_token_per_s": 20.8965380384045, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04579744119112715, + "ttft_s": 0.5916742439994778, + "end_to_end_latency_s": 7.19039588799933, + "request_output_throughput_token_per_s": 20.861160127545677, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..2a2ede9 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 1, + "results_inter_token_latency_s_quantiles_p25": 0.044895485731262855, + "results_inter_token_latency_s_quantiles_p50": 0.04529097462888539, + "results_inter_token_latency_s_quantiles_p75": 0.04579195392042707, + "results_inter_token_latency_s_quantiles_p90": 0.046410543398674384, + "results_inter_token_latency_s_quantiles_p95": 0.04701287435754554, + "results_inter_token_latency_s_quantiles_p99": 0.04767980362362506, + "results_inter_token_latency_s_mean": 
0.045362784766552006, + "results_inter_token_latency_s_min": 0.04352601890305144, + "results_inter_token_latency_s_max": 0.047922298686668606, + "results_inter_token_latency_s_stddev": 0.0009171923085976901, + "results_ttft_s_quantiles_p25": 0.5860279349999473, + "results_ttft_s_quantiles_p50": 0.5892667170001005, + "results_ttft_s_quantiles_p75": 0.5918130939999173, + "results_ttft_s_quantiles_p90": 0.6036383780003234, + "results_ttft_s_quantiles_p95": 0.6359745054999618, + "results_ttft_s_quantiles_p99": 0.6419864651804892, + "results_ttft_s_mean": 0.5928773132048055, + "results_ttft_s_min": 0.5802961800000048, + "results_ttft_s_max": 0.6420152070004406, + "results_ttft_s_stddev": 0.013565724721168154, + "results_end_to_end_latency_s_quantiles_p25": 7.183472849999816, + "results_end_to_end_latency_s_quantiles_p50": 7.188084677999541, + "results_end_to_end_latency_s_quantiles_p75": 7.1928349684999375, + "results_end_to_end_latency_s_quantiles_p90": 7.203863723999893, + "results_end_to_end_latency_s_quantiles_p95": 7.233148400400296, + "results_end_to_end_latency_s_quantiles_p99": 7.243687371179913, + "results_end_to_end_latency_s_mean": 7.179989100855399, + "results_end_to_end_latency_s_min": 6.2535524499999156, + "results_end_to_end_latency_s_max": 7.244400403000327, + "results_end_to_end_latency_s_stddev": 0.10394402242098062, + "results_request_output_throughput_token_per_s_quantiles_p25": 20.852605104138327, + "results_request_output_throughput_token_per_s_quantiles_p50": 20.867749646233072, + "results_request_output_throughput_token_per_s_quantiles_p75": 20.88080751297862, + "results_request_output_throughput_token_per_s_quantiles_p90": 20.892629656759333, + "results_request_output_throughput_token_per_s_quantiles_p95": 20.89667551037948, + "results_request_output_throughput_token_per_s_quantiles_p99": 20.902346595646435, + "results_request_output_throughput_token_per_s_mean": 20.855883300869912, + "results_request_output_throughput_token_per_s_min": 
20.628275053486078, + "results_request_output_throughput_token_per_s_max": 20.90318934179482, + "results_request_output_throughput_token_per_s_stddev": 0.0488639633828661, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 157.0, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 160.0, + "results_number_output_tokens_quantiles_p90": 162.8, + "results_number_output_tokens_quantiles_p95": 163.89999999999998, + "results_number_output_tokens_quantiles_p99": 165.0, + "results_number_output_tokens_mean": 158.33734939759037, + "results_number_output_tokens_min": "131", + "results_number_output_tokens_max": "165", + "results_number_output_tokens_stddev": 4.33161128256701, + "results_num_requests_started": 83, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 21.67082775521886, + "results_num_completed_requests": 83, + "results_num_completed_requests_per_min": 8.211894857783436, + "timestamp": 1718205712 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..e8ce560 --- /dev/null +++ 
b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1234 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11919542909307815, + "ttft_s": 2.396424467000543, + "end_to_end_latency_s": 15.441014682000059, + "request_output_throughput_token_per_s": 7.641984832613058, + "number_total_tokens": 679, + "number_output_tokens": 129, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11636315226414294, + "ttft_s": 0.6162726860002294, + "end_to_end_latency_s": 18.50193818200023, + "request_output_throughput_token_per_s": 8.107258738218507, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11495983981411975, + "ttft_s": 0.590558097000212, + "end_to_end_latency_s": 17.934095435000017, + "request_output_throughput_token_per_s": 8.363956829808174, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10327839804148096, + "ttft_s": 2.389825152999947, + "end_to_end_latency_s": 17.454306732999612, + "request_output_throughput_token_per_s": 8.593867536222778, + "number_total_tokens": 719, + "number_output_tokens": 169, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10805693687799559, + "ttft_s": 2.6576875259997905, + "end_to_end_latency_s": 17.721612309000193, + "request_output_throughput_token_per_s": 8.464241141525266, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10442106933532525, + "ttft_s": 2.373140020000392, + "end_to_end_latency_s": 17.43856240199966, + "request_output_throughput_token_per_s": 
8.601626472535354, + "number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10829425168131479, + "ttft_s": 7.9330229879997205, + "end_to_end_latency_s": 17.32729535999988, + "request_output_throughput_token_per_s": 8.656861725014252, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10881486292357155, + "ttft_s": 7.688944898000045, + "end_to_end_latency_s": 17.084204533000047, + "request_output_throughput_token_per_s": 8.780040048704537, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10475545159641181, + "ttft_s": 7.9949775100003535, + "end_to_end_latency_s": 17.38964717700037, + "request_output_throughput_token_per_s": 8.625821931476029, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11064921673550411, + "ttft_s": 7.755346916000235, + "end_to_end_latency_s": 17.150911364999956, + "request_output_throughput_token_per_s": 8.745890921348154, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10606621201246266, + "ttft_s": 7.681273150999914, + "end_to_end_latency_s": 17.076924140999836, + "request_output_throughput_token_per_s": 8.783783236459213, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10405399106703155, + "ttft_s": 7.668458016000841, + "end_to_end_latency_s": 17.065082118000646, + "request_output_throughput_token_per_s": 8.78987859318746, + 
"number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10892238717838879, + "ttft_s": 7.706650124000589, + "end_to_end_latency_s": 17.101924629000678, + "request_output_throughput_token_per_s": 8.77094264265653, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10992133496177645, + "ttft_s": 7.861753737000072, + "end_to_end_latency_s": 17.25790401300037, + "request_output_throughput_token_per_s": 8.691669619149874, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10668304153416373, + "ttft_s": 7.782775407000372, + "end_to_end_latency_s": 17.176772736000203, + "request_output_throughput_token_per_s": 8.732723096791064, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10755252725625723, + "ttft_s": 7.814273900000444, + "end_to_end_latency_s": 17.216175760000624, + "request_output_throughput_token_per_s": 8.712736329545614, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11243443978127061, + "ttft_s": 2.3445144039997103, + "end_to_end_latency_s": 17.98970481499964, + "request_output_throughput_token_per_s": 8.338102350347153, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11293913296915406, + "ttft_s": 2.6507368180000412, + "end_to_end_latency_s": 18.296780510000644, + "request_output_throughput_token_per_s": 8.198163601405891, + "number_total_tokens": 712, + 
"number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11572424622641252, + "ttft_s": 2.7549173790002897, + "end_to_end_latency_s": 18.400342094000735, + "request_output_throughput_token_per_s": 8.152022350111967, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11725803577218921, + "ttft_s": 0.5869326620004358, + "end_to_end_latency_s": 18.526977233000252, + "request_output_throughput_token_per_s": 8.096301847492963, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11525842094905418, + "ttft_s": 2.447313087999646, + "end_to_end_latency_s": 18.0958969229996, + "request_output_throughput_token_per_s": 8.289171884558668, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10449564233934903, + "ttft_s": 7.846476800999881, + "end_to_end_latency_s": 17.242012450000402, + "request_output_throughput_token_per_s": 8.699680529461427, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11194296396774477, + "ttft_s": 7.954953037999985, + "end_to_end_latency_s": 17.351349416999255, + "request_output_throughput_token_per_s": 8.644860776824874, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10437475452172915, + "ttft_s": 7.4079221769998185, + "end_to_end_latency_s": 16.80465654099953, + "request_output_throughput_token_per_s": 8.926097337010976, + "number_total_tokens": 711, + "number_output_tokens": 161, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10902726354038773, + "ttft_s": 8.160466784000164, + "end_to_end_latency_s": 17.553585472000123, + "request_output_throughput_token_per_s": 8.545262746421027, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10495980931485929, + "ttft_s": 7.610366932999568, + "end_to_end_latency_s": 17.00372547400002, + "request_output_throughput_token_per_s": 8.821596198395541, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1030637795793099, + "ttft_s": 7.509150679999948, + "end_to_end_latency_s": 16.902638823000416, + "request_output_throughput_token_per_s": 8.874353973409535, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10970668508077172, + "ttft_s": 8.269208274999983, + "end_to_end_latency_s": 17.663159732999702, + "request_output_throughput_token_per_s": 8.492251797947466, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10977375025163526, + "ttft_s": 8.05987943800028, + "end_to_end_latency_s": 17.45421762099977, + "request_output_throughput_token_per_s": 8.593911411963251, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11311345258594316, + "ttft_s": 8.366569447999609, + "end_to_end_latency_s": 17.759441281999898, + "request_output_throughput_token_per_s": 8.446211658248092, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + 
"error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10845639731646367, + "ttft_s": 7.74406645199997, + "end_to_end_latency_s": 17.138919811999585, + "request_output_throughput_token_per_s": 8.752010140976301, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11580247413549519, + "ttft_s": 8.554322638000485, + "end_to_end_latency_s": 17.94957106200036, + "request_output_throughput_token_per_s": 8.356745656031487, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11406914461489612, + "ttft_s": 2.6532072789996164, + "end_to_end_latency_s": 18.365347345999908, + "request_output_throughput_token_per_s": 8.167555841663455, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11515018682797988, + "ttft_s": 2.3656862640000327, + "end_to_end_latency_s": 18.078788603999783, + "request_output_throughput_token_per_s": 8.297016093590129, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11150354960735315, + "ttft_s": 2.461485109000023, + "end_to_end_latency_s": 18.175263987000108, + "request_output_throughput_token_per_s": 8.25297503834265, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11782658522929089, + "ttft_s": 2.7870092490002207, + "end_to_end_latency_s": 18.49950198100032, + "request_output_throughput_token_per_s": 8.108326383815932, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", 
+ "inter_token_latency_s": 0.1163341329249988, + "ttft_s": 0.5936424560004525, + "end_to_end_latency_s": 18.613689069999964, + "request_output_throughput_token_per_s": 8.058585239922044, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10877850605622826, + "ttft_s": 8.003673131000141, + "end_to_end_latency_s": 17.40479657100059, + "request_output_throughput_token_per_s": 8.618313887674276, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10869437477161643, + "ttft_s": 8.208363665999968, + "end_to_end_latency_s": 17.610010962999695, + "request_output_throughput_token_per_s": 8.517882261127733, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11137400716981014, + "ttft_s": 8.306384639000498, + "end_to_end_latency_s": 17.70869839400075, + "request_output_throughput_token_per_s": 8.470413616102702, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11712481508436814, + "ttft_s": 8.636832213000162, + "end_to_end_latency_s": 18.037509707000027, + "request_output_throughput_token_per_s": 8.3160038406958, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10745466972051342, + "ttft_s": 7.899326746999577, + "end_to_end_latency_s": 17.30039289599972, + "request_output_throughput_token_per_s": 8.670323321656106, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.11334885378480493, + "ttft_s": 8.508945418000621, + "end_to_end_latency_s": 17.909326618000705, + "request_output_throughput_token_per_s": 8.375524284047321, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1055968005092842, + "ttft_s": 7.5967872300007, + "end_to_end_latency_s": 17.00136204500086, + "request_output_throughput_token_per_s": 8.822822524628638, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1127164819367057, + "ttft_s": 8.405222782000237, + "end_to_end_latency_s": 17.81055601600019, + "request_output_throughput_token_per_s": 8.42197177141729, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10818293488682684, + "ttft_s": 7.795746183000119, + "end_to_end_latency_s": 17.201261878999503, + "request_output_throughput_token_per_s": 8.720290467941217, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10887962426750164, + "ttft_s": 7.691546871000355, + "end_to_end_latency_s": 17.09435450300043, + "request_output_throughput_token_per_s": 8.774826798734736, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11222740316669352, + "ttft_s": 8.103773370000454, + "end_to_end_latency_s": 17.507679844999984, + "request_output_throughput_token_per_s": 8.567668664722497, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11360740126248743, + "ttft_s": 
3.1180897600006574, + "end_to_end_latency_s": 18.177489165000225, + "request_output_throughput_token_per_s": 8.251964759182304, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11662353914194212, + "ttft_s": 3.0171530799998436, + "end_to_end_latency_s": 18.076879209000253, + "request_output_throughput_token_per_s": 8.297892477221227, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11655750098086584, + "ttft_s": 3.2390479659998164, + "end_to_end_latency_s": 18.299754808000216, + "request_output_throughput_token_per_s": 8.196831136470943, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11567760891251737, + "ttft_s": 0.5882566580003186, + "end_to_end_latency_s": 18.508611467000264, + "request_output_throughput_token_per_s": 8.104335663830911, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1130497453396633, + "ttft_s": 2.913195232000362, + "end_to_end_latency_s": 17.975121888999638, + "request_output_throughput_token_per_s": 8.344866918081738, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1172251257069831, + "ttft_s": 3.3419029409997165, + "end_to_end_latency_s": 18.404609717999847, + "request_output_throughput_token_per_s": 8.150132075514694, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1093705956707642, + "ttft_s": 8.538149341000462, + 
"end_to_end_latency_s": 17.937090804000036, + "request_output_throughput_token_per_s": 8.362560107380927, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10750152240249901, + "ttft_s": 7.693558775999918, + "end_to_end_latency_s": 17.09333259899995, + "request_output_throughput_token_per_s": 8.775351391031599, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11231521297422953, + "ttft_s": 8.007961804999468, + "end_to_end_latency_s": 17.409057094999298, + "request_output_throughput_token_per_s": 8.61620472501564, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10951660967512693, + "ttft_s": 7.795434169000146, + "end_to_end_latency_s": 17.19430869000007, + "request_output_throughput_token_per_s": 8.723816857332423, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10681647342587691, + "ttft_s": 7.902366274999622, + "end_to_end_latency_s": 17.304463277999275, + "request_output_throughput_token_per_s": 8.668283875103398, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11435895014741011, + "ttft_s": 8.440713355999833, + "end_to_end_latency_s": 17.842186859999856, + "request_output_throughput_token_per_s": 8.40704119831201, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10486939824693449, + "ttft_s": 7.586480708999261, + "end_to_end_latency_s": 
16.98911343799955, + "request_output_throughput_token_per_s": 8.829183497268021, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10815968392636899, + "ttft_s": 8.230350747000557, + "end_to_end_latency_s": 17.630270192000353, + "request_output_throughput_token_per_s": 8.508094224673979, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1094964673562572, + "ttft_s": 8.11914013600017, + "end_to_end_latency_s": 17.519611007000094, + "request_output_throughput_token_per_s": 8.561833932275457, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11370259581412821, + "ttft_s": 8.33780495999963, + "end_to_end_latency_s": 17.738349102999564, + "request_output_throughput_token_per_s": 8.456254814301458, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.12435010639999466, + "ttft_s": 8.16429888300081, + "end_to_end_latency_s": 16.22955647100025, + "request_output_throughput_token_per_s": 7.948461205979559, + "number_total_tokens": 680, + "number_output_tokens": 130, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11275720366671213, + "ttft_s": 2.867219873999602, + "end_to_end_latency_s": 17.928589694999573, + "request_output_throughput_token_per_s": 8.366525340352688, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11203438703084127, + "ttft_s": 3.087151415000335, + "end_to_end_latency_s": 18.15076606399998, + 
"request_output_throughput_token_per_s": 8.264114003293132, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11407345942498068, + "ttft_s": 3.1897631869996985, + "end_to_end_latency_s": 18.252211021999756, + "request_output_throughput_token_per_s": 8.218182433854286, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1178593170445635, + "ttft_s": 0.6000349089999872, + "end_to_end_latency_s": 18.50413264100007, + "request_output_throughput_token_per_s": 8.106297274785053, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1164139363544207, + "ttft_s": 3.328335217000131, + "end_to_end_latency_s": 18.39383933099998, + "request_output_throughput_token_per_s": 8.154904329690329, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11344244755970444, + "ttft_s": 2.971286164999583, + "end_to_end_latency_s": 18.037583457999972, + "request_output_throughput_token_per_s": 8.315969838713205, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11254057423272724, + "ttft_s": 8.500834294000015, + "end_to_end_latency_s": 17.894152073999976, + "request_output_throughput_token_per_s": 8.38262687048181, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10780969608858752, + "ttft_s": 7.641282597999634, + "end_to_end_latency_s": 17.034170493999227, + "request_output_throughput_token_per_s": 
8.805829438706263, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11177391533541181, + "ttft_s": 8.266956835000201, + "end_to_end_latency_s": 17.66048827800023, + "request_output_throughput_token_per_s": 8.493536398246466, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11321808810828822, + "ttft_s": 8.381051803000446, + "end_to_end_latency_s": 17.77543027699994, + "request_output_throughput_token_per_s": 8.43861429301594, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10586540019016447, + "ttft_s": 7.862138031000541, + "end_to_end_latency_s": 17.256582426000023, + "request_output_throughput_token_per_s": 8.692335266454561, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10857243658229211, + "ttft_s": 7.760431210999741, + "end_to_end_latency_s": 17.154632333000336, + "request_output_throughput_token_per_s": 8.74399387222338, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10651599100003535, + "ttft_s": 7.541372431999662, + "end_to_end_latency_s": 16.936596536999787, + "request_output_throughput_token_per_s": 8.856560978606836, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11127102343590568, + "ttft_s": 7.965315489999739, + "end_to_end_latency_s": 17.35849487899941, + "request_output_throughput_token_per_s": 8.641302200772744, + 
"number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10980455208806783, + "ttft_s": 8.066652144999352, + "end_to_end_latency_s": 17.459253645999524, + "request_output_throughput_token_per_s": 8.5914325458219, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11668633951592908, + "ttft_s": 2.66465735499969, + "end_to_end_latency_s": 18.319955076000042, + "request_output_throughput_token_per_s": 8.18779300373431, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11513259322980969, + "ttft_s": 0.5841597150001689, + "end_to_end_latency_s": 18.53653346400006, + "request_output_throughput_token_per_s": 8.092127920860507, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11266771893748455, + "ttft_s": 2.370442734999415, + "end_to_end_latency_s": 18.027039162999245, + "request_output_throughput_token_per_s": 8.320833978542474, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11375031161731071, + "ttft_s": 2.7715171109994117, + "end_to_end_latency_s": 18.427767513999243, + "request_output_throughput_token_per_s": 8.139889972350026, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11531922217723668, + "ttft_s": 2.562786254000457, + "end_to_end_latency_s": 18.22062305700001, + "request_output_throughput_token_per_s": 8.232429787431055, + "number_total_tokens": 708, + 
"number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11184984620261079, + "ttft_s": 8.273303586000111, + "end_to_end_latency_s": 17.672501031000138, + "request_output_throughput_token_per_s": 8.487762979154914, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11106968653420046, + "ttft_s": 8.482236087000274, + "end_to_end_latency_s": 17.882426265000504, + "request_output_throughput_token_per_s": 8.38812350053304, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10581088857142282, + "ttft_s": 7.636181283000042, + "end_to_end_latency_s": 17.035971356999653, + "request_output_throughput_token_per_s": 8.80489857940321, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10717524579266555, + "ttft_s": 8.176081044000057, + "end_to_end_latency_s": 17.576941880999584, + "request_output_throughput_token_per_s": 8.533907719302855, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10707920552532195, + "ttft_s": 7.517959412999517, + "end_to_end_latency_s": 16.918721717999688, + "request_output_throughput_token_per_s": 8.865918034482252, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10984580756688563, + "ttft_s": 7.846246697999959, + "end_to_end_latency_s": 17.2460474720001, + "request_output_throughput_token_per_s": 8.697645083230414, + "number_total_tokens": 707, + "number_output_tokens": 157, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11131529819742969, + "ttft_s": 8.075154183999985, + "end_to_end_latency_s": 17.476687899999888, + "request_output_throughput_token_per_s": 8.582861973520792, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10427838490679323, + "ttft_s": 7.387866451000264, + "end_to_end_latency_s": 16.789025201000186, + "request_output_throughput_token_per_s": 8.934407936385963, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1083978132987906, + "ttft_s": 8.375397241000428, + "end_to_end_latency_s": 17.777448812999864, + "request_output_throughput_token_per_s": 8.437656132656763, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1097969781202365, + "ttft_s": 7.945598748000521, + "end_to_end_latency_s": 17.348110605999864, + "request_output_throughput_token_per_s": 8.646474731843266, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11172824278265518, + "ttft_s": 8.589835841999957, + "end_to_end_latency_s": 17.988452979999238, + "request_output_throughput_token_per_s": 8.338682607491595, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11374584390058043, + "ttft_s": 2.662923331999991, + "end_to_end_latency_s": 18.313301311000032, + "request_output_throughput_token_per_s": 8.190767871541615, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + 
"error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.12112302550984501, + "ttft_s": 0.5933500230003119, + "end_to_end_latency_s": 18.532955648000097, + "request_output_throughput_token_per_s": 8.093690118779655, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11373681490124875, + "ttft_s": 2.7745443399999203, + "end_to_end_latency_s": 18.42556969599991, + "request_output_throughput_token_per_s": 8.140860905514588, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11194412143827827, + "ttft_s": 2.4840160569992804, + "end_to_end_latency_s": 18.13616119800008, + "request_output_throughput_token_per_s": 8.27076901017735, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11199647985096071, + "ttft_s": 2.379245617000379, + "end_to_end_latency_s": 18.031658191000133, + "request_output_throughput_token_per_s": 8.318702495972733, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10781567138992443, + "ttft_s": 7.746560333999696, + "end_to_end_latency_s": 17.142986754999583, + "request_output_throughput_token_per_s": 8.74993384430248, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11451583452569536, + "ttft_s": 8.467896742999983, + "end_to_end_latency_s": 17.86467392799932, + "request_output_throughput_token_per_s": 8.396458877701924, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", 
+ "inter_token_latency_s": 0.10751966880747295, + "ttft_s": 7.913991763999547, + "end_to_end_latency_s": 17.310889409999618, + "request_output_throughput_token_per_s": 8.665066042958639, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10962742026874821, + "ttft_s": 8.143513068000175, + "end_to_end_latency_s": 17.540685768000003, + "request_output_throughput_token_per_s": 8.551547070847679, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10964964934585926, + "ttft_s": 8.036836383999798, + "end_to_end_latency_s": 17.434515389999433, + "request_output_throughput_token_per_s": 8.60362313747138, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11681021084865206, + "ttft_s": 8.35708621699996, + "end_to_end_latency_s": 17.755360911000025, + "request_output_throughput_token_per_s": 8.448152687624058, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10656206017722669, + "ttft_s": 7.439353718000348, + "end_to_end_latency_s": 16.837028715000088, + "request_output_throughput_token_per_s": 8.908935331705242, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10652243929935776, + "ttft_s": 7.326044310000725, + "end_to_end_latency_s": 16.724487956000303, + "request_output_throughput_token_per_s": 8.968884452225275, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.10958667018634774, + "ttft_s": 8.24743027600016, + "end_to_end_latency_s": 17.643695234000006, + "request_output_throughput_token_per_s": 8.501620437817632, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10848261873247127, + "ttft_s": 7.633044952999626, + "end_to_end_latency_s": 17.0320195459999, + "request_output_throughput_token_per_s": 8.806941513593356, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11680791142860518, + "ttft_s": 8.588193550999677, + "end_to_end_latency_s": 17.98864796199996, + "request_output_throughput_token_per_s": 8.33859222309908, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..85e9b95 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 16, + "results_inter_token_latency_s_quantiles_p25": 0.10813399716427563, + "results_inter_token_latency_s_quantiles_p50": 0.11129316081666768, + "results_inter_token_latency_s_quantiles_p75": 0.11383001986670706, + "results_inter_token_latency_s_quantiles_p90": 0.11661693532583449, + "results_inter_token_latency_s_quantiles_p95": 0.11723993523632585, + 
"results_inter_token_latency_s_quantiles_p99": 0.12091098990400065, + "results_inter_token_latency_s_mean": 0.1111168072355221, + "results_inter_token_latency_s_min": 0.1030637795793099, + "results_inter_token_latency_s_max": 0.12435010639999466, + "results_inter_token_latency_s_stddev": 0.004198136183385193, + "results_ttft_s_quantiles_p25": 2.8471672177497567, + "results_ttft_s_quantiles_p50": 7.700104450000254, + "results_ttft_s_quantiles_p75": 8.107615061500383, + "results_ttft_s_quantiles_p90": 8.380486346800444, + "results_ttft_s_quantiles_p95": 8.504484299800287, + "results_ttft_s_quantiles_p99": 8.589655189989926, + "results_ttft_s_mean": 6.056374991535766, + "results_ttft_s_min": 0.5841597150001689, + "results_ttft_s_max": 8.636832213000162, + "results_ttft_s_stddev": 2.781611024805391, + "results_end_to_end_latency_s_quantiles_p25": 17.212447289750344, + "results_end_to_end_latency_s_quantiles_p50": 17.63698271300018, + "results_end_to_end_latency_s_quantiles_p75": 18.037528144750013, + "results_end_to_end_latency_s_quantiles_p90": 18.39969181770066, + "results_end_to_end_latency_s_quantiles_p95": 18.502925688550157, + "results_end_to_end_latency_s_quantiles_p99": 18.536139904240063, + "results_end_to_end_latency_s_mean": 17.63281706541962, + "results_end_to_end_latency_s_min": 15.441014682000059, + "results_end_to_end_latency_s_max": 18.613689069999964, + "results_end_to_end_latency_s_stddev": 0.559787183114953, + "results_request_output_throughput_token_per_s_quantiles_p25": 8.297673381313452, + "results_request_output_throughput_token_per_s_quantiles_p50": 8.492894098096965, + "results_request_output_throughput_token_per_s_quantiles_p75": 8.698153944788167, + "results_request_output_throughput_token_per_s_quantiles_p90": 8.805736352775957, + "results_request_output_throughput_token_per_s_quantiles_p95": 8.860771653750774, + "results_request_output_throughput_token_per_s_quantiles_p99": 8.933493770454714, + 
"results_request_output_throughput_token_per_s_mean": 8.485462695519606, + "results_request_output_throughput_token_per_s_min": 7.641984832613058, + "results_request_output_throughput_token_per_s_max": 8.968884452225275, + "results_request_output_throughput_token_per_s_stddev": 0.25889620592621015, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 157.0, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 161.0, + "results_number_output_tokens_quantiles_p90": 163.0, + "results_number_output_tokens_quantiles_p95": 164.0, + "results_number_output_tokens_quantiles_p99": 166.89, + "results_number_output_tokens_mean": 158.78571428571428, + "results_number_output_tokens_min": "129", + "results_number_output_tokens_max": "169", + "results_number_output_tokens_stddev": 4.9290983248470805, + "results_num_requests_started": 112, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 126.21644247588873, + "results_num_completed_requests": 112, + "results_num_completed_requests_per_min": 47.693122662953904, + "timestamp": 1718206755 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 
0000000..848f408 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1102 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05121791417197157, + "ttft_s": 1.137063219999618, + "end_to_end_latency_s": 8.04145909700037, + "request_output_throughput_token_per_s": 18.653331216464572, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04958358603682896, + "ttft_s": 0.6027865260002727, + "end_to_end_latency_s": 8.08234403699953, + "request_output_throughput_token_per_s": 18.558972411138, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05147024782162456, + "ttft_s": 0.5891731729998355, + "end_to_end_latency_s": 8.081037990000368, + "request_output_throughput_token_per_s": 18.561971888464438, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05050459532279242, + "ttft_s": 1.0755960080005025, + "end_to_end_latency_s": 7.97991451300004, + "request_output_throughput_token_per_s": 18.797193849086444, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05207968552288257, + "ttft_s": 1.067046778999611, + "end_to_end_latency_s": 7.9683661459994255, + "request_output_throughput_token_per_s": 18.82443618323294, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052775974254889224, + "ttft_s": 0.5838157250000222, + "end_to_end_latency_s": 8.074912751999364, + 
"request_output_throughput_token_per_s": 18.576052101969736, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05143340019743278, + "ttft_s": 0.5896099980000145, + "end_to_end_latency_s": 8.075290146000043, + "request_output_throughput_token_per_s": 18.575183960950298, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05015958913205144, + "ttft_s": 1.0736866430006557, + "end_to_end_latency_s": 7.975560096000663, + "request_output_throughput_token_per_s": 18.807456554081682, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051178510993637086, + "ttft_s": 0.596353433999866, + "end_to_end_latency_s": 8.08638580799925, + "request_output_throughput_token_per_s": 18.549696188328827, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051499930999999645, + "ttft_s": 1.0831547939997108, + "end_to_end_latency_s": 7.982710598000267, + "request_output_throughput_token_per_s": 18.79060980083334, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051164161301270585, + "ttft_s": 1.080353047999779, + "end_to_end_latency_s": 7.981837815999825, + "request_output_throughput_token_per_s": 18.79266447876461, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049034154084890334, + "ttft_s": 0.603156972000761, + "end_to_end_latency_s": 8.091494604000218, + 
"request_output_throughput_token_per_s": 18.537984308343237, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05005441855973913, + "ttft_s": 1.058818784000323, + "end_to_end_latency_s": 7.958840033999877, + "request_output_throughput_token_per_s": 18.846967568038234, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05009710076399258, + "ttft_s": 0.5855500770003346, + "end_to_end_latency_s": 8.065840056000525, + "request_output_throughput_token_per_s": 18.59694699604272, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04983883733948347, + "ttft_s": 0.5938378319997355, + "end_to_end_latency_s": 8.074119771999904, + "request_output_throughput_token_per_s": 18.577876503663266, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.048590694164625726, + "ttft_s": 1.068509838000864, + "end_to_end_latency_s": 7.969074875000842, + "request_output_throughput_token_per_s": 18.822762033589772, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051053814967953935, + "ttft_s": 1.0638227580002422, + "end_to_end_latency_s": 7.964582017999419, + "request_output_throughput_token_per_s": 18.83338003940572, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05040645978748444, + "ttft_s": 0.5817225379996671, + "end_to_end_latency_s": 8.065227308999965, + 
"request_output_throughput_token_per_s": 18.598359879158696, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05278694158442267, + "ttft_s": 0.6424677220002195, + "end_to_end_latency_s": 8.129387275999761, + "request_output_throughput_token_per_s": 18.451575119670114, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04924460202453512, + "ttft_s": 1.1275695490003272, + "end_to_end_latency_s": 8.02710435300014, + "request_output_throughput_token_per_s": 18.686688674221273, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051409922519246136, + "ttft_s": 0.5858123199996044, + "end_to_end_latency_s": 8.020166194999547, + "request_output_throughput_token_per_s": 18.702854324081557, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04757931528023225, + "ttft_s": 0.6218794889991841, + "end_to_end_latency_s": 7.470134309999594, + "request_output_throughput_token_per_s": 20.079960249069224, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04918028050616034, + "ttft_s": 1.066638203000366, + "end_to_end_latency_s": 7.967410883000412, + "request_output_throughput_token_per_s": 18.826693163276673, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05074344332705601, + "ttft_s": 0.5877711650000492, + "end_to_end_latency_s": 8.068435580000369, + 
"request_output_throughput_token_per_s": 18.59096456961402, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049251225249962574, + "ttft_s": 0.5891758429997935, + "end_to_end_latency_s": 8.077383652999742, + "request_output_throughput_token_per_s": 18.570369620154626, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051112036352506975, + "ttft_s": 1.0733765590002804, + "end_to_end_latency_s": 7.9736898629998905, + "request_output_throughput_token_per_s": 18.81186785255358, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05039961563749671, + "ttft_s": 0.5774544440000682, + "end_to_end_latency_s": 8.06421221599976, + "request_output_throughput_token_per_s": 18.600700971434414, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04979161761876867, + "ttft_s": 1.0655111980004222, + "end_to_end_latency_s": 7.966836086000512, + "request_output_throughput_token_per_s": 18.828051485028425, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04922377370370174, + "ttft_s": 1.0744494529999429, + "end_to_end_latency_s": 7.974472523000259, + "request_output_throughput_token_per_s": 18.810021549057275, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05055920462496601, + "ttft_s": 0.6033077629999752, + "end_to_end_latency_s": 8.089649169999575, + 
"request_output_throughput_token_per_s": 18.54221324656133, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050812382239000614, + "ttft_s": 0.5900457240004471, + "end_to_end_latency_s": 8.079343129000335, + "request_output_throughput_token_per_s": 18.565865764703528, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05144488668388916, + "ttft_s": 1.0712837729997773, + "end_to_end_latency_s": 7.974182809999547, + "request_output_throughput_token_per_s": 18.810704942944305, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049044696404906656, + "ttft_s": 1.0921328780004842, + "end_to_end_latency_s": 7.994547549000345, + "request_output_throughput_token_per_s": 18.762787897703646, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050903325987483816, + "ttft_s": 0.5944493399993007, + "end_to_end_latency_s": 8.093859722999696, + "request_output_throughput_token_per_s": 18.5325672958918, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04953705360118311, + "ttft_s": 0.5863045070000226, + "end_to_end_latency_s": 8.07483035199948, + "request_output_throughput_token_per_s": 18.576241662198782, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051090837987174335, + "ttft_s": 1.064476635999199, + "end_to_end_latency_s": 7.970378610999433, + 
"request_output_throughput_token_per_s": 18.819683144411, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.053481589315807515, + "ttft_s": 0.6461335479998525, + "end_to_end_latency_s": 8.129380836000564, + "request_output_throughput_token_per_s": 18.451589736789344, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0508401699050663, + "ttft_s": 1.1327484949997597, + "end_to_end_latency_s": 8.032932277000327, + "request_output_throughput_token_per_s": 18.673131408001026, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05099765172781766, + "ttft_s": 0.5812853030001861, + "end_to_end_latency_s": 8.057813894999526, + "request_output_throughput_token_per_s": 18.615470890074313, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04944246485713774, + "ttft_s": 1.066166327999781, + "end_to_end_latency_s": 7.960459054999774, + "request_output_throughput_token_per_s": 18.843134417705294, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05201144196739049, + "ttft_s": 1.0587955840001086, + "end_to_end_latency_s": 7.957941593000214, + "request_output_throughput_token_per_s": 18.8490953655578, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04947354727612338, + "ttft_s": 0.5872320280004715, + "end_to_end_latency_s": 8.064467449000404, + 
"request_output_throughput_token_per_s": 18.600112276303204, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04994965775923997, + "ttft_s": 0.5850886009993701, + "end_to_end_latency_s": 8.092037709999204, + "request_output_throughput_token_per_s": 18.53674011116475, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05057729384801445, + "ttft_s": 1.0895204440003, + "end_to_end_latency_s": 7.991436029000397, + "request_output_throughput_token_per_s": 18.77009331685317, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05013664610558645, + "ttft_s": 0.5845507740004905, + "end_to_end_latency_s": 8.072217237000586, + "request_output_throughput_token_per_s": 18.582255109840908, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05177651779871218, + "ttft_s": 1.0680761919993529, + "end_to_end_latency_s": 7.973822353999822, + "request_output_throughput_token_per_s": 18.811555279351957, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0514000173630952, + "ttft_s": 0.5858421709999675, + "end_to_end_latency_s": 8.069979398999749, + "request_output_throughput_token_per_s": 18.58740804451026, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05013761830816647, + "ttft_s": 1.0688794530005907, + "end_to_end_latency_s": 7.972143824000341, + 
"request_output_throughput_token_per_s": 18.8155160407946, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05044190204377515, + "ttft_s": 0.5923850330000278, + "end_to_end_latency_s": 8.070947871000499, + "request_output_throughput_token_per_s": 18.585177651680898, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04801279106626439, + "ttft_s": 1.067939588999252, + "end_to_end_latency_s": 7.970306329999403, + "request_output_throughput_token_per_s": 18.819853816084287, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05045848412501073, + "ttft_s": 0.5808509270000286, + "end_to_end_latency_s": 8.073576405000495, + "request_output_throughput_token_per_s": 18.579126829975273, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04951695624223242, + "ttft_s": 1.0669780469997932, + "end_to_end_latency_s": 7.972417455999675, + "request_output_throughput_token_per_s": 18.81487024831055, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049819996256263724, + "ttft_s": 1.066967337999813, + "end_to_end_latency_s": 7.971375604000059, + "request_output_throughput_token_per_s": 18.81732933582123, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04982328740124227, + "ttft_s": 0.5870723160005582, + "end_to_end_latency_s": 8.071563190000234, + 
"request_output_throughput_token_per_s": 18.58376084893113, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050157322285717475, + "ttft_s": 0.5869589949998044, + "end_to_end_latency_s": 8.075518210000155, + "request_output_throughput_token_per_s": 18.57465937161166, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050163815371116416, + "ttft_s": 1.069707493000351, + "end_to_end_latency_s": 7.976221945999896, + "request_output_throughput_token_per_s": 18.80589595118094, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04915142428660324, + "ttft_s": 0.5852561740002784, + "end_to_end_latency_s": 8.061093108999557, + "request_output_throughput_token_per_s": 18.607898205832303, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050660181401303385, + "ttft_s": 1.0518375649999143, + "end_to_end_latency_s": 7.953870462999475, + "request_output_throughput_token_per_s": 18.858743136160363, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04886700248479235, + "ttft_s": 0.5798778950002088, + "end_to_end_latency_s": 8.06323407399941, + "request_output_throughput_token_per_s": 18.602957401880204, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051357290225848465, + "ttft_s": 1.058461239999815, + "end_to_end_latency_s": 7.96056354600023, + 
"request_output_throughput_token_per_s": 18.842887081200075, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05103733441133722, + "ttft_s": 0.5919053580000764, + "end_to_end_latency_s": 7.243615946000318, + "request_output_throughput_token_per_s": 18.22294293126984, + "number_total_tokens": 691, + "number_output_tokens": 141, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04965110029999664, + "ttft_s": 1.0734240000001591, + "end_to_end_latency_s": 7.9443851100004395, + "request_output_throughput_token_per_s": 18.881259899042295, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05130709628169633, + "ttft_s": 0.9854813290003221, + "end_to_end_latency_s": 7.3330094140001165, + "request_output_throughput_token_per_s": 18.81901306938617, + "number_total_tokens": 692, + "number_output_tokens": 142, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04935692456440686, + "ttft_s": 0.5888771999998426, + "end_to_end_latency_s": 8.045386106999104, + "request_output_throughput_token_per_s": 18.644226393250054, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051762941922080044, + "ttft_s": 1.066813605999414, + "end_to_end_latency_s": 7.971733948999827, + "request_output_throughput_token_per_s": 18.816483460141033, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05111208825316876, + "ttft_s": 0.5910310959998242, + "end_to_end_latency_s": 8.075929124999675, + 
"request_output_throughput_token_per_s": 18.573714265973827, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05085192740884192, + "ttft_s": 0.5909645259998797, + "end_to_end_latency_s": 8.085667079999439, + "request_output_throughput_token_per_s": 18.55134505488574, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05053617502527013, + "ttft_s": 1.0796205080005166, + "end_to_end_latency_s": 7.984950926000238, + "request_output_throughput_token_per_s": 18.785337742224158, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050128071291855016, + "ttft_s": 0.5824473880002188, + "end_to_end_latency_s": 8.070852159999959, + "request_output_throughput_token_per_s": 18.585398050458252, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04880031942591387, + "ttft_s": 0.9993425459997525, + "end_to_end_latency_s": 7.905859908999446, + "request_output_throughput_token_per_s": 18.973268148762806, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05016618897512129, + "ttft_s": 0.5909987570003068, + "end_to_end_latency_s": 8.076947278000262, + "request_output_throughput_token_per_s": 18.571372925581098, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05279708939071003, + "ttft_s": 1.0710942300001989, + "end_to_end_latency_s": 7.9725673379998625, + 
"request_output_throughput_token_per_s": 18.814516534096985, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.09177299680955218, + "ttft_s": 1.0467343899999833, + "end_to_end_latency_s": 1.9737729560001753, + "request_output_throughput_token_per_s": 10.639521600577718, + "number_total_tokens": 571, + "number_output_tokens": 21, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04851404981372585, + "ttft_s": 0.5894849670003168, + "end_to_end_latency_s": 7.810947461000069, + "request_output_throughput_token_per_s": 19.203816278236093, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05012332259747548, + "ttft_s": 1.060296433000076, + "end_to_end_latency_s": 7.96983645399996, + "request_output_throughput_token_per_s": 18.82096337431327, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04903327636364503, + "ttft_s": 0.6023655409999265, + "end_to_end_latency_s": 8.090723914000591, + "request_output_throughput_token_per_s": 18.53975016258218, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04929361799997176, + "ttft_s": 1.0783556629994564, + "end_to_end_latency_s": 7.985841857999731, + "request_output_throughput_token_per_s": 18.783241975890007, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0522128852516201, + "ttft_s": 0.6033626240005106, + "end_to_end_latency_s": 8.093226296000466, + 
"request_output_throughput_token_per_s": 18.534017771642866, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05004682583642894, + "ttft_s": 1.049850728999445, + "end_to_end_latency_s": 7.957639608999671, + "request_output_throughput_token_per_s": 18.849810668776446, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05046407815002567, + "ttft_s": 0.58644780800023, + "end_to_end_latency_s": 8.074427096000363, + "request_output_throughput_token_per_s": 18.577169403672233, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05012386384276042, + "ttft_s": 1.062184655999772, + "end_to_end_latency_s": 7.96992774499995, + "request_output_throughput_token_per_s": 18.820747790857286, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05049242537499481, + "ttft_s": 0.5854291449995799, + "end_to_end_latency_s": 8.079023893999874, + "request_output_throughput_token_per_s": 18.566599377358187, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050903258383638655, + "ttft_s": 0.5867626629997176, + "end_to_end_latency_s": 8.09383080399948, + "request_output_throughput_token_per_s": 18.53263351216572, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05187937729876124, + "ttft_s": 1.0830969930002539, + "end_to_end_latency_s": 7.989606725000158, + 
"request_output_throughput_token_per_s": 18.77439092598103, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05017458839621162, + "ttft_s": 1.0713288730003114, + "end_to_end_latency_s": 7.977946967000207, + "request_output_throughput_token_per_s": 18.80182967127464, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05117472574679999, + "ttft_s": 0.5903769879996617, + "end_to_end_latency_s": 8.08578598099939, + "request_output_throughput_token_per_s": 18.551072258464632, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05120383054432589, + "ttft_s": 0.6045365079999101, + "end_to_end_latency_s": 8.090396018999854, + "request_output_throughput_token_per_s": 18.540501558605186, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05054195447468074, + "ttft_s": 1.0804940899997746, + "end_to_end_latency_s": 7.985812507999981, + "request_output_throughput_token_per_s": 18.783311009334852, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05048190072326099, + "ttft_s": 1.1214782910001304, + "end_to_end_latency_s": 8.026847439999983, + "request_output_throughput_token_per_s": 18.687286773697583, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05144137294934198, + "ttft_s": 0.6425457229997846, + "end_to_end_latency_s": 8.127919337999629, + 
"request_output_throughput_token_per_s": 18.454907555334657, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05044218536875746, + "ttft_s": 0.5842432200006442, + "end_to_end_latency_s": 8.070943472000181, + "request_output_throughput_token_per_s": 18.585187781376725, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052760972271466956, + "ttft_s": 1.0617138500001602, + "end_to_end_latency_s": 7.967096399000184, + "request_output_throughput_token_per_s": 18.827436306509856, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050112560503110894, + "ttft_s": 1.0638843689994246, + "end_to_end_latency_s": 7.968074081999475, + "request_output_throughput_token_per_s": 18.825126179343908, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050135641391274736, + "ttft_s": 0.5880056579999291, + "end_to_end_latency_s": 8.072045815999445, + "request_output_throughput_token_per_s": 18.58264972959989, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.048850537687108855, + "ttft_s": 1.0605340259999139, + "end_to_end_latency_s": 7.963482574000409, + "request_output_throughput_token_per_s": 18.835980189085586, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05238820092205509, + "ttft_s": 0.5863257069995598, + "end_to_end_latency_s": 8.06796559400027, + 
"request_output_throughput_token_per_s": 18.592047555525927, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05139524795538436, + "ttft_s": 0.5847738470001786, + "end_to_end_latency_s": 8.069336862000455, + "request_output_throughput_token_per_s": 18.58888810385018, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04974943267493472, + "ttft_s": 1.0584497990003001, + "end_to_end_latency_s": 7.960126831000707, + "request_output_throughput_token_per_s": 18.843920855108127, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05079805173882398, + "ttft_s": 1.0711990620002325, + "end_to_end_latency_s": 7.975496686999577, + "request_output_throughput_token_per_s": 18.807606082327993, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0508129972201728, + "ttft_s": 0.5853060039999036, + "end_to_end_latency_s": 8.079445040999417, + "request_output_throughput_token_per_s": 18.565631579745876, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..1de6a45 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + 
"model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 2, + "results_inter_token_latency_s_quantiles_p25": 0.04981290159688996, + "results_inter_token_latency_s_quantiles_p50": 0.05047298943664333, + "results_inter_token_latency_s_quantiles_p75": 0.051184840881309285, + "results_inter_token_latency_s_quantiles_p90": 0.05189258376562417, + "results_inter_token_latency_s_quantiles_p95": 0.05276172237063807, + "results_inter_token_latency_s_quantiles_p99": 0.05386450339074516, + "results_inter_token_latency_s_mean": 0.0508974474171928, + "results_inter_token_latency_s_min": 0.04757931528023225, + "results_inter_token_latency_s_max": 0.09177299680955218, + "results_inter_token_latency_s_stddev": 0.004271111418964108, + "results_ttft_s_quantiles_p25": 0.5879470347499591, + "results_ttft_s_quantiles_p50": 0.6443396354998185, + "results_ttft_s_quantiles_p75": 1.0679737397492772, + "results_ttft_s_quantiles_p90": 1.0796937620004428, + "results_ttft_s_quantiles_p95": 1.0896510657003091, + "results_ttft_s_quantiles_p99": 1.1327916422497584, + "results_ttft_s_mean": 0.8269834441700096, + "results_ttft_s_min": 0.5774544440000682, + "results_ttft_s_max": 1.137063219999618, + "results_ttft_s_stddev": 0.24078355563908607, + "results_end_to_end_latency_s_quantiles_p25": 7.970360540749425, + "results_end_to_end_latency_s_quantiles_p50": 8.026975896500062, + "results_end_to_end_latency_s_quantiles_p75": 8.074850951999451, + "results_end_to_end_latency_s_quantiles_p90": 8.089723854899603, + "results_end_to_end_latency_s_quantiles_p95": 8.093256521400416, + "results_end_to_end_latency_s_quantiles_p99": 8.129380900400555, + "results_end_to_end_latency_s_mean": 7.944187725759966, + "results_end_to_end_latency_s_min": 1.9737729560001753, + "results_end_to_end_latency_s_max": 8.129387275999761, + "results_end_to_end_latency_s_stddev": 0.6170747384805549, + 
"results_request_output_throughput_token_per_s_quantiles_p25": 18.57505281361564, + "results_request_output_throughput_token_per_s_quantiles_p50": 18.6632313122328, + "results_request_output_throughput_token_per_s_quantiles_p75": 18.816694929061082, + "results_request_output_throughput_token_per_s_quantiles_p90": 18.8429118148506, + "results_request_output_throughput_token_per_s_quantiles_p95": 18.850257292145642, + "results_request_output_throughput_token_per_s_quantiles_p99": 19.21257771794443, + "results_request_output_throughput_token_per_s_mean": 18.622558438760585, + "results_request_output_throughput_token_per_s_min": 10.639521600577718, + "results_request_output_throughput_token_per_s_max": 20.079960249069224, + "results_request_output_throughput_token_per_s_stddev": 0.8309475169291582, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 157.0, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 161.0, + "results_number_output_tokens_quantiles_p90": 163.0, + "results_number_output_tokens_quantiles_p95": 164.0, + "results_number_output_tokens_quantiles_p99": 165.01, + "results_number_output_tokens_mean": 157.22, + "results_number_output_tokens_min": "21", + "results_number_output_tokens_max": "166", + "results_number_output_tokens_stddev": 14.33973077808223, + "results_num_requests_started": 100, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + 
"results_mean_output_throughput_token_per_s": 38.19783647998263, + "results_num_completed_requests": 100, + "results_num_completed_requests_per_min": 14.57747226052002, + "timestamp": 1718206143 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..f5f1267 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1410 @@ +[ + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": 
-1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13449398951220468, + "ttft_s": 0.592265960000077, + "end_to_end_latency_s": 22.057289241000035, + "request_output_throughput_token_per_s": 6.800473002873824, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14363700933743395, + "ttft_s": 0.7249649149998731, + 
"end_to_end_latency_s": 23.413039587999265, + "request_output_throughput_token_per_s": 6.406686301290199, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14346761856876356, + "ttft_s": 1.4199243670000214, + "end_to_end_latency_s": 22.95508261499981, + "request_output_throughput_token_per_s": 6.5345005511757, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1439346276832038, + "ttft_s": 1.6386611829993853, + "end_to_end_latency_s": 23.17425191699931, + "request_output_throughput_token_per_s": 6.472700846492851, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14165031274045806, + "ttft_s": 9.906273276999855, + "end_to_end_latency_s": 22.380969818000267, + "request_output_throughput_token_per_s": 6.702122437936537, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13900603808590084, + "ttft_s": 10.180826024000453, + "end_to_end_latency_s": 22.65820412999983, + "request_output_throughput_token_per_s": 6.620118661628507, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13893333025314375, + "ttft_s": 9.476399121999748, + "end_to_end_latency_s": 21.95167783099987, + "request_output_throughput_token_per_s": 6.833190663365694, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14059000473747574, + "ttft_s": 10.016111841000566, + "end_to_end_latency_s": 
22.49461250100012, + "request_output_throughput_token_per_s": 6.6682633449823125, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13913722680628665, + "ttft_s": 9.783798180999838, + "end_to_end_latency_s": 22.262219909999658, + "request_output_throughput_token_per_s": 6.737872530520803, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14605058891559908, + "ttft_s": 10.012374392999845, + "end_to_end_latency_s": 22.491998636999597, + "request_output_throughput_token_per_s": 6.6690382842744915, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1424223173835973, + "ttft_s": 10.164687428999969, + "end_to_end_latency_s": 22.645386746999975, + "request_output_throughput_token_per_s": 6.623865676300351, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14143516940643636, + "ttft_s": 9.440571584999816, + "end_to_end_latency_s": 21.92266682099944, + "request_output_throughput_token_per_s": 6.842233256782288, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14113540503162939, + "ttft_s": 9.82330627600004, + "end_to_end_latency_s": 22.299685918999785, + "request_output_throughput_token_per_s": 6.726552138216303, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14065071539749535, + "ttft_s": 10.168506046999937, + "end_to_end_latency_s": 22.64501307199953, + 
"request_output_throughput_token_per_s": 6.623974979527587, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14064462578748477, + "ttft_s": 10.026288000000022, + "end_to_end_latency_s": 22.503349411000272, + "request_output_throughput_token_per_s": 6.665674396304568, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14321998529376287, + "ttft_s": 10.438090112999816, + "end_to_end_latency_s": 22.915447878999657, + "request_output_throughput_token_per_s": 6.54580267390122, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13697337424844205, + "ttft_s": 9.575216634999379, + "end_to_end_latency_s": 22.052912064999873, + "request_output_throughput_token_per_s": 6.801822795913863, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13631226975771174, + "ttft_s": 9.467810152000311, + "end_to_end_latency_s": 21.946514084999762, + "request_output_throughput_token_per_s": 6.834798429447326, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14707508823183113, + "ttft_s": 9.729630188000556, + "end_to_end_latency_s": 22.208559004000563, + "request_output_throughput_token_per_s": 6.754152755835243, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14417212396054574, + "ttft_s": 9.434522006999941, + "end_to_end_latency_s": 21.914371525000206, + 
"request_output_throughput_token_per_s": 6.844823262619145, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1421753459687352, + "ttft_s": 11.524508866999895, + "end_to_end_latency_s": 22.74830509200001, + "request_output_throughput_token_per_s": 6.593897848361069, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + 
"number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18785608138146065, + "ttft_s": 2.8737908370003424, + "end_to_end_latency_s": 18.30211873799999, + "request_output_throughput_token_per_s": 4.80818648702617, + "number_total_tokens": 647, + "number_output_tokens": 97, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.15191553590574441, + "ttft_s": 10.597980567000377, + "end_to_end_latency_s": 21.045450850000634, + "request_output_throughput_token_per_s": 6.272139330291231, + "number_total_tokens": 688, + "number_output_tokens": 138, + "number_input_tokens": 550 + }, + { + "error_code": 
null, + "error_msg": "", + "inter_token_latency_s": 0.14718691961490935, + "ttft_s": 3.325604432000546, + "end_to_end_latency_s": 23.697294291000617, + "request_output_throughput_token_per_s": 6.329836569441796, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.15209359814190623, + "ttft_s": 3.2027417219997005, + "end_to_end_latency_s": 23.57468804400014, + "request_output_throughput_token_per_s": 6.362756517500372, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14618005988958893, + "ttft_s": 0.5897073059995819, + "end_to_end_latency_s": 23.8275938959996, + "request_output_throughput_token_per_s": 6.2952222811378125, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14508388950929182, + "ttft_s": 2.9838426050000635, + "end_to_end_latency_s": 23.358696323000004, + "request_output_throughput_token_per_s": 6.421591253459782, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13720943390051732, + "ttft_s": 3.0890353989998403, + "end_to_end_latency_s": 23.464612087000205, + "request_output_throughput_token_per_s": 6.392605147012107, + "number_total_tokens": 721, + "number_output_tokens": 171, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13888634732298918, + "ttft_s": 10.489107135000268, + "end_to_end_latency_s": 22.360937052000736, + "request_output_throughput_token_per_s": 6.708126750286559, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.1339855799876575, + "ttft_s": 9.834021583999856, + "end_to_end_latency_s": 21.706350527000723, + "request_output_throughput_token_per_s": 6.910420054878118, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14052124041666922, + "ttft_s": 10.04976392000026, + "end_to_end_latency_s": 21.921500125999955, + "request_output_throughput_token_per_s": 6.842597410662274, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14072721480643527, + "ttft_s": 9.940337471999555, + "end_to_end_latency_s": 21.812934247999692, + "request_output_throughput_token_per_s": 6.876653929021742, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14924144552635185, + "ttft_s": 10.81133020299967, + "end_to_end_latency_s": 22.684896151999965, + "request_output_throughput_token_per_s": 6.6123291460064975, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1418526084423425, + "ttft_s": 10.25633623000067, + "end_to_end_latency_s": 22.129211360000227, + "request_output_throughput_token_per_s": 6.778370795044838, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14290133214557546, + "ttft_s": 10.704974113999924, + "end_to_end_latency_s": 22.578943757999696, + "request_output_throughput_token_per_s": 6.6433577056435675, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.1376694969124969, + "ttft_s": 10.153834949999691, + "end_to_end_latency_s": 22.027308458000334, + "request_output_throughput_token_per_s": 6.8097289455952525, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14066192198765468, + "ttft_s": 10.912360244999945, + "end_to_end_latency_s": 22.78752065399931, + "request_output_throughput_token_per_s": 6.582550259748173, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14259599993588615, + "ttft_s": 10.368997890000173, + "end_to_end_latency_s": 22.245187292000082, + "request_output_throughput_token_per_s": 6.74303156143548, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14844137083229372, + "ttft_s": 11.134090197999285, + "end_to_end_latency_s": 23.008633819999886, + "request_output_throughput_token_per_s": 6.5192918959671085, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1478857151401177, + "ttft_s": 11.346751686000061, + "end_to_end_latency_s": 23.218266799000048, + "request_output_throughput_token_per_s": 6.460430543698469, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1417661003558345, + "ttft_s": 11.236852580999766, + "end_to_end_latency_s": 23.108427465000204, + "request_output_throughput_token_per_s": 6.491138361846063, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.139560406634157, + "ttft_s": 
11.016556896000111, + "end_to_end_latency_s": 22.89478848500039, + "request_output_throughput_token_per_s": 6.551709359458511, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + 
"number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14758156297511113, + "ttft_s": 2.775511670999549, + "end_to_end_latency_s": 23.760865252999793, + "request_output_throughput_token_per_s": 6.312901420164512, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.15109367944305746, + "ttft_s": 0.587464098000055, + "end_to_end_latency_s": 23.873003716999847, + "request_output_throughput_token_per_s": 6.2832478802483385, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14641547455629506, + "ttft_s": 2.4412803499999427, + "end_to_end_latency_s": 23.426719822999985, + "request_output_throughput_token_per_s": 6.402945061592975, + "number_total_tokens": 710, + 
"number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14871189304405744, + "ttft_s": 2.659125273999962, + "end_to_end_latency_s": 23.645558549999805, + "request_output_throughput_token_per_s": 6.343686053463991, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.15589180415237622, + "ttft_s": 2.553174420000687, + "end_to_end_latency_s": 23.53988796900012, + "request_output_throughput_token_per_s": 6.372162866600567, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14161082627041588, + "ttft_s": 10.631010797999807, + "end_to_end_latency_s": 22.51637018800011, + "request_output_throughput_token_per_s": 6.661819767021822, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14713618091663286, + "ttft_s": 11.067413106000458, + "end_to_end_latency_s": 22.95348097400074, + "request_output_throughput_token_per_s": 6.5349565135634125, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14418194838276296, + "ttft_s": 11.471664513000178, + "end_to_end_latency_s": 23.35766349000005, + "request_output_throughput_token_per_s": 6.42187520443637, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1464429267307948, + "ttft_s": 10.95777634499973, + "end_to_end_latency_s": 22.845270582000012, + "request_output_throughput_token_per_s": 6.565910412905606, + "number_total_tokens": 706, + "number_output_tokens": 156, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14232687116129397, + "ttft_s": 10.174616995999713, + "end_to_end_latency_s": 22.060848436000015, + "request_output_throughput_token_per_s": 6.799375846090415, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13918164555900533, + "ttft_s": 10.521218425999905, + "end_to_end_latency_s": 22.408517240999572, + "request_output_throughput_token_per_s": 6.693883329574063, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1358746781069494, + "ttft_s": 9.716345789000115, + "end_to_end_latency_s": 21.604269581000153, + "request_output_throughput_token_per_s": 6.943072036645817, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14424924126090993, + "ttft_s": 11.336578116000055, + "end_to_end_latency_s": 23.224328556999353, + "request_output_throughput_token_per_s": 6.458744313397727, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13628753903614493, + "ttft_s": 10.735140690000662, + "end_to_end_latency_s": 22.623927264000486, + "request_output_throughput_token_per_s": 6.630148614324893, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13802535244021155, + "ttft_s": 10.057571919999646, + "end_to_end_latency_s": 21.946291622999524, + "request_output_throughput_token_per_s": 6.834867711445213, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + 
{ + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14193446672549065, + "ttft_s": 9.825583425000332, + "end_to_end_latency_s": 21.716206272000818, + "request_output_throughput_token_per_s": 6.907283810128397, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14174649307789594, + "ttft_s": 9.938760851000552, + "end_to_end_latency_s": 21.829191675999937, + "request_output_throughput_token_per_s": 6.871532497692859, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14100874158227467, + "ttft_s": 10.391082541999822, + "end_to_end_latency_s": 22.279569001999334, + "request_output_throughput_token_per_s": 6.732625751716258, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14301098472325244, + "ttft_s": 10.849398050000673, + "end_to_end_latency_s": 22.738970063000124, + "request_output_throughput_token_per_s": 6.59660484113454, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.136042698509233, + "ttft_s": 10.284083374999682, + "end_to_end_latency_s": 22.175151968000137, + "request_output_throughput_token_per_s": 6.764327938607031, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14596707830376637, + "ttft_s": 11.173888777000684, + "end_to_end_latency_s": 23.0629972050001, + "request_output_throughput_token_per_s": 6.5039248223765, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", 
+ "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + 
"number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14406306493254897, + "ttft_s": 2.533197615000063, + "end_to_end_latency_s": 23.482539015999464, + "request_output_throughput_token_per_s": 6.387724934590754, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.15078021587092558, + "ttft_s": 2.4204344539994054, + "end_to_end_latency_s": 23.371135841999603, + "request_output_throughput_token_per_s": 6.41817329778381, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1512517984712909, + "ttft_s": 2.7969399559997328, + "end_to_end_latency_s": 23.74671969299925, + "request_output_throughput_token_per_s": 6.316661919592262, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14681872981368288, + "ttft_s": 2.6887708340000245, + "end_to_end_latency_s": 23.638917805000347, + "request_output_throughput_token_per_s": 
6.345468148642171, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.15321923037818008, + "ttft_s": 0.6444090060003873, + "end_to_end_latency_s": 23.902429992999714, + "request_output_throughput_token_per_s": 6.275512575245712, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13956798570969464, + "ttft_s": 9.745372129000316, + "end_to_end_latency_s": 21.63327830200069, + "request_output_throughput_token_per_s": 6.93376186013045, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14400128396914905, + "ttft_s": 11.440354753000065, + "end_to_end_latency_s": 23.328401527000096, + "request_output_throughput_token_per_s": 6.429930478793897, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14135308727160054, + "ttft_s": 11.00969681800052, + "end_to_end_latency_s": 22.89938232300028, + "request_output_throughput_token_per_s": 6.550395023071826, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14606846937735923, + "ttft_s": 11.335751086000528, + "end_to_end_latency_s": 23.225086966000163, + "request_output_throughput_token_per_s": 6.458533404830264, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13713112607451905, + "ttft_s": 10.188033217999873, + "end_to_end_latency_s": 22.07833994000066, + "request_output_throughput_token_per_s": 6.793989059305856, + 
"number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13909472857593053, + "ttft_s": 10.08564246900005, + "end_to_end_latency_s": 21.97718618700037, + "request_output_throughput_token_per_s": 6.825259554324832, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13911613176688478, + "ttft_s": 10.783800342000177, + "end_to_end_latency_s": 22.6761439109996, + "request_output_throughput_token_per_s": 6.614881286197825, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14201668644447613, + "ttft_s": 11.118702501999906, + "end_to_end_latency_s": 23.00692097800038, + "request_output_throughput_token_per_s": 6.519777250655689, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13579795495785352, + "ttft_s": 10.654372587999205, + "end_to_end_latency_s": 22.542633964999368, + "request_output_throughput_token_per_s": 6.65405827166853, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13591047110907256, + "ttft_s": 10.535445627000627, + "end_to_end_latency_s": 22.42542768600015, + "request_output_throughput_token_per_s": 6.6888356423027195, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13980651530672636, + "ttft_s": 10.898102072000256, + "end_to_end_latency_s": 22.7886618980001, + "request_output_throughput_token_per_s": 6.582220609151421, + "number_total_tokens": 713, + 
"number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1454094279496582, + "ttft_s": 11.22993751300055, + "end_to_end_latency_s": 23.120313637000436, + "request_output_throughput_token_per_s": 6.487801262347433, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13493896433333752, + "ttft_s": 9.968974428000365, + "end_to_end_latency_s": 21.860355464999884, + "request_output_throughput_token_per_s": 6.861736545874635, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13945456484373153, + "ttft_s": 10.420786431999659, + "end_to_end_latency_s": 22.312965198999336, + "request_output_throughput_token_per_s": 6.722548915494522, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13426620696911953, + "ttft_s": 9.858345474000089, + "end_to_end_latency_s": 21.751322781000454, + "request_output_throughput_token_per_s": 6.896132318491609, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13867979620624168, + "ttft_s": 10.295704784000009, + "end_to_end_latency_s": 22.189022595000097, + "request_output_throughput_token_per_s": 6.760099475215274, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 
0000000..3f7645d --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 32, + "results_inter_token_latency_s_quantiles_p25": 0.1391319530464362, + "results_inter_token_latency_s_quantiles_p50": 0.14189353758391657, + "results_inter_token_latency_s_quantiles_p75": 0.1460550590310391, + "results_inter_token_latency_s_quantiles_p90": 0.14908257978166353, + "results_inter_token_latency_s_quantiles_p95": 0.1518159752905764, + "results_inter_token_latency_s_quantiles_p99": 0.16132573128132063, + "results_inter_token_latency_s_mean": 0.1430334284903326, + "results_inter_token_latency_s_min": 0.1339855799876575, + "results_inter_token_latency_s_max": 0.18785608138146065, + "results_inter_token_latency_s_stddev": 0.006860024683307713, + "results_ttft_s_quantiles_p25": 9.439059190499847, + "results_ttft_s_quantiles_p50": 10.071607194499848, + "results_ttft_s_quantiles_p75": 10.712515758000109, + "results_ttft_s_quantiles_p90": 11.161949203300264, + "results_ttft_s_quantiles_p95": 11.336454061500126, + "results_ttft_s_quantiles_p99": 11.48064805318013, + "results_ttft_s_mean": 8.44362164483337, + "results_ttft_s_min": 0.587464098000055, + "results_ttft_s_max": 11.524508866999895, + "results_ttft_s_stddev": 3.619744316177655, + "results_end_to_end_latency_s_quantiles_p25": 22.0739670640005, + "results_end_to_end_latency_s_quantiles_p50": 22.634470168000007, + "results_end_to_end_latency_s_quantiles_p75": 23.185255637499495, + "results_end_to_end_latency_s_quantiles_p90": 23.564248021500134, + "results_end_to_end_latency_s_quantiles_p95": 23.739305882699455, + "results_end_to_end_latency_s_quantiles_p99": 23.878006183919823, + 
"results_end_to_end_latency_s_mean": 22.60646658780952, + "results_end_to_end_latency_s_min": 18.30211873799999, + "results_end_to_end_latency_s_max": 23.902429992999714, + "results_end_to_end_latency_s_stddev": 0.8065539327033907, + "results_request_output_throughput_token_per_s_quantiles_p25": 6.458691586255861, + "results_request_output_throughput_token_per_s_quantiles_p50": 6.621992168964429, + "results_request_output_throughput_token_per_s_quantiles_p75": 6.767838652716483, + "results_request_output_throughput_token_per_s_quantiles_p90": 6.844155507032084, + "results_request_output_throughput_token_per_s_quantiles_p95": 6.893210560071129, + "results_request_output_throughput_token_per_s_quantiles_p99": 6.935344590138063, + "results_request_output_throughput_token_per_s_mean": 6.593738543687555, + "results_request_output_throughput_token_per_s_min": 4.80818648702617, + "results_request_output_throughput_token_per_s_max": 6.943072036645817, + "results_request_output_throughput_token_per_s_stddev": 0.2710909389728309, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 156.0, + "results_number_output_tokens_quantiles_p50": 160.0, + "results_number_output_tokens_quantiles_p75": 162.0, + "results_number_output_tokens_quantiles_p90": 163.0, + "results_number_output_tokens_quantiles_p95": 164.0, + "results_number_output_tokens_quantiles_p99": 166.85000000000002, + "results_number_output_tokens_mean": 158.3452380952381, + "results_number_output_tokens_min": "97", + 
"results_number_output_tokens_max": "171", + "results_number_output_tokens_stddev": 8.01278202912335, + "results_num_requests_started": 128, + "results_error_rate": 0.34375, + "results_number_errors": 44, + "results_error_code_frequency": "{-1.0: 44}", + "results_mean_output_throughput_token_per_s": 112.72457186619263, + "results_num_completed_requests": 84, + "results_num_completed_requests_per_min": 42.71346832611164, + "timestamp": 1718206892 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..e7a2694 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1102 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05947233936884118, + "ttft_s": 0.6045686090001254, + "end_to_end_latency_s": 9.515756101000079, + "request_output_throughput_token_per_s": 15.763329619622704, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0597044136265186, + "ttft_s": 2.2405072130004555, + "end_to_end_latency_s": 9.433660745999987, + "request_output_throughput_token_per_s": 15.900508194934002, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0583753180185284, + "ttft_s": 2.264089403000071, + "end_to_end_latency_s": 9.457004603000314, + "request_output_throughput_token_per_s": 15.861259066365607, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.05922494781133302, + "ttft_s": 2.225026224999965, + "end_to_end_latency_s": 9.417024002999824, + "request_output_throughput_token_per_s": 15.928599093749469, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060076102189549854, + "ttft_s": 1.9994920929993896, + "end_to_end_latency_s": 9.19182651599931, + "request_output_throughput_token_per_s": 16.318845850594517, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06138486309035568, + "ttft_s": 0.6024707719998332, + "end_to_end_latency_s": 9.51504123199993, + "request_output_throughput_token_per_s": 15.764513925124849, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05883797718987105, + "ttft_s": 2.1043968579997454, + "end_to_end_latency_s": 9.296668441000293, + "request_output_throughput_token_per_s": 16.13481226656078, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05802069338890053, + "ttft_s": 2.207688194000184, + "end_to_end_latency_s": 9.400671735999822, + "request_output_throughput_token_per_s": 15.956306550475091, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058459034161500514, + "ttft_s": 2.220032051999624, + "end_to_end_latency_s": 9.41209980199983, + "request_output_throughput_token_per_s": 15.936932582050272, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057818308726726976, + "ttft_s": 
2.1161314490000223, + "end_to_end_latency_s": 9.308965438000087, + "request_output_throughput_token_per_s": 16.113498433207806, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.056306798908021594, + "ttft_s": 1.98592391999955, + "end_to_end_latency_s": 9.178250543000104, + "request_output_throughput_token_per_s": 16.342983806908517, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05944650728749821, + "ttft_s": 0.5897580600003494, + "end_to_end_latency_s": 9.511631908000709, + "request_output_throughput_token_per_s": 15.770164515494708, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058886177611425033, + "ttft_s": 2.054056797999692, + "end_to_end_latency_s": 9.245297276999736, + "request_output_throughput_token_per_s": 16.22446477444992, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0602298141646062, + "ttft_s": 0.6041106330003458, + "end_to_end_latency_s": 9.516485990000547, + "request_output_throughput_token_per_s": 15.762120614438206, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0585883909312372, + "ttft_s": 2.182377802000701, + "end_to_end_latency_s": 9.374379140000201, + "request_output_throughput_token_per_s": 16.001059671243123, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0560745475276754, + "ttft_s": 1.9495313970001007, + 
"end_to_end_latency_s": 9.1403663709998, + "request_output_throughput_token_per_s": 16.41072074265143, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05672628010494952, + "ttft_s": 1.9994011409999075, + "end_to_end_latency_s": 9.189968092000527, + "request_output_throughput_token_per_s": 16.32214589847908, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057769031826096835, + "ttft_s": 2.1096127949995207, + "end_to_end_latency_s": 9.301010145999498, + "request_output_throughput_token_per_s": 16.127280547534635, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05902985067699412, + "ttft_s": 0.5819393110004967, + "end_to_end_latency_s": 9.505300688000716, + "request_output_throughput_token_per_s": 15.780668589406828, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06028084221153362, + "ttft_s": 2.213121613000112, + "end_to_end_latency_s": 9.40397929700066, + "request_output_throughput_token_per_s": 15.950694409529543, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060140210358953575, + "ttft_s": 2.1923538399996687, + "end_to_end_latency_s": 9.38204830900031, + "request_output_throughput_token_per_s": 15.98797992290268, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05748697024688526, + "ttft_s": 1.8925660009999774, + "end_to_end_latency_s": 
9.083124791000046, + "request_output_throughput_token_per_s": 16.514140612559515, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060506491980382646, + "ttft_s": 2.0677591030007534, + "end_to_end_latency_s": 9.257758786000522, + "request_output_throughput_token_per_s": 16.202625653503556, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060468581904451695, + "ttft_s": 0.586525370000345, + "end_to_end_latency_s": 9.493738811000185, + "request_output_throughput_token_per_s": 15.799886955621565, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05808209560624959, + "ttft_s": 2.1008610839999164, + "end_to_end_latency_s": 9.29331412800002, + "request_output_throughput_token_per_s": 16.140635938266833, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0585041398917441, + "ttft_s": 1.993290915000216, + "end_to_end_latency_s": 9.185340293999616, + "request_output_throughput_token_per_s": 16.330369392845302, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05918677812581982, + "ttft_s": 2.2176977819999593, + "end_to_end_latency_s": 9.410948275999544, + "request_output_throughput_token_per_s": 15.93888262913318, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05910019492544419, + "ttft_s": 0.593939784000213, + "end_to_end_latency_s": 9.515336516000389, + 
"request_output_throughput_token_per_s": 15.764024713973013, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057769143735806563, + "ttft_s": 1.994772113000181, + "end_to_end_latency_s": 9.185469764999652, + "request_output_throughput_token_per_s": 16.330139213082006, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06109229509210261, + "ttft_s": 2.0953747240000666, + "end_to_end_latency_s": 9.28620361799949, + "request_output_throughput_token_per_s": 16.15299493425433, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05872647750001542, + "ttft_s": 0.5847220959994957, + "end_to_end_latency_s": 9.514496445000077, + "request_output_throughput_token_per_s": 15.765416579542249, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0610185766104046, + "ttft_s": 2.205459126000278, + "end_to_end_latency_s": 9.397031669999706, + "request_output_throughput_token_per_s": 15.962487439398478, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0588108757062912, + "ttft_s": 2.218741764999322, + "end_to_end_latency_s": 9.409947524000017, + "request_output_throughput_token_per_s": 15.940577736212221, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05744344085798258, + "ttft_s": 2.1146748299997853, + "end_to_end_latency_s": 9.306007349000538, + 
"request_output_throughput_token_per_s": 16.118620410944544, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05940985857415663, + "ttft_s": 2.017675215000054, + "end_to_end_latency_s": 9.20871698100018, + "request_output_throughput_token_per_s": 16.288914113604147, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06175925779870659, + "ttft_s": 0.5924541650001629, + "end_to_end_latency_s": 9.5110937620002, + "request_output_throughput_token_per_s": 15.771056805190693, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05883708771874012, + "ttft_s": 2.219133000000511, + "end_to_end_latency_s": 9.414122827000028, + "request_output_throughput_token_per_s": 15.933507853731719, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05648691850304321, + "ttft_s": 2.0129191139994873, + "end_to_end_latency_s": 9.207543835999786, + "request_output_throughput_token_per_s": 16.290989505097752, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0583518633435802, + "ttft_s": 0.587740935000511, + "end_to_end_latency_s": 9.514603956000428, + "request_output_throughput_token_per_s": 15.76523843700313, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05858262754718339, + "ttft_s": 2.119169446000342, + "end_to_end_latency_s": 9.31509409499995, + "request_output_throughput_token_per_s": 
16.102896918724127, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05836434465193553, + "ttft_s": 2.0338252009996722, + "end_to_end_latency_s": 9.222088581999742, + "request_output_throughput_token_per_s": 16.26529594313153, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.053285515766108064, + "ttft_s": 1.9232527430003756, + "end_to_end_latency_s": 9.112386045000676, + "request_output_throughput_token_per_s": 16.461111201746597, + "number_total_tokens": 721, + "number_output_tokens": 171, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06088013817947495, + "ttft_s": 0.5886850970000523, + "end_to_end_latency_s": 9.497475909000059, + "request_output_throughput_token_per_s": 15.793669964232922, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06030128490321893, + "ttft_s": 2.157683188000192, + "end_to_end_latency_s": 9.346873000999949, + "request_output_throughput_token_per_s": 16.048147865489632, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06977087910234497, + "ttft_s": 1.9900051820004592, + "end_to_end_latency_s": 6.188806914000452, + "request_output_throughput_token_per_s": 14.219218861219359, + "number_total_tokens": 638, + "number_output_tokens": 88, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05866429453122919, + "ttft_s": 2.2020851830002357, + "end_to_end_latency_s": 9.386471306000203, + "request_output_throughput_token_per_s": 15.980446230535438, + 
"number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0632883288333566, + "ttft_s": 0.5860858930000177, + "end_to_end_latency_s": 9.493450848000066, + "request_output_throughput_token_per_s": 15.800366210522878, + "number_total_tokens": 700, + "number_output_tokens": 150, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05913303540764437, + "ttft_s": 2.0980362370000876, + "end_to_end_latency_s": 9.284234703000038, + "request_output_throughput_token_per_s": 16.15642051267081, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05616589194673019, + "ttft_s": 0.5834438490001048, + "end_to_end_latency_s": 9.492207440999664, + "request_output_throughput_token_per_s": 15.802435938357757, + "number_total_tokens": 719, + "number_output_tokens": 169, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0575819732944378, + "ttft_s": 2.1945462070007125, + "end_to_end_latency_s": 9.386530095000126, + "request_output_throughput_token_per_s": 15.980346143022514, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05989056585807585, + "ttft_s": 2.091516004999903, + "end_to_end_latency_s": 9.283216150000044, + "request_output_throughput_token_per_s": 16.158193192560674, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05669574595681353, + "ttft_s": 1.994036404000326, + "end_to_end_latency_s": 9.184901418000663, + "request_output_throughput_token_per_s": 16.331149695959553, + "number_total_tokens": 712, + 
"number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05762616329556975, + "ttft_s": 1.9739373669999623, + "end_to_end_latency_s": 9.162736455999948, + "request_output_throughput_token_per_s": 16.370655286257517, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05970281830187165, + "ttft_s": 0.5824904880000759, + "end_to_end_latency_s": 9.492923060999601, + "request_output_throughput_token_per_s": 15.80124467839151, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05895938000633366, + "ttft_s": 2.1854788410000765, + "end_to_end_latency_s": 9.374754524999844, + "request_output_throughput_token_per_s": 16.000418954970183, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05906674858601221, + "ttft_s": 2.0842856130002474, + "end_to_end_latency_s": 9.273828420999962, + "request_output_throughput_token_per_s": 16.174549839668703, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05748439068715116, + "ttft_s": 2.181685283000661, + "end_to_end_latency_s": 9.37013279600069, + "request_output_throughput_token_per_s": 16.00831100963929, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06008560782908107, + "ttft_s": 0.5840566879996913, + "end_to_end_latency_s": 9.493694799999503, + "request_output_throughput_token_per_s": 15.799960200954413, + "number_total_tokens": 708, + "number_output_tokens": 158, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05753591679251784, + "ttft_s": 1.9589564269999755, + "end_to_end_latency_s": 9.148374873000648, + "request_output_throughput_token_per_s": 16.396354771456835, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0592754441666691, + "ttft_s": 2.058064918999662, + "end_to_end_latency_s": 9.247883989999536, + "request_output_throughput_token_per_s": 16.21992665156773, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05764836444518551, + "ttft_s": 2.264332115999423, + "end_to_end_latency_s": 9.454505920999509, + "request_output_throughput_token_per_s": 15.865450955700744, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05893728087265198, + "ttft_s": 2.0632971960003488, + "end_to_end_latency_s": 9.254009139000118, + "request_output_throughput_token_per_s": 16.20919082172068, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060102031559710334, + "ttft_s": 0.6441996329995163, + "end_to_end_latency_s": 9.556417689000227, + "request_output_throughput_token_per_s": 15.696258250898271, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05811475850928585, + "ttft_s": 2.165568458999587, + "end_to_end_latency_s": 9.3566613160001, + "request_output_throughput_token_per_s": 16.031359363568782, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + 
"error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06050924025476558, + "ttft_s": 0.5910257270006696, + "end_to_end_latency_s": 9.500278434000393, + "request_output_throughput_token_per_s": 15.789010926581629, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05817752514910991, + "ttft_s": 2.1739216839996516, + "end_to_end_latency_s": 9.366760922999674, + "request_output_throughput_token_per_s": 16.01407372656235, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05827752548427711, + "ttft_s": 2.0739175910002814, + "end_to_end_latency_s": 9.26631866400021, + "request_output_throughput_token_per_s": 16.187658274990294, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05837399421016827, + "ttft_s": 1.9726546309993864, + "end_to_end_latency_s": 9.165022303999649, + "request_output_throughput_token_per_s": 16.36657228150328, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06237646925177182, + "ttft_s": 0.5833956700007548, + "end_to_end_latency_s": 8.719589223000185, + "request_output_throughput_token_per_s": 15.367696410117594, + "number_total_tokens": 689, + "number_output_tokens": 139, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.054812249457816244, + "ttft_s": 1.909328964999986, + "end_to_end_latency_s": 9.099161895999714, + "request_output_throughput_token_per_s": 16.4850347443477, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": 
"", + "inter_token_latency_s": 0.05580741894542426, + "ttft_s": 2.018218321999484, + "end_to_end_latency_s": 9.20854898899961, + "request_output_throughput_token_per_s": 16.289211273045044, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05805355873549894, + "ttft_s": 1.8086632439999448, + "end_to_end_latency_s": 8.998487114999989, + "request_output_throughput_token_per_s": 16.669468776585582, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05810530594997658, + "ttft_s": 2.10432905900052, + "end_to_end_latency_s": 9.297017416000017, + "request_output_throughput_token_per_s": 16.134206626509318, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05987505543307171, + "ttft_s": 2.208084178999343, + "end_to_end_latency_s": 9.401612186999955, + "request_output_throughput_token_per_s": 15.954710428006374, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058662361179004724, + "ttft_s": 0.591795376999471, + "end_to_end_latency_s": 9.503483924999273, + "request_output_throughput_token_per_s": 15.783685349897771, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05603197252440157, + "ttft_s": 1.9969819119996828, + "end_to_end_latency_s": 9.189665908999814, + "request_output_throughput_token_per_s": 16.322682618211278, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.06065139104459238, + "ttft_s": 0.6045379589995719, + "end_to_end_latency_s": 9.522448955999607, + "request_output_throughput_token_per_s": 15.752250360501296, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057760427687497895, + "ttft_s": 2.0492242869995607, + "end_to_end_latency_s": 9.242141366000396, + "request_output_throughput_token_per_s": 16.23000493714733, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059511331071940864, + "ttft_s": 1.9132805360004568, + "end_to_end_latency_s": 9.105530666999584, + "request_output_throughput_token_per_s": 16.47350445412616, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058881721850013946, + "ttft_s": 2.2293068799999674, + "end_to_end_latency_s": 9.421618972000033, + "request_output_throughput_token_per_s": 15.92083063916963, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05781901272667648, + "ttft_s": 2.118073073999767, + "end_to_end_latency_s": 9.30919091099986, + "request_output_throughput_token_per_s": 16.11310815666677, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.061028833596129775, + "ttft_s": 0.6054196390005018, + "end_to_end_latency_s": 9.521411593000266, + "request_output_throughput_token_per_s": 15.753966576791363, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05845122691300942, + "ttft_s": 
2.220024330999877, + "end_to_end_latency_s": 9.410819564000121, + "request_output_throughput_token_per_s": 15.939100625603926, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05611042473780311, + "ttft_s": 2.0117811000000074, + "end_to_end_latency_s": 9.202283369000725, + "request_output_throughput_token_per_s": 16.300302216871255, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05997345233546155, + "ttft_s": 2.1086423730002934, + "end_to_end_latency_s": 9.296061932999692, + "request_output_throughput_token_per_s": 16.13586495885117, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06111901509094813, + "ttft_s": 2.224391857000228, + "end_to_end_latency_s": 9.413084533000074, + "request_output_throughput_token_per_s": 15.935265371742394, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05670712028392733, + "ttft_s": 1.9983716690003348, + "end_to_end_latency_s": 9.186726691000331, + "request_output_throughput_token_per_s": 16.327904926892593, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0630379058741511, + "ttft_s": 0.6110308609995627, + "end_to_end_latency_s": 9.518894451000051, + "request_output_throughput_token_per_s": 15.758132498700105, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05673246664413834, + "ttft_s": 2.060958365999795, + 
"end_to_end_latency_s": 9.247612275999927, + "request_output_throughput_token_per_s": 16.220403226602706, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05791069267090993, + "ttft_s": 1.963300342000366, + "end_to_end_latency_s": 9.15007787400009, + "request_output_throughput_token_per_s": 16.393303102504124, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05710577741460173, + "ttft_s": 2.177096724999501, + "end_to_end_latency_s": 9.365539757999613, + "request_output_throughput_token_per_s": 16.01616178841982, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05831038883434024, + "ttft_s": 0.5864146180001626, + "end_to_end_latency_s": 9.504792970999915, + "request_output_throughput_token_per_s": 15.781511544508668, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060552024793592396, + "ttft_s": 2.197414283999933, + "end_to_end_latency_s": 9.385933188000308, + "request_output_throughput_token_per_s": 15.981362427741487, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060900198890994246, + "ttft_s": 0.5946105819994045, + "end_to_end_latency_s": 9.500661958999444, + "request_output_throughput_token_per_s": 15.78837355200428, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05716684363749778, + "ttft_s": 1.9580943159999151, + "end_to_end_latency_s": 
9.146868633999475, + "request_output_throughput_token_per_s": 16.399054802475323, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05803217959378344, + "ttft_s": 2.096837002999564, + "end_to_end_latency_s": 9.285345676999896, + "request_output_throughput_token_per_s": 16.154487427598404, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057157949218736806, + "ttft_s": 1.9607403599993631, + "end_to_end_latency_s": 9.14557972700004, + "request_output_throughput_token_per_s": 16.401365957935116, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05860511055628308, + "ttft_s": 2.192319688999305, + "end_to_end_latency_s": 9.377005344999816, + "request_output_throughput_token_per_s": 15.996578276452176, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05891168122977337, + "ttft_s": 0.588970290999896, + "end_to_end_latency_s": 9.484954088999984, + "request_output_throughput_token_per_s": 15.814520407005446, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059093429171987705, + "ttft_s": 2.0914987549995203, + "end_to_end_latency_s": 9.277985202999844, + "request_output_throughput_token_per_s": 16.16730321487262, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_summary.json 
b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..73af550 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 4, + "results_inter_token_latency_s_quantiles_p25": 0.05776911575837913, + "results_inter_token_latency_s_quantiles_p50": 0.058695386015622304, + "results_inter_token_latency_s_quantiles_p75": 0.059911287477422276, + "results_inter_token_latency_s_quantiles_p90": 0.060912036662935286, + "results_inter_token_latency_s_quantiles_p95": 0.061403582825773226, + "results_inter_token_latency_s_quantiles_p99": 0.06335315433604652, + "results_inter_token_latency_s_mean": 0.05885630310777595, + "results_inter_token_latency_s_min": 0.053285515766108064, + "results_inter_token_latency_s_max": 0.06977087910234497, + "results_inter_token_latency_s_stddev": 0.0020006321731086134, + "results_ttft_s_quantiles_p25": 1.5175473412498377, + "results_ttft_s_quantiles_p50": 2.026021761499578, + "results_ttft_s_quantiles_p75": 2.1676567652496033, + "results_ttft_s_quantiles_p90": 2.2178021802998957, + "results_ttft_s_quantiles_p95": 2.224423575400215, + "results_ttft_s_quantiles_p99": 2.2640918301300643, + "results_ttft_s_mean": 1.7164627722299883, + "results_ttft_s_min": 0.5819393110004967, + "results_ttft_s_max": 2.264332115999423, + "results_ttft_s_stddev": 0.6576397397336418, + "results_end_to_end_latency_s_quantiles_p25": 9.208674983000037, + "results_end_to_end_latency_s_quantiles_p50": 9.351767158500024, + "results_end_to_end_latency_s_quantiles_p75": 9.45513059149971, + "results_end_to_end_latency_s_quantiles_p90": 9.511918361700646, 
+ "results_end_to_end_latency_s_quantiles_p95": 9.515792595450103, + "results_end_to_end_latency_s_quantiles_p99": 9.522788643329614, + "results_end_to_end_latency_s_mean": 9.300104119000016, + "results_end_to_end_latency_s_min": 6.188806914000452, + "results_end_to_end_latency_s_max": 9.556417689000227, + "results_end_to_end_latency_s_stddev": 0.3468125556885386, + "results_request_output_throughput_token_per_s_quantiles_p25": 15.811499289843525, + "results_request_output_throughput_token_per_s_quantiles_p50": 16.015117757491083, + "results_request_output_throughput_token_per_s_quantiles_p75": 16.238827688643383, + "results_request_output_throughput_token_per_s_quantiles_p90": 16.372920067882177, + "results_request_output_throughput_token_per_s_quantiles_p95": 16.41324026560619, + "results_request_output_throughput_token_per_s_quantiles_p99": 16.515693894199778, + "results_request_output_throughput_token_per_s_mean": 16.041614503501986, + "results_request_output_throughput_token_per_s_min": 14.219218861219359, + "results_request_output_throughput_token_per_s_max": 16.669468776585582, + "results_request_output_throughput_token_per_s_stddev": 0.30252411389576794, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 157.0, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 161.0, + "results_number_output_tokens_quantiles_p90": 163.0, + "results_number_output_tokens_quantiles_p95": 164.0, + 
"results_number_output_tokens_quantiles_p99": 169.02, + "results_number_output_tokens_mean": 158.22, + "results_number_output_tokens_min": "88", + "results_number_output_tokens_max": "171", + "results_number_output_tokens_stddev": 8.200246362399145, + "results_num_requests_started": 100, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 65.03792192326891, + "results_num_completed_requests": 100, + "results_num_completed_requests_per_min": 24.663603308027646, + "timestamp": 1718206405 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..405e34b --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1410 @@ +[ + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + 
"request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + 
"inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + 
"number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 
0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + 
"end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + 
"error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1486298166089688, + "ttft_s": 0.5925165119997473, + "end_to_end_latency_s": 23.186478292000174, + "request_output_throughput_token_per_s": 6.469287750859222, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14445408820860495, + "ttft_s": 0.8862028060002558, + "end_to_end_latency_s": 23.546282891000374, + "request_output_throughput_token_per_s": 6.370432254397636, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13899034696344909, + "ttft_s": 0.6834267450003608, + "end_to_end_latency_s": 22.794651375000285, + "request_output_throughput_token_per_s": 6.5804910780302786, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14323103176584942, + "ttft_s": 3.3484425749993534, + "end_to_end_latency_s": 22.63076422099948, + "request_output_throughput_token_per_s": 6.628145586918023, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14419549936538273, + "ttft_s": 3.208448164999936, + "end_to_end_latency_s": 22.494723792000514, + "request_output_throughput_token_per_s": 6.668230354237219, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.140662786540307, + "ttft_s": 3.3625309539993395, + 
"end_to_end_latency_s": 22.64698636799949, + "request_output_throughput_token_per_s": 6.6233978138456475, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13709811333953362, + "ttft_s": 2.9251054039996234, + "end_to_end_latency_s": 22.210132103999968, + "request_output_throughput_token_per_s": 6.753674372471901, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14436003569035458, + "ttft_s": 3.0878891850006767, + "end_to_end_latency_s": 22.376011555000332, + "request_output_throughput_token_per_s": 6.703607550045251, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13536974035029345, + "ttft_s": 9.356663462000142, + "end_to_end_latency_s": 21.253269284999988, + "request_output_throughput_token_per_s": 7.057737705599306, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13012541461872615, + "ttft_s": 8.923292153000148, + "end_to_end_latency_s": 20.82029323200004, + "request_output_throughput_token_per_s": 7.204509481617454, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13137663841713845, + "ttft_s": 9.517192184000123, + "end_to_end_latency_s": 21.41516160500032, + "request_output_throughput_token_per_s": 7.00438328539047, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1335508405030828, + "ttft_s": 9.603420685999481, + "end_to_end_latency_s": 
21.50229446799949, + "request_output_throughput_token_per_s": 6.975999711251073, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1318781887468066, + "ttft_s": 8.937602965999758, + "end_to_end_latency_s": 20.836997524999788, + "request_output_throughput_token_per_s": 7.198733878047122, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13628057242578154, + "ttft_s": 9.223695871999553, + "end_to_end_latency_s": 21.123691968999992, + "request_output_throughput_token_per_s": 7.10103140209259, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13301367912344525, + "ttft_s": 9.652097737999611, + "end_to_end_latency_s": 21.54845928799932, + "request_output_throughput_token_per_s": 6.961054523445088, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1315031856144238, + "ttft_s": 9.929250950000096, + "end_to_end_latency_s": 21.82974085299975, + "request_output_throughput_token_per_s": 6.8713596285953, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13861438845394428, + "ttft_s": 9.169120395000391, + "end_to_end_latency_s": 21.06961028800015, + "request_output_throughput_token_per_s": 7.11925839869141, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13098161101248137, + "ttft_s": 9.057461857999442, + "end_to_end_latency_s": 20.95730058299978, + 
"request_output_throughput_token_per_s": 7.157410345189092, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14005612856774186, + "ttft_s": 9.807779858000686, + "end_to_end_latency_s": 21.708905149000202, + "request_output_throughput_token_per_s": 6.909606862735232, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1392474749370692, + "ttft_s": 10.24326357300015, + "end_to_end_latency_s": 22.14057359599974, + "request_output_throughput_token_per_s": 6.774892228948455, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13093407689504244, + "ttft_s": 9.313906544999554, + "end_to_end_latency_s": 21.211521940999774, + "request_output_throughput_token_per_s": 7.071628354496564, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + 
"end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + 
"error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + 
"number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + 
"request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + 
"inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + 
"number_input_tokens": 550 + }, + { + "error_code": -1, + "error_msg": "", + "inter_token_latency_s": 0.0, + "ttft_s": 0, + "end_to_end_latency_s": 0, + "request_output_throughput_token_per_s": 0, + "number_total_tokens": 551, + "number_output_tokens": 1, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14395980998779123, + "ttft_s": 3.2218972179998673, + "end_to_end_latency_s": 23.609614591000536, + "request_output_throughput_token_per_s": 6.353343864290639, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.15299945056132505, + "ttft_s": 3.3274701670006834, + "end_to_end_latency_s": 23.71510483900056, + "request_output_throughput_token_per_s": 6.325082727583739, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1474823745632743, + "ttft_s": 2.913854379999975, + "end_to_end_latency_s": 23.302756097999918, + "request_output_throughput_token_per_s": 6.437006823105982, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14536718319257444, + "ttft_s": 3.0148371209998004, + "end_to_end_latency_s": 23.404350607000197, + "request_output_throughput_token_per_s": 6.409064815288457, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14888304559997892, + "ttft_s": 0.5813920509999662, + "end_to_end_latency_s": 23.821505848000015, + "request_output_throughput_token_per_s": 6.296831147330409, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.143322959945103, + "ttft_s": 3.115560797999933, + "end_to_end_latency_s": 23.506501361999653, + "request_output_throughput_token_per_s": 6.381213337110571, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1347556232072781, + "ttft_s": 10.211136273000193, + "end_to_end_latency_s": 22.1001230690008, + "request_output_throughput_token_per_s": 6.787292520121783, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14339720210631982, + "ttft_s": 11.055061028000637, + "end_to_end_latency_s": 22.943796041000496, + "request_output_throughput_token_per_s": 6.537715020302239, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14337546705734264, + "ttft_s": 10.620353262000208, + "end_to_end_latency_s": 22.510182349000388, + "request_output_throughput_token_per_s": 6.663651039089031, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1456113281854727, + "ttft_s": 10.098225970000385, + "end_to_end_latency_s": 21.987505430000056, + "request_output_throughput_token_per_s": 6.8220563027280905, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14626503222014595, + "ttft_s": 11.366292114999851, + "end_to_end_latency_s": 23.25638450599945, + "request_output_throughput_token_per_s": 6.449841761143246, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1427560279872772, + "ttft_s": 
10.522444181000537, + "end_to_end_latency_s": 22.412922556000012, + "request_output_throughput_token_per_s": 6.692567630357716, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1345587691151443, + "ttft_s": 10.311210351999762, + "end_to_end_latency_s": 22.20244073699996, + "request_output_throughput_token_per_s": 6.756013979581431, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14836655994811546, + "ttft_s": 10.957157557000755, + "end_to_end_latency_s": 22.848659715000394, + "request_output_throughput_token_per_s": 6.564936493912742, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14647997863293416, + "ttft_s": 11.25230756900055, + "end_to_end_latency_s": 23.144138472000122, + "request_output_throughput_token_per_s": 6.481122647164881, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13751623017719505, + "ttft_s": 9.8352132480004, + "end_to_end_latency_s": 21.727762630000143, + "request_output_throughput_token_per_s": 6.9036100289907765, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.13687863499997804, + "ttft_s": 10.417981276000319, + "end_to_end_latency_s": 22.31152420000035, + "request_output_throughput_token_per_s": 6.722983094090795, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1382121469240114, + "ttft_s": 9.944441104000362, + 
"end_to_end_latency_s": 21.837752905000343, + "request_output_throughput_token_per_s": 6.868838595827021, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14489134604458173, + "ttft_s": 10.853556992999984, + "end_to_end_latency_s": 22.748186030999932, + "request_output_throughput_token_per_s": 6.593932359951187, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.15061510696079874, + "ttft_s": 11.151011865000328, + "end_to_end_latency_s": 23.044387597000423, + "request_output_throughput_token_per_s": 6.509177098701672, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.14224485205030868, + "ttft_s": 10.724074358000507, + "end_to_end_latency_s": 22.61715798700061, + "request_output_throughput_token_per_s": 6.632133006552533, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..5119f6a --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 64, + "results_inter_token_latency_s_quantiles_p25": 0.13559744836916549, + 
"results_inter_token_latency_s_quantiles_p50": 0.14145381929530784, + "results_inter_token_latency_s_quantiles_p75": 0.14478203158558753, + "results_inter_token_latency_s_quantiles_p90": 0.14827814140963133, + "results_inter_token_latency_s_quantiles_p95": 0.14887038415042841, + "results_inter_token_latency_s_quantiles_p99": 0.15202186968510925, + "results_inter_token_latency_s_mean": 0.1405355425622709, + "results_inter_token_latency_s_min": 0.13012541461872615, + "results_inter_token_latency_s_max": 0.15299945056132505, + "results_inter_token_latency_s_stddev": 0.0061116513696183525, + "results_ttft_s_quantiles_p25": 3.2482904552500713, + "results_ttft_s_quantiles_p50": 9.436927823000133, + "results_ttft_s_quantiles_p75": 10.294223657249859, + "results_ttft_s_quantiles_p90": 10.946797500600677, + "results_ttft_s_quantiles_p95": 11.146214323150343, + "results_ttft_s_quantiles_p99": 11.319558451140137, + "results_ttft_s_mean": 7.531542606476272, + "results_ttft_s_min": 0.5813920509999662, + "results_ttft_s_max": 11.366292114999851, + "results_ttft_s_stddev": 3.745885076726451, + "results_end_to_end_latency_s_quantiles_p25": 21.713619519250187, + "results_end_to_end_latency_s_quantiles_p50": 22.394467055500172, + "results_end_to_end_latency_s_quantiles_p75": 23.01923970800044, + "results_end_to_end_latency_s_quantiles_p90": 23.496286286499707, + "results_end_to_end_latency_s_quantiles_p95": 23.606448006000527, + "results_end_to_end_latency_s_quantiles_p99": 23.77788143431024, + "results_end_to_end_latency_s_mean": 22.341823998809602, + "results_end_to_end_latency_s_min": 20.82029323200004, + "results_end_to_end_latency_s_max": 23.821505848000015, + "results_end_to_end_latency_s_stddev": 0.8602269181013169, + "results_request_output_throughput_token_per_s_quantiles_p25": 6.516311579101814, + "results_request_output_throughput_token_per_s_quantiles_p50": 6.698087590201483, + "results_request_output_throughput_token_per_s_quantiles_p75": 6.908107654299118, + 
"results_request_output_throughput_token_per_s_quantiles_p90": 7.098091097332987, + "results_request_output_throughput_token_per_s_quantiles_p95": 7.155502747864208, + "results_request_output_throughput_token_per_s_quantiles_p99": 7.202141484153618, + "results_request_output_throughput_token_per_s_mean": 6.723649687145935, + "results_request_output_throughput_token_per_s_min": 6.296831147330409, + "results_request_output_throughput_token_per_s_max": 7.204509481617454, + "results_request_output_throughput_token_per_s_stddev": 0.2604966544351911, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 157.0, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 162.0, + "results_number_output_tokens_quantiles_p90": 164.0, + "results_number_output_tokens_quantiles_p95": 164.0, + "results_number_output_tokens_quantiles_p99": 165.59, + "results_number_output_tokens_mean": 159.04761904761904, + "results_number_output_tokens_min": "151", + "results_number_output_tokens_max": "166", + "results_number_output_tokens_stddev": 3.6755778105734014, + "results_num_requests_started": 128, + "results_error_rate": 0.671875, + "results_number_errors": 86, + "results_error_code_frequency": "{-1.0: 86}", + "results_mean_output_throughput_token_per_s": 71.91492735226427, + "results_num_completed_requests": 42, + "results_num_completed_requests_per_min": 27.129583372411076, + "timestamp": 1718207004 +} \ No newline at end of file diff --git 
a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..def7ca8 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1146 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.09451980468477063, + "ttft_s": 4.251671874999374, + "end_to_end_latency_s": 8.749650034999831, + "request_output_throughput_token_per_s": 9.828964547837673, + "number_total_tokens": 642, + "number_output_tokens": 92, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07894236746207316, + "ttft_s": 0.608746771999904, + "end_to_end_latency_s": 12.473087952000242, + "request_output_throughput_token_per_s": 12.0258913091321, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07356788208284418, + "ttft_s": 4.551225029000307, + "end_to_end_latency_s": 12.43316751400016, + "request_output_throughput_token_per_s": 12.064504063915733, + "number_total_tokens": 719, + "number_output_tokens": 169, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07411416631704769, + "ttft_s": 4.272414478000428, + "end_to_end_latency_s": 12.154930620000414, + "request_output_throughput_token_per_s": 12.340671015693127, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0761423900310671, + "ttft_s": 4.37736031500026, + "end_to_end_latency_s": 12.259155357999589, + "request_output_throughput_token_per_s": 12.235753248866285, + 
"number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07772984786074227, + "ttft_s": 4.39970540000013, + "end_to_end_latency_s": 12.281523453000773, + "request_output_throughput_token_per_s": 12.213468514229817, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07645342752795548, + "ttft_s": 4.427380072000233, + "end_to_end_latency_s": 12.310174016999554, + "request_output_throughput_token_per_s": 12.185043021557592, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0774108639746638, + "ttft_s": 4.348860512000101, + "end_to_end_latency_s": 12.231156200999976, + "request_output_throughput_token_per_s": 12.263762929275364, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07748369893123482, + "ttft_s": 2.783591279000575, + "end_to_end_latency_s": 12.397641051999926, + "request_output_throughput_token_per_s": 12.09907589442612, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07983966278565773, + "ttft_s": 2.681553160000476, + "end_to_end_latency_s": 12.295544632000201, + "request_output_throughput_token_per_s": 12.199540930428753, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07966995980388995, + "ttft_s": 2.574488635999842, + "end_to_end_latency_s": 12.189681392999773, + "request_output_throughput_token_per_s": 12.30548979615999, + "number_total_tokens": 703, + 
"number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07964007292351888, + "ttft_s": 0.5904932390003523, + "end_to_end_latency_s": 12.503680921000523, + "request_output_throughput_token_per_s": 11.996467356110145, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0731054757987949, + "ttft_s": 2.3726948860003176, + "end_to_end_latency_s": 11.989488754000377, + "request_output_throughput_token_per_s": 12.510958813815265, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07513052567948109, + "ttft_s": 3.83707039500041, + "end_to_end_latency_s": 11.72055761899992, + "request_output_throughput_token_per_s": 12.798025902525195, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07543991229297016, + "ttft_s": 3.960289463000663, + "end_to_end_latency_s": 11.844244973000059, + "request_output_throughput_token_per_s": 12.664378383082878, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07206684999999209, + "ttft_s": 3.7200747240003693, + "end_to_end_latency_s": 11.603442786999949, + "request_output_throughput_token_per_s": 12.927197794093855, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07446083431250372, + "ttft_s": 2.3133415499996772, + "end_to_end_latency_s": 11.913910680999834, + "request_output_throughput_token_per_s": 12.590324370923668, + "number_total_tokens": 710, + "number_output_tokens": 160, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07818285128205506, + "ttft_s": 2.5961459820000528, + "end_to_end_latency_s": 12.196883605000039, + "request_output_throughput_token_per_s": 12.298223452629195, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0748491144550679, + "ttft_s": 0.5904966400003104, + "end_to_end_latency_s": 12.499994454999978, + "request_output_throughput_token_per_s": 12.000005323202382, + "number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07558905128930832, + "ttft_s": 2.418725491999794, + "end_to_end_latency_s": 12.019053541000176, + "request_output_throughput_token_per_s": 12.480184025165563, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07753730942136632, + "ttft_s": 2.7274404729996604, + "end_to_end_latency_s": 12.33172242099954, + "request_output_throughput_token_per_s": 12.163750924572128, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07040465286821175, + "ttft_s": 3.8755974449995847, + "end_to_end_latency_s": 11.75811019699995, + "request_output_throughput_token_per_s": 12.757152083697267, + "number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07467696022429383, + "ttft_s": 3.767617309000343, + "end_to_end_latency_s": 11.649802808000459, + "request_output_throughput_token_per_s": 12.875754420236886, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + 
}, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07231389419510423, + "ttft_s": 3.9777027250001993, + "end_to_end_latency_s": 11.859962784000345, + "request_output_throughput_token_per_s": 12.647594493496822, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07468007810626887, + "ttft_s": 2.34073429900036, + "end_to_end_latency_s": 11.948995667999952, + "request_output_throughput_token_per_s": 12.553356296019757, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07824500192258593, + "ttft_s": 2.5195780960002594, + "end_to_end_latency_s": 12.128162779000377, + "request_output_throughput_token_per_s": 12.367907879643685, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07784081059751015, + "ttft_s": 2.7682436330005658, + "end_to_end_latency_s": 12.376875427000414, + "request_output_throughput_token_per_s": 12.1193754340269, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07859457264144884, + "ttft_s": 0.5894232459995692, + "end_to_end_latency_s": 12.496720752999863, + "request_output_throughput_token_per_s": 12.003148903202641, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07891850662583766, + "ttft_s": 2.6241530079996664, + "end_to_end_latency_s": 12.233158527000342, + "request_output_throughput_token_per_s": 12.261755593939897, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, 
+ "error_msg": "", + "inter_token_latency_s": 0.0733201356851741, + "ttft_s": 3.9919746960003977, + "end_to_end_latency_s": 11.878086104000431, + "request_output_throughput_token_per_s": 12.628297074684562, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0728566361812284, + "ttft_s": 3.7714911190005296, + "end_to_end_latency_s": 11.657241752000118, + "request_output_throughput_token_per_s": 12.867537895425682, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07403004898111798, + "ttft_s": 3.885222586999589, + "end_to_end_latency_s": 11.770972829999664, + "request_output_throughput_token_per_s": 12.743211811491733, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.09355339796589264, + "ttft_s": 3.9986038110000663, + "end_to_end_latency_s": 8.286535496999932, + "request_output_throughput_token_per_s": 9.895570957209728, + "number_total_tokens": 638, + "number_output_tokens": 88, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07493125089499868, + "ttft_s": 3.111814307999339, + "end_to_end_latency_s": 12.139335891999508, + "request_output_throughput_token_per_s": 12.356524387702153, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07930809106406765, + "ttft_s": 3.3432742360000702, + "end_to_end_latency_s": 12.372368350000215, + "request_output_throughput_token_per_s": 12.123790349322844, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.08053898905160285, + "ttft_s": 0.5946043820003979, + "end_to_end_latency_s": 12.483769018000203, + "request_output_throughput_token_per_s": 12.015602001584355, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07714154331446159, + "ttft_s": 3.2373111659999267, + "end_to_end_latency_s": 12.265752250999867, + "request_output_throughput_token_per_s": 12.229172490237804, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07813665142211076, + "ttft_s": 3.0049047369993787, + "end_to_end_latency_s": 12.033228090999728, + "request_output_throughput_token_per_s": 12.465482983090194, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0760097663184236, + "ttft_s": 2.9051899179994507, + "end_to_end_latency_s": 11.933725733999381, + "request_output_throughput_token_per_s": 12.569419085327857, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07268374413576376, + "ttft_s": 3.899361047000639, + "end_to_end_latency_s": 11.774944980999862, + "request_output_throughput_token_per_s": 12.738913026094059, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07332057251223, + "ttft_s": 2.4198610359999293, + "end_to_end_latency_s": 12.025181607999912, + "request_output_throughput_token_per_s": 12.473824087630453, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.0745543013062047, + "ttft_s": 2.32361727100033, + "end_to_end_latency_s": 11.92891604200031, + "request_output_throughput_token_per_s": 12.574487025633147, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07807127272499201, + "ttft_s": 0.5933349959996121, + "end_to_end_latency_s": 12.491601267999613, + "request_output_throughput_token_per_s": 12.008068203734844, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07441028261443962, + "ttft_s": 2.746366565000244, + "end_to_end_latency_s": 12.352309834000152, + "request_output_throughput_token_per_s": 12.143477779930674, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08001598015036114, + "ttft_s": 2.637145913000495, + "end_to_end_latency_s": 12.242626137000116, + "request_output_throughput_token_per_s": 12.252273190526049, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07321101093749575, + "ttft_s": 3.8325501469998926, + "end_to_end_latency_s": 11.713947494999957, + "request_output_throughput_token_per_s": 12.805247766735064, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07362321395650939, + "ttft_s": 3.9713236629995663, + "end_to_end_latency_s": 11.85350980200019, + "request_output_throughput_token_per_s": 12.654479770598295, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07145829810492671, + "ttft_s": 
3.6952673380001215, + "end_to_end_latency_s": 11.576553025000067, + "request_output_throughput_token_per_s": 12.957224803969586, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08023606185067612, + "ttft_s": 2.183523866999167, + "end_to_end_latency_s": 12.3566176789991, + "request_output_throughput_token_per_s": 12.13924424115954, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07661735268710203, + "ttft_s": 0.5868381930004034, + "end_to_end_latency_s": 12.488806513000782, + "request_output_throughput_token_per_s": 12.010755378734613, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07957235644809842, + "ttft_s": 2.08075548800025, + "end_to_end_latency_s": 12.254324665999775, + "request_output_throughput_token_per_s": 12.240576619957064, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07681261117087057, + "ttft_s": 1.9625760629996876, + "end_to_end_latency_s": 12.136662187999718, + "request_output_throughput_token_per_s": 12.359246527295983, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06868346208534049, + "ttft_s": 3.3865884080005344, + "end_to_end_latency_s": 11.264309359000436, + "request_output_throughput_token_per_s": 13.316395636821412, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0724835464223711, + "ttft_s": 3.792426195999724, + 
"end_to_end_latency_s": 11.670037445000162, + "request_output_throughput_token_per_s": 12.853429194802205, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07474458065186083, + "ttft_s": 3.931849032000173, + "end_to_end_latency_s": 11.809849995999684, + "request_output_throughput_token_per_s": 12.701262086377817, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07175380103726864, + "ttft_s": 3.6743703330002973, + "end_to_end_latency_s": 11.552559458999895, + "request_output_throughput_token_per_s": 12.984135726143712, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07569849148102306, + "ttft_s": 2.347952740000437, + "end_to_end_latency_s": 11.960819918000198, + "request_output_throughput_token_per_s": 12.540946275285066, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08014864574997424, + "ttft_s": 0.584443392999674, + "end_to_end_latency_s": 12.503382398999747, + "request_output_throughput_token_per_s": 11.9967537753624, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07739232482497868, + "ttft_s": 2.7697847519993957, + "end_to_end_latency_s": 12.383132396999827, + "request_output_throughput_token_per_s": 12.113251735590088, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07648168018352672, + "ttft_s": 2.471717106000142, + "end_to_end_latency_s": 
12.08570840900029, + "request_output_throughput_token_per_s": 12.41135355278754, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07583041828566928, + "ttft_s": 2.5957761669997126, + "end_to_end_latency_s": 12.209676638000019, + "request_output_throughput_token_per_s": 12.285337642207242, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07394752151554057, + "ttft_s": 4.025471183000263, + "end_to_end_latency_s": 11.906075640999916, + "request_output_throughput_token_per_s": 12.598609694991191, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07287916032098031, + "ttft_s": 3.9257156629992096, + "end_to_end_latency_s": 11.806737655999314, + "request_output_throughput_token_per_s": 12.704610229378735, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07225439798149176, + "ttft_s": 3.8248146480000287, + "end_to_end_latency_s": 11.705423015999258, + "request_output_throughput_token_per_s": 12.814573193551086, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.16623768413158982, + "ttft_s": 2.7920753169992167, + "end_to_end_latency_s": 6.370256893999795, + "request_output_throughput_token_per_s": 5.651263457508086, + "number_total_tokens": 588, + "number_output_tokens": 38, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07724397709432579, + "ttft_s": 2.6849319220000325, + "end_to_end_latency_s": 12.281957927999429, + 
"request_output_throughput_token_per_s": 12.213036462048283, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08112379427273647, + "ttft_s": 0.5852852840007472, + "end_to_end_latency_s": 12.493267240000023, + "request_output_throughput_token_per_s": 12.006466932824509, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0734682621454395, + "ttft_s": 2.5251856780005255, + "end_to_end_latency_s": 12.122455787000035, + "request_output_throughput_token_per_s": 12.373730425221106, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07455689182609926, + "ttft_s": 2.406098041000405, + "end_to_end_latency_s": 12.003867727000397, + "request_output_throughput_token_per_s": 12.49597241584092, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07173594383951319, + "ttft_s": 3.752823740999702, + "end_to_end_latency_s": 11.621653968999453, + "request_output_throughput_token_per_s": 12.906940819277722, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07499128046540891, + "ttft_s": 4.053904346000309, + "end_to_end_latency_s": 11.92388488800043, + "request_output_throughput_token_per_s": 12.579792694153909, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07385545367296857, + "ttft_s": 3.8715767240000787, + "end_to_end_latency_s": 11.743389808999382, + 
"request_output_throughput_token_per_s": 12.773143226928362, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07856227611723468, + "ttft_s": 2.521986936999383, + "end_to_end_latency_s": 11.44573760899948, + "request_output_throughput_token_per_s": 11.96952129081489, + "number_total_tokens": 695, + "number_output_tokens": 145, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07554193393827047, + "ttft_s": 2.6246841350002796, + "end_to_end_latency_s": 12.237967798000682, + "request_output_throughput_token_per_s": 12.256936974822365, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07665782074535916, + "ttft_s": 2.729272006999963, + "end_to_end_latency_s": 12.342094294000162, + "request_output_throughput_token_per_s": 12.153528925226183, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07536101133734836, + "ttft_s": 0.5893350149999605, + "end_to_end_latency_s": 12.510303247000593, + "request_output_throughput_token_per_s": 11.990117029014725, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07369526450308117, + "ttft_s": 2.3983828629998243, + "end_to_end_latency_s": 12.012636827999813, + "request_output_throughput_token_per_s": 12.486850484846967, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07127567598170727, + "ttft_s": 3.8008638930004963, + "end_to_end_latency_s": 11.689468983000552, + 
"request_output_throughput_token_per_s": 12.83206279242778, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07563577435441886, + "ttft_s": 4.062436213999717, + "end_to_end_latency_s": 11.950636468999619, + "request_output_throughput_token_per_s": 12.5516327426665, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07629164981932968, + "ttft_s": 3.9371745490007015, + "end_to_end_latency_s": 11.825425343999996, + "request_output_throughput_token_per_s": 12.684533167858293, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07776860519230375, + "ttft_s": 2.5204864180004733, + "end_to_end_latency_s": 12.13209241200002, + "request_output_throughput_token_per_s": 12.363901865075881, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08066675179996954, + "ttft_s": 0.5866109199996572, + "end_to_end_latency_s": 12.503670162999697, + "request_output_throughput_token_per_s": 11.996477677720044, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07611788263974355, + "ttft_s": 2.6419568250003067, + "end_to_end_latency_s": 12.255189919000259, + "request_output_throughput_token_per_s": 12.239712398699126, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07975058941288632, + "ttft_s": 2.7488725059993158, + "end_to_end_latency_s": 12.36153534299956, + 
"request_output_throughput_token_per_s": 12.134415008969436, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07654839885991774, + "ttft_s": 2.406431716000043, + "end_to_end_latency_s": 12.018293002999599, + "request_output_throughput_token_per_s": 12.480973792414787, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07540040617416033, + "ttft_s": 3.805429740999898, + "end_to_end_latency_s": 11.687340419000066, + "request_output_throughput_token_per_s": 12.834399839688555, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07349424830245868, + "ttft_s": 4.024762453999756, + "end_to_end_latency_s": 11.907487792999746, + "request_output_throughput_token_per_s": 12.597115580347939, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07662298444802218, + "ttft_s": 3.9186175029999504, + "end_to_end_latency_s": 11.800120306999816, + "request_output_throughput_token_per_s": 12.711734804179937, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07613748306832677, + "ttft_s": 3.225624284999867, + "end_to_end_latency_s": 12.259537822000311, + "request_output_throughput_token_per_s": 12.235371526879097, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07930068784611859, + "ttft_s": 3.3384630769996875, + "end_to_end_latency_s": 12.371527432999756, + 
"request_output_throughput_token_per_s": 12.124614427147508, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07785991873716878, + "ttft_s": 3.112577279999641, + "end_to_end_latency_s": 12.146335815999919, + "request_output_throughput_token_per_s": 12.349403332189329, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07229611252729559, + "ttft_s": 2.8934266690002914, + "end_to_end_latency_s": 11.929806178000035, + "request_output_throughput_token_per_s": 12.573548787122597, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07848992477359977, + "ttft_s": 0.5889905379999618, + "end_to_end_latency_s": 12.480193381999925, + "request_output_throughput_token_per_s": 12.019044529898368, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08024334388000473, + "ttft_s": 2.999269982000442, + "end_to_end_latency_s": 12.036694584000543, + "request_output_throughput_token_per_s": 12.4618930017036, + "number_total_tokens": 700, + "number_output_tokens": 150, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07198491259624681, + "ttft_s": 3.7115181239996673, + "end_to_end_latency_s": 11.58981820200006, + "request_output_throughput_token_per_s": 12.94239455577607, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07316979820983267, + "ttft_s": 3.9754333169994425, + "end_to_end_latency_s": 11.853696444999514, + 
"request_output_throughput_token_per_s": 12.654280518823102, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07047289831181629, + "ttft_s": 2.3756073210006434, + "end_to_end_latency_s": 11.980597025000861, + "request_output_throughput_token_per_s": 12.520244165376994, + "number_total_tokens": 720, + "number_output_tokens": 170, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07741672940882376, + "ttft_s": 2.704058339999392, + "end_to_end_latency_s": 12.309534109999731, + "request_output_throughput_token_per_s": 12.18567645692996, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07505756055277699, + "ttft_s": 2.4787255589999404, + "end_to_end_latency_s": 12.084464632999698, + "request_output_throughput_token_per_s": 12.412630973356233, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07899746125313649, + "ttft_s": 0.5852612690005117, + "end_to_end_latency_s": 12.48333854000066, + "request_output_throughput_token_per_s": 12.016016350061438, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07721197331019412, + "ttft_s": 2.5942867169997044, + "end_to_end_latency_s": 12.199671474999377, + "request_output_throughput_token_per_s": 12.295413061523254, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07470380815922596, + "ttft_s": 3.844979370999681, + "end_to_end_latency_s": 11.728703556000255, + 
"request_output_throughput_token_per_s": 12.789137289028156, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07263752038750226, + "ttft_s": 3.7392858889998024, + "end_to_end_latency_s": 11.622199554999497, + "request_output_throughput_token_per_s": 12.906334923106256, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07358875159877278, + "ttft_s": 4.038558043999728, + "end_to_end_latency_s": 11.921562911000365, + "request_output_throughput_token_per_s": 12.58224287535242, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..ff41e27 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/tgi/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 8, + "results_inter_token_latency_s_quantiles_p25": 0.07361459836707523, + "results_inter_token_latency_s_quantiles_p50": 0.07576445488334617, + "results_inter_token_latency_s_quantiles_p75": 0.0780876173992717, + "results_inter_token_latency_s_quantiles_p90": 0.07981294077382631, + "results_inter_token_latency_s_quantiles_p95": 0.08049464227586313, + "results_inter_token_latency_s_quantiles_p99": 0.09449081248320429, + "results_inter_token_latency_s_mean": 
0.07698495761357861, + "results_inter_token_latency_s_min": 0.06868346208534049, + "results_inter_token_latency_s_max": 0.16623768413158982, + "results_inter_token_latency_s_stddev": 0.009578760759440424, + "results_ttft_s_quantiles_p25": 2.4195771499998955, + "results_ttft_s_quantiles_p50": 2.842750992999754, + "results_ttft_s_quantiles_p75": 3.8516287092497805, + "results_ttft_s_quantiles_p90": 4.0252585643001115, + "results_ttft_s_quantiles_p95": 4.26930308755027, + "results_ttft_s_quantiles_p99": 4.426549831840229, + "results_ttft_s_mean": 2.9150951233846345, + "results_ttft_s_min": 0.584443392999674, + "results_ttft_s_max": 4.551225029000307, + "results_ttft_s_stddev": 1.1043842372796846, + "results_end_to_end_latency_s_quantiles_p25": 11.809071910999592, + "results_end_to_end_latency_s_quantiles_p50": 12.034961337500135, + "results_end_to_end_latency_s_quantiles_p75": 12.299042001500084, + "results_end_to_end_latency_s_quantiles_p90": 12.482394992600438, + "results_end_to_end_latency_s_quantiles_p95": 12.496202726049887, + "results_end_to_end_latency_s_quantiles_p99": 12.503680598260498, + "results_end_to_end_latency_s_mean": 11.941957940932683, + "results_end_to_end_latency_s_min": 6.370256893999795, + "results_end_to_end_latency_s_max": 12.510303247000593, + "results_end_to_end_latency_s_stddev": 0.793922218426776, + "results_request_output_throughput_token_per_s_quantiles_p25": 12.151016138902307, + "results_request_output_throughput_token_per_s_quantiles_p50": 12.370819152432396, + "results_request_output_throughput_token_per_s_quantiles_p75": 12.6543303317669, + "results_request_output_throughput_token_per_s_quantiles_p90": 12.833698725510322, + "results_request_output_throughput_token_per_s_quantiles_p95": 12.906849934852001, + "results_request_output_throughput_token_per_s_quantiles_p99": 12.98332839847849, + "results_request_output_throughput_token_per_s_mean": 12.314455004866655, + "results_request_output_throughput_token_per_s_min": 
5.651263457508086, + "results_request_output_throughput_token_per_s_max": 13.316395636821412, + "results_request_output_throughput_token_per_s_stddev": 0.8065536661208635, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 156.0, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 162.0, + "results_number_output_tokens_quantiles_p90": 164.0, + "results_number_output_tokens_quantiles_p95": 165.85, + "results_number_output_tokens_quantiles_p99": 168.94, + "results_number_output_tokens_mean": 156.83653846153845, + "results_number_output_tokens_min": "38", + "results_number_output_tokens_max": "170", + "results_number_output_tokens_stddev": 15.660915779323059, + "results_num_requests_started": 104, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 96.03924737558681, + "results_num_completed_requests": 104, + "results_num_completed_requests_per_min": 36.74115036623516, + "timestamp": 1718206595 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv b/benchmarks/results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv new file mode 100644 index 0000000..359ff3c --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv @@ -0,0 +1,16 @@ +,1,2,4,8,16,32,64 +concurrency,1.0,2.0,4.0,8.0,16.0,32.0,64.0 +mean_input_token_length,550.0,550.0,550.0,550.0,550.0,550.0,550.0 
+mean_output_token_length,159.0375,159.22,159.18,159.5,159.27678571428572,160.0,159.40625 +time_to_first_token_in_ms_(ttft)_p50,586.917666500085,593.2397525002671,2035.751768499722,3176.6039474996433,6561.932362000334,9879.810319000171,17220.491680999658 +time_to_first_token_in_ms_(ttft)_p75,588.4995797496231,1060.5307812497813,2150.540910750351,3838.76859575048,7219.509921999816,13748.74519900004,25327.48620450093 +time_to_first_token_in_ms_(ttft)_p95,592.0850745994358,1091.2284775492935,2235.427715849619,4254.826318100367,7814.290068099763,14925.1189030987,29665.552642800958 +throughput_token_per_s_(token/sec)_p50,20.305458996005164,18.483502378173807,15.919263824400435,12.154551419986344,8.382047222156121,5.108561161797954,3.08544989041975 +throughput_token_per_s_(token/sec)_p75,20.350613901547643,18.62048349387721,16.125915244796303,12.34099733333681,8.560894932024599,5.273474722315811,3.1354960779360517 +throughput_token_per_s_(token/sec)_p95,20.389396045804336,18.740678393133553,16.28105133190631,12.524066100837192,8.716126272636831,5.4585904110907855,3.251683697677572 +latency_ms_per_token_(inter_token_latency)_p50,46.70670377673851,51.326324681221536,59.637933259232845,78.06983328522192,113.26328822561354,183.93538549267723,307.45351493844254 +latency_ms_per_token_(inter_token_latency)_p75,47.63683723238133,52.004817064281525,60.647559917482496,79.20261970734134,116.40027259278233,188.72039812495933,319.2453598913946 +latency_ms_per_token_(inter_token_latency)_p95,48.62765116823143,52.996850070530854,61.93614683918202,81.29656306738025,120.10312021074212,194.89754481931246,330.42386755233827 +requests_per_minute_(qpm),7.915935703584854,14.13572641706053,24.32544640710275,36.044085449240555,46.821420363866935,52.57208219530792,50.95750230507558 +results_number_errors,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +results_num_completed_requests,80.0,100.0,100.0,104.0,112.0,128.0,128.0 diff --git a/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_1.json 
b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_1.json new file mode 100644 index 0000000..4c913e6 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_1.json @@ -0,0 +1,17 @@ +{ + "concurrency": 1, + "mean_input_token_length": 550.0, + "mean_output_token_length": 159.0375, + "time_to_first_token_in_ms_(ttft)_p50": 586.917666500085, + "time_to_first_token_in_ms_(ttft)_p75": 588.4995797496231, + "time_to_first_token_in_ms_(ttft)_p95": 592.0850745994358, + "throughput_token_per_s_(token/sec)_p50": 20.305458996005164, + "throughput_token_per_s_(token/sec)_p75": 20.350613901547643, + "throughput_token_per_s_(token/sec)_p95": 20.389396045804336, + "latency_ms_per_token_(inter_token_latency)_p50": 46.70670377673851, + "latency_ms_per_token_(inter_token_latency)_p75": 47.63683723238133, + "latency_ms_per_token_(inter_token_latency)_p95": 48.62765116823143, + "requests_per_minute_(qpm)": 7.915935703584854, + "results_number_errors": 0, + "results_num_completed_requests": 80 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_16.json b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_16.json new file mode 100644 index 0000000..09a5af2 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_16.json @@ -0,0 +1,17 @@ +{ + "concurrency": 16, + "mean_input_token_length": 550.0, + "mean_output_token_length": 159.27678571428572, + "time_to_first_token_in_ms_(ttft)_p50": 6561.932362000334, + "time_to_first_token_in_ms_(ttft)_p75": 7219.509921999816, + "time_to_first_token_in_ms_(ttft)_p95": 7814.290068099763, + "throughput_token_per_s_(token/sec)_p50": 8.382047222156121, + "throughput_token_per_s_(token/sec)_p75": 8.560894932024599, + "throughput_token_per_s_(token/sec)_p95": 8.716126272636831, + "latency_ms_per_token_(inter_token_latency)_p50": 113.26328822561354, + "latency_ms_per_token_(inter_token_latency)_p75": 116.40027259278233, + 
"latency_ms_per_token_(inter_token_latency)_p95": 120.10312021074212, + "requests_per_minute_(qpm)": 46.821420363866935, + "results_number_errors": 0, + "results_num_completed_requests": 112 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_2.json b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_2.json new file mode 100644 index 0000000..acb57cf --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_2.json @@ -0,0 +1,17 @@ +{ + "concurrency": 2, + "mean_input_token_length": 550.0, + "mean_output_token_length": 159.22, + "time_to_first_token_in_ms_(ttft)_p50": 593.2397525002671, + "time_to_first_token_in_ms_(ttft)_p75": 1060.5307812497813, + "time_to_first_token_in_ms_(ttft)_p95": 1091.2284775492935, + "throughput_token_per_s_(token/sec)_p50": 18.483502378173807, + "throughput_token_per_s_(token/sec)_p75": 18.62048349387721, + "throughput_token_per_s_(token/sec)_p95": 18.740678393133553, + "latency_ms_per_token_(inter_token_latency)_p50": 51.326324681221536, + "latency_ms_per_token_(inter_token_latency)_p75": 52.004817064281525, + "latency_ms_per_token_(inter_token_latency)_p95": 52.996850070530854, + "requests_per_minute_(qpm)": 14.13572641706053, + "results_number_errors": 0, + "results_num_completed_requests": 100 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_32.json b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_32.json new file mode 100644 index 0000000..324952c --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_32.json @@ -0,0 +1,17 @@ +{ + "concurrency": 32, + "mean_input_token_length": 550.0, + "mean_output_token_length": 160.0, + "time_to_first_token_in_ms_(ttft)_p50": 9879.810319000171, + "time_to_first_token_in_ms_(ttft)_p75": 13748.74519900004, + "time_to_first_token_in_ms_(ttft)_p95": 14925.1189030987, + "throughput_token_per_s_(token/sec)_p50": 
5.108561161797954, + "throughput_token_per_s_(token/sec)_p75": 5.273474722315811, + "throughput_token_per_s_(token/sec)_p95": 5.4585904110907855, + "latency_ms_per_token_(inter_token_latency)_p50": 183.93538549267723, + "latency_ms_per_token_(inter_token_latency)_p75": 188.72039812495933, + "latency_ms_per_token_(inter_token_latency)_p95": 194.89754481931246, + "requests_per_minute_(qpm)": 52.57208219530792, + "results_number_errors": 0, + "results_num_completed_requests": 128 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_4.json b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_4.json new file mode 100644 index 0000000..f5a9467 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_4.json @@ -0,0 +1,17 @@ +{ + "concurrency": 4, + "mean_input_token_length": 550.0, + "mean_output_token_length": 159.18, + "time_to_first_token_in_ms_(ttft)_p50": 2035.751768499722, + "time_to_first_token_in_ms_(ttft)_p75": 2150.540910750351, + "time_to_first_token_in_ms_(ttft)_p95": 2235.427715849619, + "throughput_token_per_s_(token/sec)_p50": 15.919263824400435, + "throughput_token_per_s_(token/sec)_p75": 16.125915244796303, + "throughput_token_per_s_(token/sec)_p95": 16.28105133190631, + "latency_ms_per_token_(inter_token_latency)_p50": 59.637933259232845, + "latency_ms_per_token_(inter_token_latency)_p75": 60.647559917482496, + "latency_ms_per_token_(inter_token_latency)_p95": 61.93614683918202, + "requests_per_minute_(qpm)": 24.32544640710275, + "results_number_errors": 0, + "results_num_completed_requests": 100 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_64.json b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_64.json new file mode 100644 index 0000000..6dc2c37 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_64.json @@ -0,0 +1,17 @@ +{ + "concurrency": 64, + "mean_input_token_length": 
550.0, + "mean_output_token_length": 159.40625, + "time_to_first_token_in_ms_(ttft)_p50": 17220.491680999658, + "time_to_first_token_in_ms_(ttft)_p75": 25327.48620450093, + "time_to_first_token_in_ms_(ttft)_p95": 29665.552642800958, + "throughput_token_per_s_(token/sec)_p50": 3.08544989041975, + "throughput_token_per_s_(token/sec)_p75": 3.1354960779360517, + "throughput_token_per_s_(token/sec)_p95": 3.251683697677572, + "latency_ms_per_token_(inter_token_latency)_p50": 307.45351493844254, + "latency_ms_per_token_(inter_token_latency)_p75": 319.2453598913946, + "latency_ms_per_token_(inter_token_latency)_p95": 330.42386755233827, + "requests_per_minute_(qpm)": 50.95750230507558, + "results_number_errors": 0, + "results_num_completed_requests": 128 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_8.json b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_8.json new file mode 100644 index 0000000..e975cad --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/01-ai_Yi-1.5-34B-Chat_cur_8.json @@ -0,0 +1,17 @@ +{ + "concurrency": 8, + "mean_input_token_length": 550.0, + "mean_output_token_length": 159.5, + "time_to_first_token_in_ms_(ttft)_p50": 3176.6039474996433, + "time_to_first_token_in_ms_(ttft)_p75": 3838.76859575048, + "time_to_first_token_in_ms_(ttft)_p95": 4254.826318100367, + "throughput_token_per_s_(token/sec)_p50": 12.154551419986344, + "throughput_token_per_s_(token/sec)_p75": 12.34099733333681, + "throughput_token_per_s_(token/sec)_p95": 12.524066100837192, + "latency_ms_per_token_(inter_token_latency)_p50": 78.06983328522192, + "latency_ms_per_token_(inter_token_latency)_p75": 79.20261970734134, + "latency_ms_per_token_(inter_token_latency)_p95": 81.29656306738025, + "requests_per_minute_(qpm)": 36.044085449240555, + "results_number_errors": 0, + "results_num_completed_requests": 104 +} \ No newline at end of file diff --git 
a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..e1d4bec --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,882 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04658090811874445, + "ttft_s": 0.6055155840003863, + "end_to_end_latency_s": 7.453143953000108, + "request_output_throughput_token_per_s": 20.25990655114317, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04770505742948654, + "ttft_s": 0.5807030780006244, + "end_to_end_latency_s": 7.442206753999926, + "request_output_throughput_token_per_s": 20.28968086903025, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047425895847169205, + "ttft_s": 0.5831447390000903, + "end_to_end_latency_s": 7.446071403000133, + "request_output_throughput_token_per_s": 20.27915014878314, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04803681977419112, + "ttft_s": 0.5826252920005572, + "end_to_end_latency_s": 7.445894740000767, + "request_output_throughput_token_per_s": 20.279631296531655, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047159521563272286, + "ttft_s": 0.5920296120002604, + "end_to_end_latency_s": 7.451430320999862, + "request_output_throughput_token_per_s": 20.264565794092835, + 
"number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04652561250315094, + "ttft_s": 0.588360124999781, + "end_to_end_latency_s": 7.490822564999689, + "request_output_throughput_token_per_s": 20.15799983109148, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047839947038471275, + "ttft_s": 0.5877562180003224, + "end_to_end_latency_s": 7.463258802000382, + "request_output_throughput_token_per_s": 20.232448586605006, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047506586388570526, + "ttft_s": 0.5859248939996178, + "end_to_end_latency_s": 7.458786084999701, + "request_output_throughput_token_per_s": 20.244581126099696, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0487436932875592, + "ttft_s": 0.5835029329991812, + "end_to_end_latency_s": 7.4579953249995015, + "request_output_throughput_token_per_s": 20.246727628514584, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04593277099382159, + "ttft_s": 0.5825801820001288, + "end_to_end_latency_s": 7.4413399919994845, + "request_output_throughput_token_per_s": 20.292044196656356, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04862703643785233, + "ttft_s": 0.5865829829999711, + "end_to_end_latency_s": 7.440136217000145, + "request_output_throughput_token_per_s": 20.295327342929085, + "number_total_tokens": 703, + 
"number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.045017167333329174, + "ttft_s": 0.5885397689999081, + "end_to_end_latency_s": 7.428043592999529, + "request_output_throughput_token_per_s": 20.328367504777187, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0457575806748665, + "ttft_s": 0.5892114369999035, + "end_to_end_latency_s": 7.4587141840002005, + "request_output_throughput_token_per_s": 20.24477628113333, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047507439968192636, + "ttft_s": 0.586209936999694, + "end_to_end_latency_s": 7.458861684999647, + "request_output_throughput_token_per_s": 20.24437593522786, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.048588897248402076, + "ttft_s": 0.5829824669999653, + "end_to_end_latency_s": 7.4342899519997445, + "request_output_throughput_token_per_s": 20.311287422867146, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047780860980759454, + "ttft_s": 0.5878754389996175, + "end_to_end_latency_s": 7.454024704999938, + "request_output_throughput_token_per_s": 20.257512682875024, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04683877432705328, + "ttft_s": 0.5892650169998888, + "end_to_end_latency_s": 7.4475846619998265, + "request_output_throughput_token_per_s": 20.27502967108983, + "number_total_tokens": 709, + "number_output_tokens": 159, 
+ "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04757197324359087, + "ttft_s": 0.5869506170001841, + "end_to_end_latency_s": 7.421420108000348, + "request_output_throughput_token_per_s": 20.346510209982704, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04632624480622667, + "ttft_s": 0.5835531339998852, + "end_to_end_latency_s": 7.412392403000013, + "request_output_throughput_token_per_s": 20.371290642800545, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04834143885717196, + "ttft_s": 0.5872906419999708, + "end_to_end_latency_s": 7.444796626000425, + "request_output_throughput_token_per_s": 20.28262255984847, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04690143100635649, + "ttft_s": 0.5828402260003713, + "end_to_end_latency_s": 7.4107037220001075, + "request_output_throughput_token_per_s": 20.375932659637613, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.048942490111866584, + "ttft_s": 0.587158190000082, + "end_to_end_latency_s": 7.4394775979999395, + "request_output_throughput_token_per_s": 20.297124093847057, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046641989668773934, + "ttft_s": 0.5885318979999283, + "end_to_end_latency_s": 7.462911648000045, + "request_output_throughput_token_per_s": 20.233389744131014, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 
+ }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04863933104543439, + "ttft_s": 0.59008306800024, + "end_to_end_latency_s": 7.490666112999861, + "request_output_throughput_token_per_s": 20.158420856316546, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04788965114008174, + "ttft_s": 0.5920801329993992, + "end_to_end_latency_s": 7.518884213000092, + "request_output_throughput_token_per_s": 20.08276703329494, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047392079264196044, + "ttft_s": 0.5874005929999839, + "end_to_end_latency_s": 7.535624977000225, + "request_output_throughput_token_per_s": 20.038152171966225, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04624487435802183, + "ttft_s": 0.5870166679997055, + "end_to_end_latency_s": 7.491867147999983, + "request_output_throughput_token_per_s": 20.155189222797514, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044913376716829, + "ttft_s": 0.5898306940007387, + "end_to_end_latency_s": 7.4558893080002235, + "request_output_throughput_token_per_s": 20.25244659117671, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.048200552564925274, + "ttft_s": 0.5808202190000884, + "end_to_end_latency_s": 7.423084339000525, + "request_output_throughput_token_per_s": 20.341948589571228, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + 
"error_msg": "", + "inter_token_latency_s": 0.04868810349345447, + "ttft_s": 0.5838910590000523, + "end_to_end_latency_s": 7.400806075000219, + "request_output_throughput_token_per_s": 20.403182905991162, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04514600132317976, + "ttft_s": 0.5856772420002017, + "end_to_end_latency_s": 7.4041536980003, + "request_output_throughput_token_per_s": 20.39395806178116, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04632057854999516, + "ttft_s": 0.5884599370001524, + "end_to_end_latency_s": 7.411585081999874, + "request_output_throughput_token_per_s": 20.37350962437519, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04628557077500091, + "ttft_s": 0.5872101609993479, + "end_to_end_latency_s": 7.4058975489997465, + "request_output_throughput_token_per_s": 20.38915593970029, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04659189501259371, + "ttft_s": 0.5888776319998215, + "end_to_end_latency_s": 7.408395640999515, + "request_output_throughput_token_per_s": 20.382280768637187, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04788914011538882, + "ttft_s": 0.6327967529996386, + "end_to_end_latency_s": 7.4709364899999855, + "request_output_throughput_token_per_s": 20.211656223034, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.047479505519213025, + "ttft_s": 0.5821959470004003, + "end_to_end_latency_s": 7.407066475000647, + "request_output_throughput_token_per_s": 20.38593828064528, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0481339417272631, + "ttft_s": 0.5874247530000503, + "end_to_end_latency_s": 7.412848938000025, + "request_output_throughput_token_per_s": 20.3700360364742, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0451127388048795, + "ttft_s": 0.583365672000582, + "end_to_end_latency_s": 7.398668017000091, + "request_output_throughput_token_per_s": 20.409078992738124, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04556889850308461, + "ttft_s": 0.5864940510000451, + "end_to_end_latency_s": 7.427965631000006, + "request_output_throughput_token_per_s": 20.328580866046803, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04498284877580304, + "ttft_s": 0.5868061550008861, + "end_to_end_latency_s": 7.422363299000608, + "request_output_throughput_token_per_s": 20.343924693140735, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04783134761292463, + "ttft_s": 0.5822101869998733, + "end_to_end_latency_s": 7.414054884000507, + "request_output_throughput_token_per_s": 20.36672271281094, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.04637132144375187, + "ttft_s": 0.5880257909993816, + "end_to_end_latency_s": 7.419622734999393, + "request_output_throughput_token_per_s": 20.35143906814992, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046037417608735136, + "ttft_s": 0.5868847159999859, + "end_to_end_latency_s": 7.412232489999951, + "request_output_throughput_token_per_s": 20.371730137137266, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04664639622015961, + "ttft_s": 0.5874750940001832, + "end_to_end_latency_s": 7.416991840999799, + "request_output_throughput_token_per_s": 20.358657962288582, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04684389880383352, + "ttft_s": 0.5816567900001246, + "end_to_end_latency_s": 7.401548363999609, + "request_output_throughput_token_per_s": 20.401136704645328, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04676701133331741, + "ttft_s": 0.5879545509997115, + "end_to_end_latency_s": 7.436274927999875, + "request_output_throughput_token_per_s": 20.305865700505276, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04574320788888593, + "ttft_s": 0.5824682810007289, + "end_to_end_latency_s": 7.410585430000538, + "request_output_throughput_token_per_s": 20.376257911918984, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04663166155973753, + 
"ttft_s": 0.5866071529999317, + "end_to_end_latency_s": 7.4146296610006175, + "request_output_throughput_token_per_s": 20.365143898450928, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0460469268757818, + "ttft_s": 0.5840593309994802, + "end_to_end_latency_s": 7.413783891000094, + "request_output_throughput_token_per_s": 20.367467169269027, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.045616633858931624, + "ttft_s": 0.5866628439998749, + "end_to_end_latency_s": 7.4357405009996, + "request_output_throughput_token_per_s": 20.307325138592557, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04603648266668272, + "ttft_s": 0.5807224879999922, + "end_to_end_latency_s": 7.458105196000361, + "request_output_throughput_token_per_s": 20.24642935862294, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04575297965027054, + "ttft_s": 0.5833539620007286, + "end_to_end_latency_s": 7.457959555000343, + "request_output_throughput_token_per_s": 20.246824736232167, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04699574884806438, + "ttft_s": 0.5874476039998626, + "end_to_end_latency_s": 7.425532299999759, + "request_output_throughput_token_per_s": 20.33524249837347, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04596378551235111, + "ttft_s": 0.5868010559997856, + 
"end_to_end_latency_s": 7.4463179760005005, + "request_output_throughput_token_per_s": 20.27847863691469, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04828808662340363, + "ttft_s": 0.5863663400004953, + "end_to_end_latency_s": 7.436572821000482, + "request_output_throughput_token_per_s": 20.30505229150505, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047614097166679595, + "ttft_s": 0.5914675849999185, + "end_to_end_latency_s": 7.4280408730001, + "request_output_throughput_token_per_s": 20.328374948617217, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04703560442408449, + "ttft_s": 0.588828430999456, + "end_to_end_latency_s": 7.431863560999773, + "request_output_throughput_token_per_s": 20.317918750877432, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04626015475152474, + "ttft_s": 0.5917458089998036, + "end_to_end_latency_s": 7.448107988000629, + "request_output_throughput_token_per_s": 20.273605087798202, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04611518624839072, + "ttft_s": 0.5830476770006499, + "end_to_end_latency_s": 7.424756450000132, + "request_output_throughput_token_per_s": 20.337367429742066, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04559073965646365, + "ttft_s": 0.5878560379996998, + "end_to_end_latency_s": 
7.4315073159996246, + "request_output_throughput_token_per_s": 20.31889273322861, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04704271501265759, + "ttft_s": 0.5916065069995966, + "end_to_end_latency_s": 7.432979845999398, + "request_output_throughput_token_per_s": 20.31486740560338, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.044430156401160305, + "ttft_s": 0.5826454419993752, + "end_to_end_latency_s": 7.420023869999568, + "request_output_throughput_token_per_s": 20.35033884601355, + "number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04653948091871598, + "ttft_s": 0.5798595060005027, + "end_to_end_latency_s": 7.446530498000357, + "request_output_throughput_token_per_s": 20.27789989452787, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046079446388853, + "ttft_s": 0.5821752370002287, + "end_to_end_latency_s": 7.465059576000385, + "request_output_throughput_token_per_s": 20.227567973530157, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04613122000619339, + "ttft_s": 0.591315773000133, + "end_to_end_latency_s": 7.473449692000031, + "request_output_throughput_token_per_s": 20.204859365232398, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04570151014114941, + "ttft_s": 0.5881887929999721, + "end_to_end_latency_s": 7.449545176999891, + 
"request_output_throughput_token_per_s": 20.26969384200866, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04791993352258531, + "ttft_s": 0.5921789650001301, + "end_to_end_latency_s": 7.4277792390003015, + "request_output_throughput_token_per_s": 20.32909098955975, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0469270148175995, + "ttft_s": 0.5880996719997711, + "end_to_end_latency_s": 7.4615883510005006, + "request_output_throughput_token_per_s": 20.236978093243764, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046569304800010515, + "ttft_s": 0.5818372819994693, + "end_to_end_latency_s": 7.451317858999573, + "request_output_throughput_token_per_s": 20.26487164517144, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.046938954553474986, + "ttft_s": 0.5930746959993485, + "end_to_end_latency_s": 7.463564336999298, + "request_output_throughput_token_per_s": 20.231620333390076, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04612446886335278, + "ttft_s": 0.5822059060001266, + "end_to_end_latency_s": 7.426264198999888, + "request_output_throughput_token_per_s": 20.333238348877423, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.048428337811696726, + "ttft_s": 0.5883057539995207, + "end_to_end_latency_s": 7.458354188999692, + 
"request_output_throughput_token_per_s": 20.245753442858682, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04759794538460063, + "ttft_s": 0.5886596390000705, + "end_to_end_latency_s": 7.425508785999227, + "request_output_throughput_token_per_s": 20.335306893005097, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04728672432481585, + "ttft_s": 0.5816342799998893, + "end_to_end_latency_s": 7.424307575000057, + "request_output_throughput_token_per_s": 20.33859703071351, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04467948324100328, + "ttft_s": 0.5756712330003211, + "end_to_end_latency_s": 7.417246984000485, + "request_output_throughput_token_per_s": 20.35795765271366, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04757454700636586, + "ttft_s": 0.5828455740002028, + "end_to_end_latency_s": 7.469436540000061, + "request_output_throughput_token_per_s": 20.215714959404256, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04538562379880426, + "ttft_s": 0.5812511849999282, + "end_to_end_latency_s": 7.443630840999504, + "request_output_throughput_token_per_s": 20.28579912484272, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047739945461584145, + "ttft_s": 0.5884888069995213, + "end_to_end_latency_s": 7.447614802999851, + 
"request_output_throughput_token_per_s": 20.274947616675632, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04644016007503069, + "ttft_s": 0.591104319999431, + "end_to_end_latency_s": 7.430732585999976, + "request_output_throughput_token_per_s": 20.321011185962288, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04462062418668167, + "ttft_s": 0.5819630539999707, + "end_to_end_latency_s": 7.407204496000304, + "request_output_throughput_token_per_s": 20.38555842241645, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..9be7890 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_1/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 1, + "results_inter_token_latency_s_quantiles_p25": 0.04604454955902014, + "results_inter_token_latency_s_quantiles_p50": 0.04670670377673851, + "results_inter_token_latency_s_quantiles_p75": 0.04763683723238133, + "results_inter_token_latency_s_quantiles_p90": 0.04829342184678046, + "results_inter_token_latency_s_quantiles_p95": 0.04862765116823143, + "results_inter_token_latency_s_quantiles_p99": 0.04878544062066375, + "results_inter_token_latency_s_mean": 
0.046801568884606674, + "results_inter_token_latency_s_min": 0.044430156401160305, + "results_inter_token_latency_s_max": 0.048942490111866584, + "results_inter_token_latency_s_stddev": 0.0011053031465647549, + "results_ttft_s_quantiles_p25": 0.5829482437500246, + "results_ttft_s_quantiles_p50": 0.586917666500085, + "results_ttft_s_quantiles_p75": 0.5884995797496231, + "results_ttft_s_quantiles_p90": 0.5914814771998863, + "results_ttft_s_quantiles_p95": 0.5920850745994357, + "results_ttft_s_quantiles_p99": 0.6112446294902291, + "results_ttft_s_mean": 0.5869289812874854, + "results_ttft_s_min": 0.5756712330003211, + "results_ttft_s_max": 0.6327967529996386, + "results_ttft_s_stddev": 0.006632683782084771, + "results_end_to_end_latency_s_quantiles_p25": 7.419923586249524, + "results_end_to_end_latency_s_quantiles_p50": 7.4364238745001785, + "results_end_to_end_latency_s_quantiles_p75": 7.4564068697502535, + "results_end_to_end_latency_s_quantiles_p90": 7.465497272400353, + "results_end_to_end_latency_s_quantiles_p95": 7.490673935599853, + "results_end_to_end_latency_s_quantiles_p99": 7.52239977344012, + "results_end_to_end_latency_s_mean": 7.439467228012518, + "results_end_to_end_latency_s_min": 7.398668017000091, + "results_end_to_end_latency_s_max": 7.535624977000225, + "results_end_to_end_latency_s_stddev": 0.026269789196923198, + "results_request_output_throughput_token_per_s_quantiles_p25": 20.251041127440573, + "results_request_output_throughput_token_per_s_quantiles_p50": 20.305458996005164, + "results_request_output_throughput_token_per_s_quantiles_p75": 20.350613901547643, + "results_request_output_throughput_token_per_s_quantiles_p90": 20.376860197590805, + "results_request_output_throughput_token_per_s_quantiles_p95": 20.389396045804336, + "results_request_output_throughput_token_per_s_quantiles_p99": 20.404421084208025, + "results_request_output_throughput_token_per_s_mean": 20.297401444735172, + "results_request_output_throughput_token_per_s_min": 
20.038152171966225, + "results_request_output_throughput_token_per_s_max": 20.409078992738124, + "results_request_output_throughput_token_per_s_stddev": 0.07142599532546101, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 156.0, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 162.0, + "results_number_output_tokens_quantiles_p90": 164.0, + "results_number_output_tokens_quantiles_p95": 165.05, + "results_number_output_tokens_quantiles_p99": 166.20999999999998, + "results_number_output_tokens_mean": 159.0375, + "results_number_output_tokens_min": "152", + "results_number_output_tokens_max": "167", + "results_number_output_tokens_stddev": 3.671455924779016, + "results_num_requests_started": 80, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 20.982177074314603, + "results_num_completed_requests": 80, + "results_num_completed_requests_per_min": 7.915935703584854, + "timestamp": 1718208212 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..b7b5de8 --- /dev/null +++ 
b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1234 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11918652480883671, + "ttft_s": 0.5865403649986547, + "end_to_end_latency_s": 18.712564567998925, + "request_output_throughput_token_per_s": 8.069444434047854, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11177040574848263, + "ttft_s": 4.033081877001678, + "end_to_end_latency_s": 17.77173345600022, + "request_output_throughput_token_per_s": 8.496638798564598, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11643536094875227, + "ttft_s": 4.424421738000092, + "end_to_end_latency_s": 18.164180500998555, + "request_output_throughput_token_per_s": 8.313064274586951, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1147229197935127, + "ttft_s": 4.041277120999439, + "end_to_end_latency_s": 17.782257619999655, + "request_output_throughput_token_per_s": 8.491610189595427, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1094179278880683, + "ttft_s": 7.876270406999538, + "end_to_end_latency_s": 17.61677781599974, + "request_output_throughput_token_per_s": 8.571374491813152, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10971573519378808, + "ttft_s": 7.813637766999818, + "end_to_end_latency_s": 17.554739343999245, + "request_output_throughput_token_per_s": 
8.601665740574866, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10920949417897735, + "ttft_s": 7.953810717999659, + "end_to_end_latency_s": 17.692215170000054, + "request_output_throughput_token_per_s": 8.534827241760226, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11329330937573753, + "ttft_s": 4.048001196999394, + "end_to_end_latency_s": 17.787291374999768, + "request_output_throughput_token_per_s": 8.489207087045987, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11178656573174078, + "ttft_s": 0.6041261099999247, + "end_to_end_latency_s": 18.333223370998894, + "request_output_throughput_token_per_s": 8.236413037920276, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10971632058125351, + "ttft_s": 7.815087324999695, + "end_to_end_latency_s": 17.554853424000612, + "request_output_throughput_token_per_s": 8.601609842754716, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11737963032692278, + "ttft_s": 4.571739390001312, + "end_to_end_latency_s": 18.311850628000684, + "request_output_throughput_token_per_s": 8.246026197325223, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11267724947796776, + "ttft_s": 4.1731412159988395, + "end_to_end_latency_s": 17.915917207999883, + "request_output_throughput_token_per_s": 8.42825953295737, + 
"number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10851910832101522, + "ttft_s": 7.836530128999584, + "end_to_end_latency_s": 17.580556212999, + "request_output_throughput_token_per_s": 8.589034281426839, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1106674105184325, + "ttft_s": 4.1828433709997626, + "end_to_end_latency_s": 17.92833594699914, + "request_output_throughput_token_per_s": 8.42242138067892, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11046493504399217, + "ttft_s": 7.826684064000801, + "end_to_end_latency_s": 17.57149810699957, + "request_output_throughput_token_per_s": 8.59346192797582, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11340067414740145, + "ttft_s": 7.94568669399996, + "end_to_end_latency_s": 17.69176661399979, + "request_output_throughput_token_per_s": 8.535043633263351, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11246033432499872, + "ttft_s": 7.007283761999133, + "end_to_end_latency_s": 17.994636913999784, + "request_output_throughput_token_per_s": 8.391389096743728, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11881444677560583, + "ttft_s": 2.9844258459997945, + "end_to_end_latency_s": 18.535251151999546, + "request_output_throughput_token_per_s": 8.146639004873178, + "number_total_tokens": 706, + 
"number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11384379808806677, + "ttft_s": 7.113257946000886, + "end_to_end_latency_s": 18.10135744800027, + "request_output_throughput_token_per_s": 8.341915816743432, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10722604238415487, + "ttft_s": 6.5983330060007575, + "end_to_end_latency_s": 17.58529662300134, + "request_output_throughput_token_per_s": 8.586718963983465, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10662386029872967, + "ttft_s": 7.65149993399973, + "end_to_end_latency_s": 17.48651048099964, + "request_output_throughput_token_per_s": 8.63522771819297, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11619813972083648, + "ttft_s": 6.905256128999099, + "end_to_end_latency_s": 17.894959340999776, + "request_output_throughput_token_per_s": 8.438130376414913, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11515767324389588, + "ttft_s": 3.332863878998978, + "end_to_end_latency_s": 18.88603622499977, + "request_output_throughput_token_per_s": 7.995325128102778, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11304194022885708, + "ttft_s": 3.2117464409984677, + "end_to_end_latency_s": 18.765172411000094, + "request_output_throughput_token_per_s": 8.04682188326094, + "number_total_tokens": 716, + "number_output_tokens": 166, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10987449900003073, + "ttft_s": 6.6987288700001955, + "end_to_end_latency_s": 17.690037231999668, + "request_output_throughput_token_per_s": 8.535878021039704, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11052499545350532, + "ttft_s": 6.803012751999631, + "end_to_end_latency_s": 17.794752209998478, + "request_output_throughput_token_per_s": 8.485647803241477, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1190314667712146, + "ttft_s": 7.219639936000021, + "end_to_end_latency_s": 18.212038382000173, + "request_output_throughput_token_per_s": 8.291219073491549, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10802906120504377, + "ttft_s": 7.553282079999917, + "end_to_end_latency_s": 17.392871674999697, + "request_output_throughput_token_per_s": 8.681717592215987, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1145089055341334, + "ttft_s": 2.878833407001366, + "end_to_end_latency_s": 18.43615442600094, + "request_output_throughput_token_per_s": 8.190428248259906, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11673513199997279, + "ttft_s": 7.332784942000217, + "end_to_end_latency_s": 18.32763071000045, + "request_output_throughput_token_per_s": 8.238926372387406, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + 
"error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11651274339258919, + "ttft_s": 0.5920799260002241, + "end_to_end_latency_s": 18.991822256000887, + "request_output_throughput_token_per_s": 7.950790501542747, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11726450186163348, + "ttft_s": 3.0864601109988143, + "end_to_end_latency_s": 18.64542968099886, + "request_output_throughput_token_per_s": 8.098499341845725, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11357595874061546, + "ttft_s": 6.997978954001155, + "end_to_end_latency_s": 17.945663919001163, + "request_output_throughput_token_per_s": 8.414288860058207, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.12258333549342448, + "ttft_s": 2.887740901000143, + "end_to_end_latency_s": 18.387724006999633, + "request_output_throughput_token_per_s": 8.21200056855971, + "number_total_tokens": 700, + "number_output_tokens": 150, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11426830993078288, + "ttft_s": 7.219466583999747, + "end_to_end_latency_s": 18.16895133199978, + "request_output_throughput_token_per_s": 8.310881417468142, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11074723102522316, + "ttft_s": 6.551024432999839, + "end_to_end_latency_s": 17.498315683000328, + "request_output_throughput_token_per_s": 8.62940197991153, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", 
+ "inter_token_latency_s": 0.11606082621469209, + "ttft_s": 0.5857618350000848, + "end_to_end_latency_s": 18.91813575500055, + "request_output_throughput_token_per_s": 7.981758982783852, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11208118095753228, + "ttft_s": 2.9947770179987856, + "end_to_end_latency_s": 18.49365406099969, + "request_output_throughput_token_per_s": 8.164962938202468, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10722177539749773, + "ttft_s": 7.46688182500111, + "end_to_end_latency_s": 17.262946814000315, + "request_output_throughput_token_per_s": 8.747058171872396, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11566355128489485, + "ttft_s": 7.323891027999707, + "end_to_end_latency_s": 18.27505793799901, + "request_output_throughput_token_per_s": 8.262627703413642, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11766524265407978, + "ttft_s": 3.210124811001151, + "end_to_end_latency_s": 18.708968783001183, + "request_output_throughput_token_per_s": 8.070995347279503, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11481966895674607, + "ttft_s": 3.1013403209999524, + "end_to_end_latency_s": 18.60097375299847, + "request_output_throughput_token_per_s": 8.117854581438719, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.10926028743398213, + "ttft_s": 7.575139479000427, + "end_to_end_latency_s": 17.372772168000665, + "request_output_throughput_token_per_s": 8.691761944482908, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10943524774239712, + "ttft_s": 6.885037290001492, + "end_to_end_latency_s": 17.838183084000775, + "request_output_throughput_token_per_s": 8.464987677777186, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1137291055484257, + "ttft_s": 6.673584799000309, + "end_to_end_latency_s": 17.628216962000806, + "request_output_throughput_token_per_s": 8.56581243159725, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11542183773000128, + "ttft_s": 3.308332235999842, + "end_to_end_latency_s": 18.813994525000453, + "request_output_throughput_token_per_s": 8.025940466781142, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11294959421658836, + "ttft_s": 6.777203002000533, + "end_to_end_latency_s": 17.733301555001162, + "request_output_throughput_token_per_s": 8.51505285305515, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11364181306296517, + "ttft_s": 7.11665843999981, + "end_to_end_latency_s": 18.07322621899948, + "request_output_throughput_token_per_s": 8.354900125206271, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11257600513291909, + "ttft_s": 
6.2622571520005295, + "end_to_end_latency_s": 17.787292595001418, + "request_output_throughput_token_per_s": 8.489206504784994, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11902769018239989, + "ttft_s": 0.5926334429987037, + "end_to_end_latency_s": 18.925639480999962, + "request_output_throughput_token_per_s": 7.9785943376758075, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11664426035260989, + "ttft_s": 6.670925724998597, + "end_to_end_latency_s": 18.196751187999325, + "request_output_throughput_token_per_s": 8.298184573715764, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11323326707548953, + "ttft_s": 6.477613734001352, + "end_to_end_latency_s": 18.004349218001153, + "request_output_throughput_token_per_s": 8.386862428164124, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11333616448718413, + "ttft_s": 6.153851987000962, + "end_to_end_latency_s": 17.6807120630001, + "request_output_throughput_token_per_s": 8.540380017612142, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10650136141208129, + "ttft_s": 6.0469191709998995, + "end_to_end_latency_s": 17.572932226001285, + "request_output_throughput_token_per_s": 8.592760619458668, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1157400506188992, + "ttft_s": 2.4586470180001925, + 
"end_to_end_latency_s": 18.51861972000006, + "request_output_throughput_token_per_s": 8.15395543961197, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10807389691141411, + "ttft_s": 7.305218748999323, + "end_to_end_latency_s": 17.075867402998483, + "request_output_throughput_token_per_s": 8.842888998627663, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11618319964198258, + "ttft_s": 2.7607083080001757, + "end_to_end_latency_s": 18.82188673500059, + "request_output_throughput_token_per_s": 8.022575107691257, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10911372796342066, + "ttft_s": 6.366292161999809, + "end_to_end_latency_s": 17.894924689999243, + "request_output_throughput_token_per_s": 8.43814671566558, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11233306378653651, + "ttft_s": 2.359650413000054, + "end_to_end_latency_s": 18.422866916000203, + "request_output_throughput_token_per_s": 8.196335602297435, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10806744802510869, + "ttft_s": 7.410096039999189, + "end_to_end_latency_s": 17.182959811998444, + "request_output_throughput_token_per_s": 8.787775892635235, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11221187816861485, + "ttft_s": 2.563490336999166, + "end_to_end_latency_s": 
18.62741220999851, + "request_output_throughput_token_per_s": 8.106332661653814, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11177258371618894, + "ttft_s": 7.556467930999133, + "end_to_end_latency_s": 17.324962316999517, + "request_output_throughput_token_per_s": 8.715747673046105, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1158855815379129, + "ttft_s": 6.778699631999189, + "end_to_end_latency_s": 18.310140506000607, + "request_output_throughput_token_per_s": 8.246796355850694, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10894859355620383, + "ttft_s": 5.899608247998913, + "end_to_end_latency_s": 17.432003644998986, + "request_output_throughput_token_per_s": 8.662228569652688, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11213315581881034, + "ttft_s": 6.979249305000849, + "end_to_end_latency_s": 17.94151397600035, + "request_output_throughput_token_per_s": 8.416235118284147, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11722531202500477, + "ttft_s": 3.240265806000025, + "end_to_end_latency_s": 18.75632922800105, + "request_output_throughput_token_per_s": 8.050615776917281, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11958410985807272, + "ttft_s": 3.0183849610002653, + "end_to_end_latency_s": 18.535738398999456, + 
"request_output_throughput_token_per_s": 8.146424855033068, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11582015987975945, + "ttft_s": 7.337003905999154, + "end_to_end_latency_s": 18.29979918400022, + "request_output_throughput_token_per_s": 8.25145666800658, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10960446664374786, + "ttft_s": 6.57284029100083, + "end_to_end_latency_s": 17.53703214699999, + "request_output_throughput_token_per_s": 8.610350869763966, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11105233498085745, + "ttft_s": 7.509195096999974, + "end_to_end_latency_s": 17.3244017800007, + "request_output_throughput_token_per_s": 8.71602967407016, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11708596057407569, + "ttft_s": 0.5882929180006613, + "end_to_end_latency_s": 18.96820642500097, + "request_output_throughput_token_per_s": 7.9606894092514215, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10950144809204103, + "ttft_s": 6.882577669000966, + "end_to_end_latency_s": 17.84893433199977, + "request_output_throughput_token_per_s": 8.459888819764746, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11214178858377037, + "ttft_s": 7.088012214000628, + "end_to_end_latency_s": 18.055295299000136, + "request_output_throughput_token_per_s": 
8.36319747195506, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10697339831286219, + "ttft_s": 6.468481447000158, + "end_to_end_latency_s": 17.436846635999245, + "request_output_throughput_token_per_s": 8.659822681943702, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11026002452121192, + "ttft_s": 7.224167544000011, + "end_to_end_latency_s": 18.193228682001063, + "request_output_throughput_token_per_s": 8.29979123768105, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11810741326919527, + "ttft_s": 2.9013453740008117, + "end_to_end_latency_s": 18.424959062000198, + "request_output_throughput_token_per_s": 8.19540491199374, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11734176858487255, + "ttft_s": 3.1327877929998067, + "end_to_end_latency_s": 18.657545435999054, + "request_output_throughput_token_per_s": 8.093240373873135, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11455485682466593, + "ttft_s": 6.669806900999902, + "end_to_end_latency_s": 17.641720745001294, + "request_output_throughput_token_per_s": 8.559255765500382, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10630355691356036, + "ttft_s": 7.399242371000582, + "end_to_end_latency_s": 17.223583180999412, + "request_output_throughput_token_per_s": 8.767049133340564, + 
"number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11298053576651065, + "ttft_s": 3.340830299999652, + "end_to_end_latency_s": 18.86795357399933, + "request_output_throughput_token_per_s": 8.002987680024983, + "number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11031733700625637, + "ttft_s": 6.695465188000526, + "end_to_end_latency_s": 17.651034583999717, + "request_output_throughput_token_per_s": 8.554739342977564, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.12017511630961493, + "ttft_s": 3.1186311220008065, + "end_to_end_latency_s": 18.627366059999986, + "request_output_throughput_token_per_s": 8.106352745397226, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11882093231170901, + "ttft_s": 7.342528096000024, + "end_to_end_latency_s": 18.29862382799911, + "request_output_throughput_token_per_s": 8.251986675028082, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11935085335033503, + "ttft_s": 3.2279021680005826, + "end_to_end_latency_s": 18.738337958000557, + "request_output_throughput_token_per_s": 8.058345427350389, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11430102565433724, + "ttft_s": 3.0080319279986725, + "end_to_end_latency_s": 18.51699939899845, + "request_output_throughput_token_per_s": 8.154668947506003, + "number_total_tokens": 712, + 
"number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10889599818005284, + "ttft_s": 6.577221296998687, + "end_to_end_latency_s": 17.53245307799989, + "request_output_throughput_token_per_s": 8.612599693164338, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11372754758746169, + "ttft_s": 7.238938612999846, + "end_to_end_latency_s": 18.19659019500068, + "request_output_throughput_token_per_s": 8.298257991295843, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11315551289607793, + "ttft_s": 7.615824168999097, + "end_to_end_latency_s": 17.426138769998943, + "request_output_throughput_token_per_s": 8.665143896361222, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11575728303151268, + "ttft_s": 2.8934185130001424, + "end_to_end_latency_s": 18.405656896000437, + "request_output_throughput_token_per_s": 8.20399950152349, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10827316226250332, + "ttft_s": 7.512661180999203, + "end_to_end_latency_s": 17.323975114999485, + "request_output_throughput_token_per_s": 8.716244337551652, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11170274084910282, + "ttft_s": 6.802174651000314, + "end_to_end_latency_s": 17.76095088799957, + "request_output_throughput_token_per_s": 8.501797057612789, + "number_total_tokens": 709, + "number_output_tokens": 159, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11307814401880023, + "ttft_s": 7.133220891999372, + "end_to_end_latency_s": 18.0928404389997, + "request_output_throughput_token_per_s": 8.345842683413856, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.12228825356773262, + "ttft_s": 0.5849698549991444, + "end_to_end_latency_s": 18.954901744999006, + "request_output_throughput_token_per_s": 7.96627711561941, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.120837383993603, + "ttft_s": 3.335334060000605, + "end_to_end_latency_s": 18.850860025999282, + "request_output_throughput_token_per_s": 8.01024461439634, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11102477065219742, + "ttft_s": 6.912903946998995, + "end_to_end_latency_s": 17.875221697999223, + "request_output_throughput_token_per_s": 8.447447676517571, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11381654560122852, + "ttft_s": 7.020913266000207, + "end_to_end_latency_s": 17.984197069999937, + "request_output_throughput_token_per_s": 8.396260306326845, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.12004421431166437, + "ttft_s": 2.353200231000301, + "end_to_end_latency_s": 18.487031967000803, + "request_output_throughput_token_per_s": 8.16788764521713, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + 
"error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10990754903046186, + "ttft_s": 6.470075426999756, + "end_to_end_latency_s": 18.025046901999303, + "request_output_throughput_token_per_s": 8.377232016148117, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11566787307184712, + "ttft_s": 6.142256678000194, + "end_to_end_latency_s": 17.697388245000184, + "request_output_throughput_token_per_s": 8.532332449826889, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11263038571426179, + "ttft_s": 6.5805704399990645, + "end_to_end_latency_s": 18.133726411999305, + "request_output_throughput_token_per_s": 8.327025376322071, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1113263428249752, + "ttft_s": 6.2569248149993655, + "end_to_end_latency_s": 17.812461567000355, + "request_output_throughput_token_per_s": 8.47721127324395, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1149339553394833, + "ttft_s": 2.485081654998794, + "end_to_end_latency_s": 18.619505228998605, + "request_output_throughput_token_per_s": 8.109775106420543, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.12157520929036002, + "ttft_s": 2.709794577000139, + "end_to_end_latency_s": 18.84445773399966, + "request_output_throughput_token_per_s": 8.01296604717693, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.12593679464246194, + "ttft_s": 0.5896919349997916, + "end_to_end_latency_s": 19.01670113399996, + "request_output_throughput_token_per_s": 7.940388763329046, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11707280301260425, + "ttft_s": 2.596607131001292, + "end_to_end_latency_s": 18.7318820950004, + "request_output_throughput_token_per_s": 8.061122701615894, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1120647001783712, + "ttft_s": 6.037101705000168, + "end_to_end_latency_s": 17.594403058999887, + "request_output_throughput_token_per_s": 8.582274686651587, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10862645137269554, + "ttft_s": 7.67310550000002, + "end_to_end_latency_s": 17.48906333299965, + "request_output_throughput_token_per_s": 8.633967247124213, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11700439405133996, + "ttft_s": 6.69407487999888, + "end_to_end_latency_s": 18.25299278499915, + "request_output_throughput_token_per_s": 8.272615991175774, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11846973462589601, + "ttft_s": 6.80430362900006, + "end_to_end_latency_s": 18.362997510999776, + "request_output_throughput_token_per_s": 8.223058349245443, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10795895885718175, 
+ "ttft_s": 7.563126654998996, + "end_to_end_latency_s": 17.38161671999842, + "request_output_throughput_token_per_s": 8.687339183257157, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.10337719622155331, + "ttft_s": 7.445865486999537, + "end_to_end_latency_s": 17.264194429999407, + "request_output_throughput_token_per_s": 8.746426056092858, + "number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.11638857647412569, + "ttft_s": 6.362377211000421, + "end_to_end_latency_s": 17.92406313200081, + "request_output_throughput_token_per_s": 8.424429153589145, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..6a1e22e --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_16/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 16, + "results_inter_token_latency_s_quantiles_p25": 0.11030300888499525, + "results_inter_token_latency_s_quantiles_p50": 0.11326328822561353, + "results_inter_token_latency_s_quantiles_p75": 0.11640027259278234, + "results_inter_token_latency_s_quantiles_p90": 0.1190070143953308, + "results_inter_token_latency_s_quantiles_p95": 0.12010312021074213, + 
"results_inter_token_latency_s_quantiles_p99": 0.12255087648159838, + "results_inter_token_latency_s_mean": 0.11350145435953386, + "results_inter_token_latency_s_min": 0.10337719622155331, + "results_inter_token_latency_s_max": 0.12593679464246194, + "results_inter_token_latency_s_stddev": 0.00413427391369037, + "results_ttft_s_quantiles_p25": 3.190790556500815, + "results_ttft_s_quantiles_p50": 6.561932362000334, + "results_ttft_s_quantiles_p75": 7.219509921999816, + "results_ttft_s_quantiles_p90": 7.562460782599009, + "results_ttft_s_quantiles_p95": 7.814290068099763, + "results_ttft_s_quantiles_p99": 7.9380509024299135, + "results_ttft_s_mean": 5.367653323758857, + "results_ttft_s_min": 0.5849698549991444, + "results_ttft_s_max": 7.953810717999659, + "results_ttft_s_stddev": 2.2589576153527693, + "results_end_to_end_latency_s_quantiles_p25": 17.638344799251172, + "results_end_to_end_latency_s_quantiles_p50": 18.014698060000228, + "results_end_to_end_latency_s_quantiles_p75": 18.49949039549938, + "results_end_to_end_latency_s_quantiles_p90": 18.809112313600416, + "results_end_to_end_latency_s_quantiles_p95": 18.90048101350012, + "results_end_to_end_latency_s_quantiles_p99": 18.989224514590898, + "results_end_to_end_latency_s_mean": 18.06960619555345, + "results_end_to_end_latency_s_min": 17.075867402998483, + "results_end_to_end_latency_s_max": 19.01670113399996, + "results_end_to_end_latency_s_stddev": 0.5132737239526131, + "results_request_output_throughput_token_per_s_quantiles_p25": 8.16238944052835, + "results_request_output_throughput_token_per_s_quantiles_p50": 8.382047222156121, + "results_request_output_throughput_token_per_s_quantiles_p75": 8.560894932024599, + "results_request_output_throughput_token_per_s_quantiles_p90": 8.664852363690368, + "results_request_output_throughput_token_per_s_quantiles_p95": 8.716126272636831, + "results_request_output_throughput_token_per_s_quantiles_p99": 8.785495949112821, + 
"results_request_output_throughput_token_per_s_mean": 8.363242277576312, + "results_request_output_throughput_token_per_s_min": 7.940388763329046, + "results_request_output_throughput_token_per_s_max": 8.842888998627663, + "results_request_output_throughput_token_per_s_stddev": 0.23695528559813203, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 156.75, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 162.0, + "results_number_output_tokens_quantiles_p90": 164.0, + "results_number_output_tokens_quantiles_p95": 165.0, + "results_number_output_tokens_quantiles_p99": 166.89, + "results_number_output_tokens_mean": 159.27678571428572, + "results_number_output_tokens_min": "150", + "results_number_output_tokens_max": "167", + "results_number_output_tokens_stddev": 3.4957111983824953, + "results_num_requests_started": 112, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 124.29275563556878, + "results_num_completed_requests": 112, + "results_num_completed_requests_per_min": 46.821420363866935, + "timestamp": 1718209276 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 
0000000..c4f934c --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1102 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052162291679497944, + "ttft_s": 1.0949466170004598, + "end_to_end_latency_s": 8.137578386000314, + "request_output_throughput_token_per_s": 18.555888845233937, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052002930316460534, + "ttft_s": 0.5888717309999265, + "end_to_end_latency_s": 8.216689595999924, + "request_output_throughput_token_per_s": 18.377230663978146, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0519053893885128, + "ttft_s": 1.0687716120000914, + "end_to_end_latency_s": 8.149374676000662, + "request_output_throughput_token_per_s": 18.529029036385385, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05092416553706255, + "ttft_s": 0.5873975140002585, + "end_to_end_latency_s": 8.249942401000226, + "request_output_throughput_token_per_s": 18.303158090133177, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05229957520885341, + "ttft_s": 0.5880462420000185, + "end_to_end_latency_s": 8.263525424999898, + "request_output_throughput_token_per_s": 18.273072597220438, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050958842124987316, + "ttft_s": 1.0625246519994107, + "end_to_end_latency_s": 8.153616959999454, + 
"request_output_throughput_token_per_s": 18.51938848008015, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05079782961252022, + "ttft_s": 1.0625887329997568, + "end_to_end_latency_s": 8.12792359300056, + "request_output_throughput_token_per_s": 18.577930546743218, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050199486109754506, + "ttft_s": 0.5877806579992466, + "end_to_end_latency_s": 8.233007584999541, + "request_output_throughput_token_per_s": 18.34080661787808, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049331654442423975, + "ttft_s": 1.060029759999452, + "end_to_end_latency_s": 8.140190668999821, + "request_output_throughput_token_per_s": 18.54993404209207, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0515433449374882, + "ttft_s": 0.5904866219998439, + "end_to_end_latency_s": 8.247164266000254, + "request_output_throughput_token_per_s": 18.30932368141524, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049945698824238646, + "ttft_s": 0.589981646000524, + "end_to_end_latency_s": 8.241239069000585, + "request_output_throughput_token_per_s": 18.322487521079978, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04987868982203352, + "ttft_s": 1.054718481999771, + "end_to_end_latency_s": 8.130445193999549, + 
"request_output_throughput_token_per_s": 18.572168730863766, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052887020500001644, + "ttft_s": 0.5937420440004644, + "end_to_end_latency_s": 8.250642060000246, + "request_output_throughput_token_per_s": 18.301605972226056, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052236591929467745, + "ttft_s": 1.0761955070001932, + "end_to_end_latency_s": 8.149127382999723, + "request_output_throughput_token_per_s": 18.529591317348675, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05058008947238749, + "ttft_s": 0.5876674259998254, + "end_to_end_latency_s": 8.244897265999498, + "request_output_throughput_token_per_s": 18.314357975410726, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05166770620510364, + "ttft_s": 0.9868490049993852, + "end_to_end_latency_s": 8.060386168999685, + "request_output_throughput_token_per_s": 18.73359375519095, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05256118003246064, + "ttft_s": 1.0560765099999117, + "end_to_end_latency_s": 8.094773628999974, + "request_output_throughput_token_per_s": 18.65401145487678, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049469166156557616, + "ttft_s": 0.5885706480003137, + "end_to_end_latency_s": 8.212102127000435, + 
"request_output_throughput_token_per_s": 18.38749660741914, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05268601609031682, + "ttft_s": 1.0748202889999448, + "end_to_end_latency_s": 8.16653058599968, + "request_output_throughput_token_per_s": 18.490104017839272, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05134346478260013, + "ttft_s": 0.5898906449992865, + "end_to_end_latency_s": 8.266598603999228, + "request_output_throughput_token_per_s": 18.2662794256091, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0518138091465053, + "ttft_s": 1.0661868799998047, + "end_to_end_latency_s": 8.135023183999692, + "request_output_throughput_token_per_s": 18.56171722988979, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052481139165572774, + "ttft_s": 0.5869819979998283, + "end_to_end_latency_s": 8.239776751000136, + "request_output_throughput_token_per_s": 18.3257392236594, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049999659660510655, + "ttft_s": 1.0660083470002064, + "end_to_end_latency_s": 8.100176478000321, + "request_output_throughput_token_per_s": 18.641569157179294, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051927016702531456, + "ttft_s": 0.583623794000232, + "end_to_end_latency_s": 8.2046606720005, + 
"request_output_throughput_token_per_s": 18.404173680857717, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051062584160504625, + "ttft_s": 0.5896099410001625, + "end_to_end_latency_s": 8.272366057, + "request_output_throughput_token_per_s": 18.253544265274044, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05272260358064335, + "ttft_s": 1.11934502800068, + "end_to_end_latency_s": 8.172366250000778, + "request_output_throughput_token_per_s": 18.47690073850834, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05269007612183393, + "ttft_s": 0.587151119999362, + "end_to_end_latency_s": 8.219847605999348, + "request_output_throughput_token_per_s": 18.370170255929192, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05010250394447953, + "ttft_s": 1.0627334750006412, + "end_to_end_latency_s": 8.116834251, + "request_output_throughput_token_per_s": 18.60331199708762, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05073168520624449, + "ttft_s": 0.5904844529995898, + "end_to_end_latency_s": 8.11727001700001, + "request_output_throughput_token_per_s": 18.602313300378142, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.047369290805094715, + "ttft_s": 0.5900073270004214, + "end_to_end_latency_s": 7.531978521000383, + "request_output_throughput_token_per_s": 
20.04785324055126, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05100377787975728, + "ttft_s": 1.0256221909994565, + "end_to_end_latency_s": 8.058799578999242, + "request_output_throughput_token_per_s": 18.737281963618642, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05095445024224838, + "ttft_s": 0.5879612609996911, + "end_to_end_latency_s": 8.203898622999986, + "request_output_throughput_token_per_s": 18.40588321955429, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05299173369873573, + "ttft_s": 0.5927374610000697, + "end_to_end_latency_s": 8.266936588000135, + "request_output_throughput_token_per_s": 18.2655326302108, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051646622177233, + "ttft_s": 1.1152651459997287, + "end_to_end_latency_s": 8.160361805999855, + "request_output_throughput_token_per_s": 18.504081508858857, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05093675604348956, + "ttft_s": 0.5860495060005633, + "end_to_end_latency_s": 8.201262569000392, + "request_output_throughput_token_per_s": 18.41179924793026, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.048796840701220943, + "ttft_s": 0.9686645440006032, + "end_to_end_latency_s": 8.002866195000024, + "request_output_throughput_token_per_s": 18.86823999310906, + 
"number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05104998083649244, + "ttft_s": 0.5844934360002298, + "end_to_end_latency_s": 8.117170405000252, + "request_output_throughput_token_per_s": 18.602541583577278, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04755810131445167, + "ttft_s": 0.6199715689999721, + "end_to_end_latency_s": 7.561959001999639, + "request_output_throughput_token_per_s": 19.968370624605406, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04976448123638186, + "ttft_s": 0.5901353179997386, + "end_to_end_latency_s": 8.211441929000102, + "request_output_throughput_token_per_s": 18.388974957822917, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05037016381245962, + "ttft_s": 1.021607729000607, + "end_to_end_latency_s": 8.059436776999974, + "request_output_throughput_token_per_s": 18.735800550098478, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051590443493739715, + "ttft_s": 0.5898131039994041, + "end_to_end_latency_s": 8.254743671999677, + "request_output_throughput_token_per_s": 18.292512281416595, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052768634168818634, + "ttft_s": 1.0910327859992321, + "end_to_end_latency_s": 8.126608614999896, + "request_output_throughput_token_per_s": 18.580936667884792, + "number_total_tokens": 704, + 
"number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051696807647793136, + "ttft_s": 0.5913657439996314, + "end_to_end_latency_s": 8.219982147999872, + "request_output_throughput_token_per_s": 18.36986957894332, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0520104773077445, + "ttft_s": 1.0677750790000573, + "end_to_end_latency_s": 8.113852993000364, + "request_output_throughput_token_per_s": 18.61014737761015, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04986476775462908, + "ttft_s": 1.0571231929998248, + "end_to_end_latency_s": 8.128184076000252, + "request_output_throughput_token_per_s": 18.577335181895222, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05114533454660949, + "ttft_s": 0.581898392999392, + "end_to_end_latency_s": 8.234637804999693, + "request_output_throughput_token_per_s": 18.3371756688946, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0502512955987792, + "ttft_s": 1.0655106510002952, + "end_to_end_latency_s": 8.140991060000488, + "request_output_throughput_token_per_s": 18.548110283760828, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051863173106865806, + "ttft_s": 0.5880292609999742, + "end_to_end_latency_s": 8.246439305999957, + "request_output_throughput_token_per_s": 18.310933288520683, + "number_total_tokens": 709, + "number_output_tokens": 159, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051456450829126754, + "ttft_s": 1.0585727219995533, + "end_to_end_latency_s": 8.130362203000004, + "request_output_throughput_token_per_s": 18.57235830702387, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05151204562497469, + "ttft_s": 0.5869624169999952, + "end_to_end_latency_s": 8.242170161000104, + "request_output_throughput_token_per_s": 18.32041768738219, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051909459173115326, + "ttft_s": 1.0683647670002756, + "end_to_end_latency_s": 8.09807617200022, + "request_output_throughput_token_per_s": 18.64640400915161, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.053604972503224865, + "ttft_s": 0.5867261450002843, + "end_to_end_latency_s": 8.201825887000268, + "request_output_throughput_token_per_s": 18.410534688298128, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050653164225002455, + "ttft_s": 1.070942059999652, + "end_to_end_latency_s": 8.104750896999576, + "request_output_throughput_token_per_s": 18.63104763107538, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050652944555538315, + "ttft_s": 0.5868866770006207, + "end_to_end_latency_s": 8.20596845900036, + "request_output_throughput_token_per_s": 18.40124060364651, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + 
{ + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04972519762577576, + "ttft_s": 1.0544786389991714, + "end_to_end_latency_s": 8.105432495999594, + "request_output_throughput_token_per_s": 18.62948091598141, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05131896983749016, + "ttft_s": 0.5821919769996384, + "end_to_end_latency_s": 8.21123193800031, + "request_output_throughput_token_per_s": 18.38944523064747, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05178115652834438, + "ttft_s": 0.586136906998945, + "end_to_end_latency_s": 8.233393098998931, + "request_output_throughput_token_per_s": 18.339947842203667, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05136054106337782, + "ttft_s": 1.0438355639998917, + "end_to_end_latency_s": 8.115204239999002, + "request_output_throughput_token_per_s": 18.607048637881054, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05170303193705231, + "ttft_s": 0.5871441299987055, + "end_to_end_latency_s": 8.220984391999082, + "request_output_throughput_token_per_s": 18.367630054979536, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051345813727862205, + "ttft_s": 1.0656651619992772, + "end_to_end_latency_s": 8.112849109998933, + "request_output_throughput_token_per_s": 18.612450195073315, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + 
"error_msg": "", + "inter_token_latency_s": 0.051333679524952915, + "ttft_s": 0.5872648420008773, + "end_to_end_latency_s": 8.213580377001563, + "request_output_throughput_token_per_s": 18.38418729337666, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049959410388959284, + "ttft_s": 1.0496297379995667, + "end_to_end_latency_s": 8.093631574998653, + "request_output_throughput_token_per_s": 18.656643634044478, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0494861830843821, + "ttft_s": 0.5846020970002428, + "end_to_end_latency_s": 8.214903983000113, + "request_output_throughput_token_per_s": 18.381225186865088, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052714568292236884, + "ttft_s": 1.0620338450007694, + "end_to_end_latency_s": 8.11822963800114, + "request_output_throughput_token_per_s": 18.600114400949494, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051234660672918333, + "ttft_s": 1.1155073390000325, + "end_to_end_latency_s": 8.146519489999264, + "request_output_throughput_token_per_s": 18.53552307649529, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050298014445116876, + "ttft_s": 0.5872475510004733, + "end_to_end_latency_s": 8.24908269000116, + "request_output_throughput_token_per_s": 18.305065626633787, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.053267998293352624, + "ttft_s": 1.1133729310004128, + "end_to_end_latency_s": 8.038384487999792, + "request_output_throughput_token_per_s": 18.41165973348796, + "number_total_tokens": 700, + "number_output_tokens": 150, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0530940611346383, + "ttft_s": 0.5872147299996868, + "end_to_end_latency_s": 8.282876801000384, + "request_output_throughput_token_per_s": 18.230381017107803, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052297944267448444, + "ttft_s": 0.5836765550011478, + "end_to_end_latency_s": 8.210985243000323, + "request_output_throughput_token_per_s": 18.389997732455317, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.049107181939402694, + "ttft_s": 1.0535580779996963, + "end_to_end_latency_s": 8.102883712999756, + "request_output_throughput_token_per_s": 18.635340867319265, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.048690061530085434, + "ttft_s": 1.0531098820010811, + "end_to_end_latency_s": 8.082750175999536, + "request_output_throughput_token_per_s": 18.681760132630465, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050542384493787215, + "ttft_s": 0.580766028000653, + "end_to_end_latency_s": 8.188072970000576, + "request_output_throughput_token_per_s": 18.44145753869477, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.05050002379869986, + "ttft_s": 0.996285176001038, + "end_to_end_latency_s": 8.029689408000195, + "request_output_throughput_token_per_s": 18.8052105539169, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05259440195517621, + "ttft_s": 0.5858519729990803, + "end_to_end_latency_s": 8.204917325998395, + "request_output_throughput_token_per_s": 18.403597988920133, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05125955098717382, + "ttft_s": 0.9543479210005898, + "end_to_end_latency_s": 7.996728726000583, + "request_output_throughput_token_per_s": 18.882721319411303, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05134747238749924, + "ttft_s": 0.5883004049992451, + "end_to_end_latency_s": 8.21593278599903, + "request_output_throughput_token_per_s": 18.378923481132023, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05130702296193266, + "ttft_s": 1.0481814489994576, + "end_to_end_latency_s": 8.106742821000807, + "request_output_throughput_token_per_s": 18.626469759078713, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.053720408163356895, + "ttft_s": 0.5821991170014371, + "end_to_end_latency_s": 8.219413321001412, + "request_output_throughput_token_per_s": 18.371140871353933, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05138843610117724, + "ttft_s": 
1.0698443460005365, + "end_to_end_latency_s": 8.119573785999819, + "request_output_throughput_token_per_s": 18.59703526068842, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0517063906541956, + "ttft_s": 0.5897503730011522, + "end_to_end_latency_s": 8.221567638000124, + "request_output_throughput_token_per_s": 18.366327037446883, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.052967859696746604, + "ttft_s": 0.5910673700000189, + "end_to_end_latency_s": 8.210326225000244, + "request_output_throughput_token_per_s": 18.391473841832088, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05076836171068656, + "ttft_s": 1.0372061689995462, + "end_to_end_latency_s": 8.072354603000349, + "request_output_throughput_token_per_s": 18.705818491159448, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050324931478547344, + "ttft_s": 0.5884682369996881, + "end_to_end_latency_s": 8.203184824000346, + "request_output_throughput_token_per_s": 18.407484804952098, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04908341054547098, + "ttft_s": 1.0669051980003132, + "end_to_end_latency_s": 8.099038672999086, + "request_output_throughput_token_per_s": 18.644188044614495, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05214446189247495, + "ttft_s": 0.5920861030008382, + 
"end_to_end_latency_s": 8.239068212000348, + "request_output_throughput_token_per_s": 18.327315190820467, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05214599942946034, + "ttft_s": 1.0742504009995173, + "end_to_end_latency_s": 8.135006852000515, + "request_output_throughput_token_per_s": 18.561754494757054, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05198684464107245, + "ttft_s": 1.0698790160004137, + "end_to_end_latency_s": 8.110218156001793, + "request_output_throughput_token_per_s": 18.618488072143375, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05101165588821911, + "ttft_s": 0.5877903079999669, + "end_to_end_latency_s": 8.213106499999412, + "request_output_throughput_token_per_s": 18.38524801791025, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05195778362026373, + "ttft_s": 0.5836336550000851, + "end_to_end_latency_s": 8.209522204000677, + "request_output_throughput_token_per_s": 18.39327505885963, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05438612181878486, + "ttft_s": 1.0543530779996217, + "end_to_end_latency_s": 8.103735173001041, + "request_output_throughput_token_per_s": 18.63338285079724, + "number_total_tokens": 699, + "number_output_tokens": 149, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0512318731624191, + "ttft_s": 0.5773630950006918, + "end_to_end_latency_s": 
8.197288998000658, + "request_output_throughput_token_per_s": 18.42072422197501, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051900252955175134, + "ttft_s": 1.0645070570008102, + "end_to_end_latency_s": 8.096658862999902, + "request_output_throughput_token_per_s": 18.649668036532887, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.050860755610055604, + "ttft_s": 1.0540815439999278, + "end_to_end_latency_s": 8.087070575000325, + "request_output_throughput_token_per_s": 18.67177967591731, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05291844250328253, + "ttft_s": 0.5883422249989962, + "end_to_end_latency_s": 8.202586629999132, + "request_output_throughput_token_per_s": 18.408827216496704, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05058881420002308, + "ttft_s": 1.059402377999504, + "end_to_end_latency_s": 8.094415514999127, + "request_output_throughput_token_per_s": 18.65483674765568, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.051262954437504504, + "ttft_s": 0.5886372910008504, + "end_to_end_latency_s": 8.202270492000025, + "request_output_throughput_token_per_s": 18.40953674318298, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04995587966041731, + "ttft_s": 1.0528786539998691, + "end_to_end_latency_s": 8.093054117000065, + 
"request_output_throughput_token_per_s": 18.65797482841653, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04911442522750226, + "ttft_s": 0.5836739589994977, + "end_to_end_latency_s": 8.202321823000602, + "request_output_throughput_token_per_s": 18.40942153434802, + "number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04997360117281067, + "ttft_s": 1.0561246950001077, + "end_to_end_latency_s": 8.095966913999291, + "request_output_throughput_token_per_s": 18.65126199304194, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05218825700628322, + "ttft_s": 0.5822821309993742, + "end_to_end_latency_s": 8.19374983399939, + "request_output_throughput_token_per_s": 18.428680769998138, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..8982057 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_2/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 2, + "results_inter_token_latency_s_quantiles_p25": 0.0504675588021398, + "results_inter_token_latency_s_quantiles_p50": 0.05132632468122154, + 
"results_inter_token_latency_s_quantiles_p75": 0.052004817064281526, + "results_inter_token_latency_s_quantiles_p90": 0.05272720663946088, + "results_inter_token_latency_s_quantiles_p95": 0.05299685007053086, + "results_inter_token_latency_s_quantiles_p99": 0.05372706529991118, + "results_inter_token_latency_s_mean": 0.05122191632598677, + "results_inter_token_latency_s_min": 0.047369290805094715, + "results_inter_token_latency_s_max": 0.05438612181878486, + "results_inter_token_latency_s_stddev": 0.0012734437610207622, + "results_ttft_s_quantiles_p25": 0.5875999479999336, + "results_ttft_s_quantiles_p50": 0.5932397525002671, + "results_ttft_s_quantiles_p75": 1.0605307812497813, + "results_ttft_s_quantiles_p90": 1.0699853204003376, + "results_ttft_s_quantiles_p95": 1.0912284775492935, + "results_ttft_s_quantiles_p99": 1.115545715890039, + "results_ttft_s_mean": 0.8136072918500031, + "results_ttft_s_min": 0.5773630950006918, + "results_ttft_s_max": 1.11934502800068, + "results_ttft_s_stddev": 0.23721944735760614, + "results_end_to_end_latency_s_quantiles_p25": 8.106415239750504, + "results_end_to_end_latency_s_quantiles_p50": 8.163446195999768, + "results_end_to_end_latency_s_quantiles_p75": 8.216121988499253, + "results_end_to_end_latency_s_quantiles_p90": 8.246511801999986, + "results_end_to_end_latency_s_quantiles_p95": 8.255182759649689, + "results_end_to_end_latency_s_quantiles_p99": 8.272471164440002, + "results_end_to_end_latency_s_mean": 8.152552897720016, + "results_end_to_end_latency_s_min": 7.531978521000383, + "results_end_to_end_latency_s_max": 8.282876801000384, + "results_end_to_end_latency_s_stddev": 0.109478611062119, + "results_request_output_throughput_token_per_s_quantiles_p25": 18.378500276843553, + "results_request_output_throughput_token_per_s_quantiles_p50": 18.483502378173807, + "results_request_output_throughput_token_per_s_quantiles_p75": 18.62048349387721, + "results_request_output_throughput_token_per_s_quantiles_p90": 
18.672777721588627, + "results_request_output_throughput_token_per_s_quantiles_p95": 18.740678393133553, + "results_request_output_throughput_token_per_s_quantiles_p99": 19.969165450764866, + "results_request_output_throughput_token_per_s_mean": 18.521542080274155, + "results_request_output_throughput_token_per_s_min": 18.230381017107803, + "results_request_output_throughput_token_per_s_max": 20.04785324055126, + "results_request_output_throughput_token_per_s_stddev": 0.25999226820193966, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 156.75, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 162.0, + "results_number_output_tokens_quantiles_p90": 164.0, + "results_number_output_tokens_quantiles_p95": 165.0, + "results_number_output_tokens_quantiles_p99": 166.01, + "results_number_output_tokens_mean": 159.22, + "results_number_output_tokens_min": "149", + "results_number_output_tokens_max": "167", + "results_number_output_tokens_stddev": 3.503475907487225, + "results_num_requests_started": 100, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 37.51150600207296, + "results_num_completed_requests": 100, + "results_num_completed_requests_per_min": 14.13572641706053, + "timestamp": 1718208656 +} \ No newline at end of file diff --git 
a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..cd41135 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1410 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1851385584107967, + "ttft_s": 0.5895533439997962, + "end_to_end_latency_s": 31.103511895998963, + "request_output_throughput_token_per_s": 4.854757253936461, + "number_total_tokens": 718, + "number_output_tokens": 168, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18506278316039648, + "ttft_s": 5.070436152998809, + "end_to_end_latency_s": 29.980485144998966, + "request_output_throughput_token_per_s": 5.036609623550013, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18374034724537483, + "ttft_s": 5.0389435999986745, + "end_to_end_latency_s": 29.94997882499956, + "request_output_throughput_token_per_s": 5.0417397916141, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18936777812737932, + "ttft_s": 9.375130280999656, + "end_to_end_latency_s": 29.73099706799985, + "request_output_throughput_token_per_s": 5.078874403526975, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18367154655281626, + "ttft_s": 13.78711955899962, + "end_to_end_latency_s": 29.57139950799865, + "request_output_throughput_token_per_s": 5.106285211802594, + 
"number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1965974018461076, + "ttft_s": 0.6121183420000307, + "end_to_end_latency_s": 30.669448210001065, + "request_output_throughput_token_per_s": 4.923466472760344, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1911029542102688, + "ttft_s": 5.091294179001125, + "end_to_end_latency_s": 30.003438577999987, + "request_output_throughput_token_per_s": 5.032756482476002, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18536826293707206, + "ttft_s": 13.687012748998313, + "end_to_end_latency_s": 29.473787959999754, + "request_output_throughput_token_per_s": 5.12319625169758, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19151265198704462, + "ttft_s": 13.70619254399935, + "end_to_end_latency_s": 29.493278208999982, + "request_output_throughput_token_per_s": 5.119810654141586, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19045982263688893, + "ttft_s": 4.987233878999177, + "end_to_end_latency_s": 29.90244854699995, + "request_output_throughput_token_per_s": 5.049753693670331, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1880306726750632, + "ttft_s": 5.1697298619983485, + "end_to_end_latency_s": 30.08513658299853, + "request_output_throughput_token_per_s": 5.019089728358817, + "number_total_tokens": 710, + 
"number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18383749677154193, + "ttft_s": 9.417994557999918, + "end_to_end_latency_s": 29.781903828001305, + "request_output_throughput_token_per_s": 5.070192989409494, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18507156443485645, + "ttft_s": 9.4354373999995, + "end_to_end_latency_s": 29.796772747000432, + "request_output_throughput_token_per_s": 5.067662907057637, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18412555747823908, + "ttft_s": 13.854313046998868, + "end_to_end_latency_s": 29.644425710999712, + "request_output_throughput_token_per_s": 5.093706367331336, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1823089275215581, + "ttft_s": 9.35548721899977, + "end_to_end_latency_s": 29.71659700300006, + "request_output_throughput_token_per_s": 5.081335523874274, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18319971690675488, + "ttft_s": 13.705963032, + "end_to_end_latency_s": 29.495418497001083, + "request_output_throughput_token_per_s": 5.119439143247036, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19298332000649773, + "ttft_s": 9.352995176999684, + "end_to_end_latency_s": 29.719713792999755, + "request_output_throughput_token_per_s": 5.080802629921923, + "number_total_tokens": 704, + "number_output_tokens": 154, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18749364197460808, + "ttft_s": 13.832489758999145, + "end_to_end_latency_s": 29.624207572998785, + "request_output_throughput_token_per_s": 5.097182755957669, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18853254076917345, + "ttft_s": 16.48987656600002, + "end_to_end_latency_s": 29.41129137200005, + "request_output_throughput_token_per_s": 5.134082624598866, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18965682347093646, + "ttft_s": 16.478121235999424, + "end_to_end_latency_s": 29.39702656999907, + "request_output_throughput_token_per_s": 5.136573919829769, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19073629528020608, + "ttft_s": 5.026972857000146, + "end_to_end_latency_s": 29.94580734199917, + "request_output_throughput_token_per_s": 5.042442111360999, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18689022546835918, + "ttft_s": 13.737076530000195, + "end_to_end_latency_s": 29.528880955000204, + "request_output_throughput_token_per_s": 5.113637737580122, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1825262461925659, + "ttft_s": 16.46446330099934, + "end_to_end_latency_s": 29.38701008099997, + "request_output_throughput_token_per_s": 5.1383247082229815, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + 
"error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18680166340248508, + "ttft_s": 9.338963157999387, + "end_to_end_latency_s": 29.701731192999432, + "request_output_throughput_token_per_s": 5.08387874830643, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18022338378774072, + "ttft_s": 9.379099761001271, + "end_to_end_latency_s": 29.73714113599999, + "request_output_throughput_token_per_s": 5.077825044089337, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1842095618625194, + "ttft_s": 16.55262673800098, + "end_to_end_latency_s": 29.47372839000127, + "request_output_throughput_token_per_s": 5.12320660630182, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18183608431931989, + "ttft_s": 5.272918511000171, + "end_to_end_latency_s": 30.18502257, + "request_output_throughput_token_per_s": 5.002480937353163, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18397510939264675, + "ttft_s": 5.067439293999996, + "end_to_end_latency_s": 29.988178843999776, + "request_output_throughput_token_per_s": 5.035317442433255, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1828790423851218, + "ttft_s": 16.532314168000084, + "end_to_end_latency_s": 29.443812556999546, + "request_output_throughput_token_per_s": 5.128411944196522, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.18182373020121437, + "ttft_s": 9.458290262000446, + "end_to_end_latency_s": 29.819345036001323, + "request_output_throughput_token_per_s": 5.06382684856745, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18408032769520571, + "ttft_s": 0.7055794860007154, + "end_to_end_latency_s": 30.189881780999713, + "request_output_throughput_token_per_s": 5.001675763269576, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18322866454325126, + "ttft_s": 13.88878339799885, + "end_to_end_latency_s": 29.683258117998776, + "request_output_throughput_token_per_s": 5.087042648746145, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18217566946146363, + "ttft_s": 14.551511996000045, + "end_to_end_latency_s": 28.419694420999804, + "request_output_throughput_token_per_s": 5.313216875703755, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18331970481646254, + "ttft_s": 10.544602613999814, + "end_to_end_latency_s": 28.964722065000387, + "request_output_throughput_token_per_s": 5.213238354614192, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1952998881719117, + "ttft_s": 3.1193220000004658, + "end_to_end_latency_s": 30.662300457999663, + "request_output_throughput_token_per_s": 4.924614192168505, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.1716161307424402, + "ttft_s": 14.103114835999804, + "end_to_end_latency_s": 27.97371289200055, + "request_output_throughput_token_per_s": 5.397924851197727, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19201513912019758, + "ttft_s": 7.357982933999665, + "end_to_end_latency_s": 30.33858481199968, + "request_output_throughput_token_per_s": 4.977160303808096, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1941503357359139, + "ttft_s": 3.3268043320003926, + "end_to_end_latency_s": 30.870132084000943, + "request_output_throughput_token_per_s": 4.89145947251255, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17440748673914822, + "ttft_s": 14.207620211000176, + "end_to_end_latency_s": 28.07980262700039, + "request_output_throughput_token_per_s": 5.377530675903134, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19122055792210524, + "ttft_s": 11.023940348999531, + "end_to_end_latency_s": 29.44817634300125, + "request_output_throughput_token_per_s": 5.127651989081054, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18640063570807192, + "ttft_s": 7.025172190999001, + "end_to_end_latency_s": 30.010682940999686, + "request_output_throughput_token_per_s": 5.031541611260981, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17700782914374713, + "ttft_s": 
14.447147652999774, + "end_to_end_latency_s": 28.32147943599921, + "request_output_throughput_token_per_s": 5.331642379107678, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17242992326074663, + "ttft_s": 13.886479507998956, + "end_to_end_latency_s": 27.76146054899982, + "request_output_throughput_token_per_s": 5.439195093265371, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18979337305846491, + "ttft_s": 10.80254837999928, + "end_to_end_latency_s": 29.22838276499897, + "request_output_throughput_token_per_s": 5.1662112547952095, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1909765878438975, + "ttft_s": 3.016446925999844, + "end_to_end_latency_s": 30.55650959600098, + "request_output_throughput_token_per_s": 4.941663887545645, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18107677789513543, + "ttft_s": 10.907952977000605, + "end_to_end_latency_s": 29.334621992000393, + "request_output_throughput_token_per_s": 5.147501135047112, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.20373834054304182, + "ttft_s": 3.2239734979993955, + "end_to_end_latency_s": 30.764756388000023, + "request_output_throughput_token_per_s": 4.9082137396315755, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18777081753937774, + "ttft_s": 0.5868577380006172, + 
"end_to_end_latency_s": 30.982370637000713, + "request_output_throughput_token_per_s": 4.873739384541096, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1838364240864304, + "ttft_s": 6.800740362999932, + "end_to_end_latency_s": 29.781743745999847, + "request_output_throughput_token_per_s": 5.0702202425699685, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17753623055412704, + "ttft_s": 13.997219843000494, + "end_to_end_latency_s": 27.87344063099954, + "request_output_throughput_token_per_s": 5.41734341300029, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18655018328398135, + "ttft_s": 7.241060640000796, + "end_to_end_latency_s": 30.22133775400107, + "request_output_throughput_token_per_s": 4.996469753560422, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1845635524424076, + "ttft_s": 2.9096364209999592, + "end_to_end_latency_s": 30.45319338599984, + "request_output_throughput_token_per_s": 4.958429091033152, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18566688839123824, + "ttft_s": 6.909129917999962, + "end_to_end_latency_s": 29.892575281999598, + "request_output_throughput_token_per_s": 5.05142158464104, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17675361711109064, + "ttft_s": 10.213405561999025, + "end_to_end_latency_s": 
28.63427529299952, + "request_output_throughput_token_per_s": 5.2734004424730925, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1738520542012739, + "ttft_s": 14.908094132999395, + "end_to_end_latency_s": 27.642725301999235, + "request_output_throughput_token_per_s": 5.4625583530679975, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17089487850304674, + "ttft_s": 14.320233121001365, + "end_to_end_latency_s": 28.19789552600014, + "request_output_throughput_token_per_s": 5.355009555970906, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18424405129483673, + "ttft_s": 10.31854379500146, + "end_to_end_latency_s": 28.742300822999823, + "request_output_throughput_token_per_s": 5.253580808644538, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1858862066791568, + "ttft_s": 6.571631216000242, + "end_to_end_latency_s": 29.556135033000828, + "request_output_throughput_token_per_s": 5.108922388918623, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18427252340996528, + "ttft_s": 6.683987231001083, + "end_to_end_latency_s": 29.668059122999693, + "request_output_throughput_token_per_s": 5.089648748978649, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17923242613052812, + "ttft_s": 10.430475986000602, + "end_to_end_latency_s": 28.8566054540006, + 
"request_output_throughput_token_per_s": 5.232770716593963, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1718989902168665, + "ttft_s": 14.657230737000646, + "end_to_end_latency_s": 28.535450700001093, + "request_output_throughput_token_per_s": 5.29166339748733, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18536151940135162, + "ttft_s": 10.674892339000507, + "end_to_end_latency_s": 29.101961309001126, + "request_output_throughput_token_per_s": 5.188653726692169, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.16589056779520686, + "ttft_s": 14.79840513199997, + "end_to_end_latency_s": 27.53805897500024, + "request_output_throughput_token_per_s": 5.483320379881592, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19189686998727487, + "ttft_s": 7.1337202380000235, + "end_to_end_latency_s": 30.12801943100021, + "request_output_throughput_token_per_s": 5.011945785079673, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1739542130613097, + "ttft_s": 13.783751205999579, + "end_to_end_latency_s": 28.354913552999278, + "request_output_throughput_token_per_s": 5.32535568192652, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1806240043999651, + "ttft_s": 9.785407752999163, + "end_to_end_latency_s": 28.90009101999931, + "request_output_throughput_token_per_s": 
5.2248970390960245, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18143454312416202, + "ttft_s": 10.095799519000138, + "end_to_end_latency_s": 29.211147964999327, + "request_output_throughput_token_per_s": 5.169259358821761, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17465610976687235, + "ttft_s": 13.896353685000577, + "end_to_end_latency_s": 28.46919505400001, + "request_output_throughput_token_per_s": 5.303978553435919, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17011413782101617, + "ttft_s": 14.735193593000076, + "end_to_end_latency_s": 27.558708308999485, + "request_output_throughput_token_per_s": 5.479211808729436, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17364474348731257, + "ttft_s": 14.611302469998918, + "end_to_end_latency_s": 27.4360848719989, + "request_output_throughput_token_per_s": 5.503700717667252, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.16860662324690992, + "ttft_s": 14.488902666000286, + "end_to_end_latency_s": 27.314540618999672, + "request_output_throughput_token_per_s": 5.5281910871664515, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18389566159270773, + "ttft_s": 2.4852468380013306, + "end_to_end_latency_s": 30.71077514799981, + "request_output_throughput_token_per_s": 4.916841052441967, + 
"number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19803752375167272, + "ttft_s": 0.5851558670001396, + "end_to_end_latency_s": 31.09210073000031, + "request_output_throughput_token_per_s": 4.8565390068449865, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18877476496858, + "ttft_s": 6.360767611000483, + "end_to_end_latency_s": 30.01541052200082, + "request_output_throughput_token_per_s": 5.030749117668052, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1899980555483419, + "ttft_s": 10.33060298899909, + "end_to_end_latency_s": 29.449900014999002, + "request_output_throughput_token_per_s": 5.127351872946762, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17115307721477468, + "ttft_s": 13.322221648000777, + "end_to_end_latency_s": 27.898171037000793, + "request_output_throughput_token_per_s": 5.412541194895238, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.16874560977436065, + "ttft_s": 13.098061962999054, + "end_to_end_latency_s": 27.674541557998964, + "request_output_throughput_token_per_s": 5.45627827957119, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17729344643672287, + "ttft_s": 13.435214162000193, + "end_to_end_latency_s": 28.012554417999127, + "request_output_throughput_token_per_s": 5.390440220009953, + "number_total_tokens": 708, + 
"number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18928882304424152, + "ttft_s": 6.249790112999108, + "end_to_end_latency_s": 29.907827066999744, + "request_output_throughput_token_per_s": 5.048845563461653, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1787968574113574, + "ttft_s": 13.671353949001059, + "end_to_end_latency_s": 28.25016236400006, + "request_output_throughput_token_per_s": 5.345102022932914, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17926992936254466, + "ttft_s": 9.560680721000608, + "end_to_end_latency_s": 28.683376619999763, + "request_output_throughput_token_per_s": 5.264373229151612, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18828335103744392, + "ttft_s": 6.465156915000989, + "end_to_end_latency_s": 30.125522439000633, + "request_output_throughput_token_per_s": 5.012361206540097, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1796365087530819, + "ttft_s": 9.985314866999033, + "end_to_end_latency_s": 29.101317980999738, + "request_output_throughput_token_per_s": 5.188768429615042, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18783332490060026, + "ttft_s": 6.588681854000242, + "end_to_end_latency_s": 30.241363539000304, + "request_output_throughput_token_per_s": 4.993161098879196, + "number_total_tokens": 711, + "number_output_tokens": 161, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17921015480895883, + "ttft_s": 13.55517247500029, + "end_to_end_latency_s": 28.13625279900043, + "request_output_throughput_token_per_s": 5.366741658127426, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19790546907696785, + "ttft_s": 2.6473416089993407, + "end_to_end_latency_s": 30.873473965999437, + "request_output_throughput_token_per_s": 4.890929999205609, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18449314076737347, + "ttft_s": 10.216256688001522, + "end_to_end_latency_s": 29.33467509300135, + "request_output_throughput_token_per_s": 5.147491817150737, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1965186740529899, + "ttft_s": 6.017466664001404, + "end_to_end_latency_s": 29.67450905500118, + "request_output_throughput_token_per_s": 5.088542483386133, + "number_total_tokens": 701, + "number_output_tokens": 151, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1742952096706509, + "ttft_s": 14.002949656000055, + "end_to_end_latency_s": 28.58461690799959, + "request_output_throughput_token_per_s": 5.28256161298218, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18223675586696003, + "ttft_s": 9.67323014899921, + "end_to_end_latency_s": 28.793598287998975, + "request_output_throughput_token_per_s": 5.244221249795515, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + 
"error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19746882502586233, + "ttft_s": 2.3783022010011337, + "end_to_end_latency_s": 30.60784841200075, + "request_output_throughput_token_per_s": 4.933375190815301, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19057461069371584, + "ttft_s": 6.834242201000961, + "end_to_end_latency_s": 30.492169433000527, + "request_output_throughput_token_per_s": 4.952091071505669, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18739494041513372, + "ttft_s": 6.137431456998456, + "end_to_end_latency_s": 29.795998136998605, + "request_output_throughput_token_per_s": 5.0677946516750065, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19370276805623235, + "ttft_s": 2.76075550799942, + "end_to_end_latency_s": 30.992704299000252, + "request_output_throughput_token_per_s": 4.872114370634991, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18829758380892436, + "ttft_s": 10.439072355000462, + "end_to_end_latency_s": 29.562974909998957, + "request_output_throughput_token_per_s": 5.10774035629709, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18974586053755046, + "ttft_s": 6.696726583000782, + "end_to_end_latency_s": 30.359549669001353, + "request_output_throughput_token_per_s": 4.973723314288113, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": 
"", + "inter_token_latency_s": 0.1854462610428039, + "ttft_s": 6.834160378999513, + "end_to_end_latency_s": 30.413446556000054, + "request_output_throughput_token_per_s": 4.964909179956465, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19019444240497071, + "ttft_s": 0.5874477369998203, + "end_to_end_latency_s": 31.002014158000748, + "request_output_throughput_token_per_s": 4.870651281895217, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1882511222546731, + "ttft_s": 6.727244313000483, + "end_to_end_latency_s": 30.30864774800102, + "request_output_throughput_token_per_s": 4.982076444171254, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1834306908598392, + "ttft_s": 6.502965177998703, + "end_to_end_latency_s": 30.082929353999134, + "request_output_throughput_token_per_s": 5.019457986392091, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18737790028757217, + "ttft_s": 6.400350537000122, + "end_to_end_latency_s": 29.980662208001377, + "request_output_throughput_token_per_s": 5.036579877802046, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19213360912578062, + "ttft_s": 2.4161526530006086, + "end_to_end_latency_s": 30.549438144000305, + "request_output_throughput_token_per_s": 4.942807762559631, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.19193459955287767, + "ttft_s": 2.7663255969982856, + "end_to_end_latency_s": 30.901663594999263, + "request_output_throughput_token_per_s": 4.886468313778289, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17946020138893845, + "ttft_s": 10.044737876, + "end_to_end_latency_s": 29.072875827001553, + "request_output_throughput_token_per_s": 5.193844630250101, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18354118513459, + "ttft_s": 14.160799344001134, + "end_to_end_latency_s": 28.632662042000447, + "request_output_throughput_token_per_s": 5.2736975618439645, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18024467070125222, + "ttft_s": 10.52495415300109, + "end_to_end_latency_s": 29.560315165999782, + "request_output_throughput_token_per_s": 5.108199934677284, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19145056471612817, + "ttft_s": 6.087292297001113, + "end_to_end_latency_s": 29.675038822000715, + "request_output_throughput_token_per_s": 5.088451641318509, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.179181566515451, + "ttft_s": 9.819544488998872, + "end_to_end_latency_s": 28.848509610999827, + "request_output_throughput_token_per_s": 5.234239204593927, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18002211221798858, + "ttft_s": 
13.610606793001352, + "end_to_end_latency_s": 28.08369773800041, + "request_output_throughput_token_per_s": 5.376784831139952, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1833242499754423, + "ttft_s": 6.293273378998492, + "end_to_end_latency_s": 29.882070227999066, + "request_output_throughput_token_per_s": 5.053197413963481, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18380721485007143, + "ttft_s": 10.379241421000188, + "end_to_end_latency_s": 29.409375788000034, + "request_output_throughput_token_per_s": 5.134417033822691, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.19294721462263498, + "ttft_s": 2.5394485689994326, + "end_to_end_latency_s": 30.67882196699975, + "request_output_throughput_token_per_s": 4.921962132784173, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17957455708911205, + "ttft_s": 13.719737918001556, + "end_to_end_latency_s": 28.19340399900102, + "request_output_throughput_token_per_s": 5.355862669344589, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18870227584375243, + "ttft_s": 6.609439919000579, + "end_to_end_latency_s": 30.19258470600107, + "request_output_throughput_token_per_s": 5.001227999204297, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.1784278906001653, + "ttft_s": 14.934286086998327, + 
"end_to_end_latency_s": 27.656511377999777, + "request_output_throughput_token_per_s": 5.459835404985951, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18891484496933875, + "ttft_s": 2.657215543998973, + "end_to_end_latency_s": 30.79344108299847, + "request_output_throughput_token_per_s": 4.903641642160266, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18021504211106662, + "ttft_s": 10.163529534998816, + "end_to_end_latency_s": 29.195051018999948, + "request_output_throughput_token_per_s": 5.172109475052131, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18265515645403319, + "ttft_s": 6.189169007999226, + "end_to_end_latency_s": 29.772990163999566, + "request_output_throughput_token_per_s": 5.071710942308502, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17831929408737324, + "ttft_s": 14.05561963999935, + "end_to_end_latency_s": 28.531294917000196, + "request_output_throughput_token_per_s": 5.292434165335678, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.18785217071154664, + "ttft_s": 10.272232064000491, + "end_to_end_latency_s": 29.305127494999397, + "request_output_throughput_token_per_s": 5.152681899294467, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17582936829818233, + "ttft_s": 13.831828989999849, + "end_to_end_latency_s": 
28.30872700299915, + "request_output_throughput_token_per_s": 5.334044161858723, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17989436391143257, + "ttft_s": 13.946530744999109, + "end_to_end_latency_s": 28.423530258998653, + "request_output_throughput_token_per_s": 5.312499841647737, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17679367806377527, + "ttft_s": 15.037139961999856, + "end_to_end_latency_s": 27.756841970000096, + "request_output_throughput_token_per_s": 5.440100144072675, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.16785901413847118, + "ttft_s": 13.386736172000383, + "end_to_end_latency_s": 27.86481269000069, + "request_output_throughput_token_per_s": 5.41902081596215, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17996877308078804, + "ttft_s": 9.94007614900147, + "end_to_end_latency_s": 28.975162819000616, + "request_output_throughput_token_per_s": 5.211359844403737, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17062998996354464, + "ttft_s": 13.504040091000206, + "end_to_end_latency_s": 27.983510476000447, + "request_output_throughput_token_per_s": 5.396034930267324, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17529491829274288, + "ttft_s": 9.711905103999015, + "end_to_end_latency_s": 28.748602142999516, + 
"request_output_throughput_token_per_s": 5.252429292001926, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.17542939489806247, + "ttft_s": 14.81967742300003, + "end_to_end_latency_s": 27.542878715999905, + "request_output_throughput_token_per_s": 5.482360851129289, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..73b2294 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_32/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 32, + "results_inter_token_latency_s_quantiles_p25": 0.17941263338234, + "results_inter_token_latency_s_quantiles_p50": 0.18393538549267724, + "results_inter_token_latency_s_quantiles_p75": 0.18872039812495933, + "results_inter_token_latency_s_quantiles_p90": 0.19195876142307364, + "results_inter_token_latency_s_quantiles_p95": 0.19489754481931246, + "results_inter_token_latency_s_quantiles_p99": 0.1980018689895024, + "results_inter_token_latency_s_mean": 0.18377501972261423, + "results_inter_token_latency_s_min": 0.16589056779520686, + "results_inter_token_latency_s_max": 0.20373834054304182, + "results_inter_token_latency_s_stddev": 0.007179570859092934, + "results_ttft_s_quantiles_p25": 6.282402562498646, + "results_ttft_s_quantiles_p50": 9.879810319000171, + "results_ttft_s_quantiles_p75": 
13.74874519900004, + "results_ttft_s_quantiles_p90": 14.569449138199706, + "results_ttft_s_quantiles_p95": 14.925118903098701, + "results_ttft_s_quantiles_p99": 16.52085601546007, + "results_ttft_s_mean": 9.410386498195265, + "results_ttft_s_min": 0.5851558670001396, + "results_ttft_s_max": 16.55262673800098, + "results_ttft_s_stddev": 4.450480262001289, + "results_end_to_end_latency_s_quantiles_p25": 28.63387198024975, + "results_end_to_end_latency_s_quantiles_p50": 29.558225099500305, + "results_end_to_end_latency_s_quantiles_p75": 30.0322902300004, + "results_end_to_end_latency_s_quantiles_p90": 30.664444783600082, + "results_end_to_end_latency_s_quantiles_p95": 30.872304307299963, + "results_end_to_end_latency_s_quantiles_p99": 31.06777735556043, + "results_end_to_end_latency_s_mean": 29.391554288968734, + "results_end_to_end_latency_s_min": 27.314540618999672, + "results_end_to_end_latency_s_max": 31.103511895998963, + "results_end_to_end_latency_s_stddev": 0.9700369443574689, + "results_request_output_throughput_token_per_s_quantiles_p25": 5.027926334849062, + "results_request_output_throughput_token_per_s_quantiles_p50": 5.108561161797954, + "results_request_output_throughput_token_per_s_quantiles_p75": 5.273474722315811, + "results_request_output_throughput_token_per_s_quantiles_p90": 5.40230975430698, + "results_request_output_throughput_token_per_s_quantiles_p95": 5.4585904110907855, + "results_request_output_throughput_token_per_s_quantiles_p99": 5.498198026465124, + "results_request_output_throughput_token_per_s_mean": 5.143153329575094, + "results_request_output_throughput_token_per_s_min": 4.854757253936461, + "results_request_output_throughput_token_per_s_max": 5.5281910871664515, + "results_request_output_throughput_token_per_s_stddev": 0.1718206332309483, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + 
"results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 157.0, + "results_number_output_tokens_quantiles_p50": 160.0, + "results_number_output_tokens_quantiles_p75": 162.0, + "results_number_output_tokens_quantiles_p90": 164.0, + "results_number_output_tokens_quantiles_p95": 165.0, + "results_number_output_tokens_quantiles_p99": 166.73000000000002, + "results_number_output_tokens_mean": 160.0, + "results_number_output_tokens_min": "151", + "results_number_output_tokens_max": "168", + "results_number_output_tokens_stddev": 3.322554751804725, + "results_num_requests_started": 128, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 140.19221918748778, + "results_num_completed_requests": 128, + "results_num_completed_requests_per_min": 52.57208219530792, + "timestamp": 1718209442 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..586822d --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1102 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06037391052870972, + "ttft_s": 2.165774205001071, + "end_to_end_latency_s": 9.478942657000516, + "request_output_throughput_token_per_s": 15.930046785174024, + "number_total_tokens": 707, + "number_output_tokens": 
157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059866335253129274, + "ttft_s": 2.145463146000111, + "end_to_end_latency_s": 9.459108183000353, + "request_output_throughput_token_per_s": 15.963449944612433, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06066638728030727, + "ttft_s": 2.210455296000873, + "end_to_end_latency_s": 9.524845193000147, + "request_output_throughput_token_per_s": 15.853276031296614, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06351978905927592, + "ttft_s": 0.5894748329992581, + "end_to_end_latency_s": 9.655245849999119, + "request_output_throughput_token_per_s": 15.639166764460356, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06021197961397291, + "ttft_s": 2.2144013960005395, + "end_to_end_latency_s": 9.51371521100009, + "request_output_throughput_token_per_s": 15.87182258991824, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.061106229861874646, + "ttft_s": 1.988392469000246, + "end_to_end_latency_s": 9.288404631999583, + "request_output_throughput_token_per_s": 16.25682837715621, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05833725756958963, + "ttft_s": 0.5835365269995236, + "end_to_end_latency_s": 9.625869643999977, + "request_output_throughput_token_per_s": 15.686894336255813, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + 
}, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05808435760505228, + "ttft_s": 2.1107337619996542, + "end_to_end_latency_s": 9.409862754000642, + "request_output_throughput_token_per_s": 16.046992814619077, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057234081251525674, + "ttft_s": 2.0197226090003824, + "end_to_end_latency_s": 9.329551717999493, + "request_output_throughput_token_per_s": 16.18512920708461, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060486435166613096, + "ttft_s": 2.126237009999386, + "end_to_end_latency_s": 9.436078249000275, + "request_output_throughput_token_per_s": 16.002410748978054, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05791622844246211, + "ttft_s": 2.246302444000321, + "end_to_end_latency_s": 9.556413506999888, + "request_output_throughput_token_per_s": 15.800906887232895, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.061928451608986135, + "ttft_s": 0.5911336629997095, + "end_to_end_latency_s": 9.661145403999399, + "request_output_throughput_token_per_s": 15.629616746839451, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060490451512676006, + "ttft_s": 2.2395884090001346, + "end_to_end_latency_s": 9.557678862998728, + "request_output_throughput_token_per_s": 15.79881498054682, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + 
"error_msg": "", + "inter_token_latency_s": 0.06313020709144826, + "ttft_s": 0.5928568960007397, + "end_to_end_latency_s": 9.659193409999716, + "request_output_throughput_token_per_s": 15.632775283666717, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05802216485886239, + "ttft_s": 2.1402726090000215, + "end_to_end_latency_s": 9.457839336999314, + "request_output_throughput_token_per_s": 15.965591571140786, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05809967236644478, + "ttft_s": 2.0363994119998097, + "end_to_end_latency_s": 9.354250843000045, + "request_output_throughput_token_per_s": 16.142393713227822, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06124250707698126, + "ttft_s": 2.246868711999923, + "end_to_end_latency_s": 9.554015706000428, + "request_output_throughput_token_per_s": 15.804872489916885, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059615605999978366, + "ttft_s": 0.5895841340006882, + "end_to_end_latency_s": 9.657957485000225, + "request_output_throughput_token_per_s": 15.634775803736776, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05987450037167573, + "ttft_s": 2.0336352470003476, + "end_to_end_latency_s": 9.340629270000136, + "request_output_throughput_token_per_s": 16.165934396409014, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.059767934500006406, + "ttft_s": 2.135562059000222, + "end_to_end_latency_s": 9.443612585000665, + "request_output_throughput_token_per_s": 15.989643649701813, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05862903385086441, + "ttft_s": 2.134026839999933, + "end_to_end_latency_s": 9.439467472000615, + "request_output_throughput_token_per_s": 15.996665113566712, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058832256670695374, + "ttft_s": 0.5897544259987626, + "end_to_end_latency_s": 9.648689096000453, + "request_output_throughput_token_per_s": 15.649794339688288, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05651141863313973, + "ttft_s": 2.2452649509996263, + "end_to_end_latency_s": 9.55066176299988, + "request_output_throughput_token_per_s": 15.81042274839923, + "number_total_tokens": 719, + "number_output_tokens": 169, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05871443215087396, + "ttft_s": 2.030622188000052, + "end_to_end_latency_s": 9.335791047000384, + "request_output_throughput_token_per_s": 16.17431230409947, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06157168235294865, + "ttft_s": 2.119250250998448, + "end_to_end_latency_s": 9.420841493998523, + "request_output_throughput_token_per_s": 16.02829217498176, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.059500664512506775, + "ttft_s": 2.2177009789993463, + "end_to_end_latency_s": 9.520299235000493, + "request_output_throughput_token_per_s": 15.860845995771074, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06257366840920393, + "ttft_s": 0.5832874829993671, + "end_to_end_latency_s": 9.636866003998875, + "request_output_throughput_token_per_s": 15.668994457050834, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05778614885711707, + "ttft_s": 2.002114693999829, + "end_to_end_latency_s": 9.304599609000434, + "request_output_throughput_token_per_s": 16.228532805854016, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05837872286498235, + "ttft_s": 2.211785252999107, + "end_to_end_latency_s": 9.515922099000818, + "request_output_throughput_token_per_s": 15.868141671300059, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06095780300634206, + "ttft_s": 0.5830475610000576, + "end_to_end_latency_s": 9.631929571, + "request_output_throughput_token_per_s": 15.677024929110125, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05889954796830237, + "ttft_s": 2.001721439000903, + "end_to_end_latency_s": 9.306348621001234, + "request_output_throughput_token_per_s": 16.225482855783508, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05810804370366334, + "ttft_s": 
2.1085184029998345, + "end_to_end_latency_s": 9.413737003000278, + "request_output_throughput_token_per_s": 16.040388631196556, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06137856597448193, + "ttft_s": 0.5834070450000581, + "end_to_end_latency_s": 9.63662455099984, + "request_output_throughput_token_per_s": 15.669387055691933, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058858877814780074, + "ttft_s": 2.2396589599993604, + "end_to_end_latency_s": 9.535580879999543, + "request_output_throughput_token_per_s": 15.835427531920555, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06081673990327897, + "ttft_s": 2.1313753760005056, + "end_to_end_latency_s": 9.426809239999784, + "request_output_throughput_token_per_s": 16.018145287090107, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05896595267095799, + "ttft_s": 2.0203199769985076, + "end_to_end_latency_s": 9.316817654998886, + "request_output_throughput_token_per_s": 16.207250757878878, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05838258698130636, + "ttft_s": 2.095576298999731, + "end_to_end_latency_s": 9.399793046000923, + "request_output_throughput_token_per_s": 16.064183462447815, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05808963647496057, + "ttft_s": 1.9904604450002807, + 
"end_to_end_latency_s": 9.295301560001462, + "request_output_throughput_token_per_s": 16.244766135373908, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.061743597422081346, + "ttft_s": 2.2042417369993927, + "end_to_end_latency_s": 9.50872675699975, + "request_output_throughput_token_per_s": 15.880149241731333, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06178566625634957, + "ttft_s": 0.5878500219987473, + "end_to_end_latency_s": 9.638769048999166, + "request_output_throughput_token_per_s": 15.665900825342316, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060232982617820716, + "ttft_s": 2.124140563000765, + "end_to_end_latency_s": 9.456779204001577, + "request_output_throughput_token_per_s": 15.967381361310126, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060886693535099, + "ttft_s": 2.2272264110015385, + "end_to_end_latency_s": 9.559710270001233, + "request_output_throughput_token_per_s": 15.795457784305897, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05672243147273548, + "ttft_s": 2.0257152959984523, + "end_to_end_latency_s": 9.359442330998718, + "request_output_throughput_token_per_s": 16.133439863172622, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060390185718722476, + "ttft_s": 0.5891020880008, + "end_to_end_latency_s": 
9.662663916000383, + "request_output_throughput_token_per_s": 15.627160513154084, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05842916723915907, + "ttft_s": 2.2120939359992917, + "end_to_end_latency_s": 9.524163493000742, + "request_output_throughput_token_per_s": 15.854410742840471, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0592629460251425, + "ttft_s": 2.1109845850005513, + "end_to_end_latency_s": 9.423048442000436, + "request_output_throughput_token_per_s": 16.024538229790096, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06023380156249232, + "ttft_s": 0.5849158729997725, + "end_to_end_latency_s": 9.637615263000043, + "request_output_throughput_token_per_s": 15.66777629936184, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05720613199394901, + "ttft_s": 2.0100244549994386, + "end_to_end_latency_s": 9.3248579480005, + "request_output_throughput_token_per_s": 16.193276170215384, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060938755587102926, + "ttft_s": 2.1201079819984443, + "end_to_end_latency_s": 9.44576120199963, + "request_output_throughput_token_per_s": 15.9860065028993, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058016838695626544, + "ttft_s": 2.0151267999990523, + "end_to_end_latency_s": 9.340892991998771, + 
"request_output_throughput_token_per_s": 16.165477982602273, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05746695856550148, + "ttft_s": 0.5817398839990346, + "end_to_end_latency_s": 9.654639341999427, + "request_output_throughput_token_per_s": 15.64014922267709, + "number_total_tokens": 718, + "number_output_tokens": 168, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06085263047129869, + "ttft_s": 2.2282773940005427, + "end_to_end_latency_s": 9.554049016000135, + "request_output_throughput_token_per_s": 15.804817386546874, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058731136157196794, + "ttft_s": 2.012164173000201, + "end_to_end_latency_s": 9.338512432999778, + "request_output_throughput_token_per_s": 16.169598860992767, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0598087390696342, + "ttft_s": 2.1241651229993295, + "end_to_end_latency_s": 9.449967115999243, + "request_output_throughput_token_per_s": 15.978891581998187, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.061921217910343655, + "ttft_s": 0.5874733870005002, + "end_to_end_latency_s": 9.659978429999683, + "request_output_throughput_token_per_s": 15.631504883184812, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05791150184241732, + "ttft_s": 2.229827123999712, + "end_to_end_latency_s": 9.555754607999916, + 
"request_output_throughput_token_per_s": 15.80199640890583, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05915511191872156, + "ttft_s": 2.1301786409985652, + "end_to_end_latency_s": 9.465013588998772, + "request_output_throughput_token_per_s": 15.953490037828152, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06116028692414619, + "ttft_s": 0.5838088799991965, + "end_to_end_latency_s": 9.663506315000632, + "request_output_throughput_token_per_s": 15.625798243190792, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06170760421935542, + "ttft_s": 2.2302054389983823, + "end_to_end_latency_s": 9.564989425998647, + "request_output_throughput_token_per_s": 15.786739877575414, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05810328054039872, + "ttft_s": 2.0200467630002095, + "end_to_end_latency_s": 9.355979465000928, + "request_output_throughput_token_per_s": 16.139411225181117, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06163224873374953, + "ttft_s": 2.1328472240002156, + "end_to_end_latency_s": 9.491792540999086, + "request_output_throughput_token_per_s": 15.908480863626847, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059084795339621674, + "ttft_s": 2.0351041249996342, + "end_to_end_latency_s": 9.394724489999135, + 
"request_output_throughput_token_per_s": 16.07285026407559, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05989254905561222, + "ttft_s": 0.5867613079990406, + "end_to_end_latency_s": 9.70280793799975, + "request_output_throughput_token_per_s": 15.562505304122189, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06188880865800808, + "ttft_s": 2.233784824000395, + "end_to_end_latency_s": 9.59300684399932, + "request_output_throughput_token_per_s": 15.740632989796573, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05929699858391342, + "ttft_s": 2.225186813999244, + "end_to_end_latency_s": 9.547178597998936, + "request_output_throughput_token_per_s": 15.816190977264132, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06036096308131391, + "ttft_s": 0.5882526170007623, + "end_to_end_latency_s": 9.657979745001285, + "request_output_throughput_token_per_s": 15.6347397682371, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05867015381989867, + "ttft_s": 2.123781909000172, + "end_to_end_latency_s": 9.446097766998719, + "request_output_throughput_token_per_s": 15.9854369205811, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059728724096168244, + "ttft_s": 1.9953027869996731, + "end_to_end_latency_s": 9.31793603999904, + "request_output_throughput_token_per_s": 
16.205305483081588, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057254976186731615, + "ttft_s": 2.213682567999058, + "end_to_end_latency_s": 9.50451533299929, + "request_output_throughput_token_per_s": 15.887185691177136, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058251879786170684, + "ttft_s": 1.9712187999994057, + "end_to_end_latency_s": 9.262298777999604, + "request_output_throughput_token_per_s": 16.30264836183699, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06083894489869106, + "ttft_s": 0.5889721660005307, + "end_to_end_latency_s": 9.612762396000107, + "request_output_throughput_token_per_s": 15.708283818898035, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058699329087505706, + "ttft_s": 2.101089868998315, + "end_to_end_latency_s": 9.39239281999835, + "request_output_throughput_token_per_s": 16.076840363670666, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0568336664383749, + "ttft_s": 1.3178853010012972, + "end_to_end_latency_s": 9.207908083000802, + "request_output_throughput_token_per_s": 16.398947365555152, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05760943496837091, + "ttft_s": 1.2124936139989586, + "end_to_end_latency_s": 9.102543507000519, + "request_output_throughput_token_per_s": 16.58876992830301, + 
"number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059726717764042064, + "ttft_s": 0.5842601859985734, + "end_to_end_latency_s": 9.616206690998297, + "request_output_throughput_token_per_s": 15.702657487733772, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057573573205095624, + "ttft_s": 1.6768346480002947, + "end_to_end_latency_s": 8.981673782000144, + "request_output_throughput_token_per_s": 16.81201117575811, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05878406632702147, + "ttft_s": 2.0088017399993987, + "end_to_end_latency_s": 9.347559338000792, + "request_output_throughput_token_per_s": 16.15394934013814, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05985744374700954, + "ttft_s": 2.1187126340009854, + "end_to_end_latency_s": 9.458363973000814, + "request_output_throughput_token_per_s": 15.964705992604436, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060192687792545, + "ttft_s": 2.23209092199977, + "end_to_end_latency_s": 9.57085650099907, + "request_output_throughput_token_per_s": 15.777062375163352, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059020015701185556, + "ttft_s": 0.5860092490001989, + "end_to_end_latency_s": 9.679914055001063, + "request_output_throughput_token_per_s": 15.599312054014245, + "number_total_tokens": 714, + 
"number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06092867580390921, + "ttft_s": 0.5827256270004, + "end_to_end_latency_s": 9.626944568000908, + "request_output_throughput_token_per_s": 15.685142771249595, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06023374781012489, + "ttft_s": 2.214133933000994, + "end_to_end_latency_s": 9.517535650000355, + "request_output_throughput_token_per_s": 15.86545147324921, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05902519393081699, + "ttft_s": 2.0821503959996335, + "end_to_end_latency_s": 9.385361239999838, + "request_output_throughput_token_per_s": 16.088885247852495, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05983456702580893, + "ttft_s": 1.971786185999008, + "end_to_end_latency_s": 9.275232903999495, + "request_output_throughput_token_per_s": 16.279914646120485, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06058826996165298, + "ttft_s": 2.20882540599996, + "end_to_end_latency_s": 9.512575037000715, + "request_output_throughput_token_per_s": 15.873724981160287, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.059991194896769406, + "ttft_s": 1.9947809400000551, + "end_to_end_latency_s": 9.29882860500038, + "request_output_throughput_token_per_s": 16.238604496785843, + "number_total_tokens": 705, + "number_output_tokens": 155, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057016180448540996, + "ttft_s": 2.103174604999367, + "end_to_end_latency_s": 9.407858287999261, + "request_output_throughput_token_per_s": 16.050411834180878, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058722130865878235, + "ttft_s": 0.5875120980017527, + "end_to_end_latency_s": 9.631572137001058, + "request_output_throughput_token_per_s": 15.677606713852246, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06064128412987423, + "ttft_s": 2.012377925000692, + "end_to_end_latency_s": 9.338965478000318, + "request_output_throughput_token_per_s": 16.168814453346975, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05938817603104925, + "ttft_s": 2.2352087319995917, + "end_to_end_latency_s": 9.561840764999943, + "request_output_throughput_token_per_s": 15.791938363240554, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05966026051848732, + "ttft_s": 0.5837988599996606, + "end_to_end_latency_s": 9.665338888000406, + "request_output_throughput_token_per_s": 15.622835551836438, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05981901109491814, + "ttft_s": 2.124487888000658, + "end_to_end_latency_s": 9.451836039999762, + "request_output_throughput_token_per_s": 15.975732054700751, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, 
+ { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.058395211735762086, + "ttft_s": 1.983398025000497, + "end_to_end_latency_s": 9.28505573899929, + "request_output_throughput_token_per_s": 16.26269181839874, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.062082356212903825, + "ttft_s": 0.5853514400005224, + "end_to_end_latency_s": 9.622964706000857, + "request_output_throughput_token_per_s": 15.691629826495857, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.057289109891618034, + "ttft_s": 2.2080342850003944, + "end_to_end_latency_s": 9.510452169000928, + "request_output_throughput_token_per_s": 15.877268222028452, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05871426503134671, + "ttft_s": 2.092628649999824, + "end_to_end_latency_s": 9.394468967000648, + "request_output_throughput_token_per_s": 16.073287434383793, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.06218536006535407, + "ttft_s": 2.2217521000002307, + "end_to_end_latency_s": 9.514557390999471, + "request_output_throughput_token_per_s": 15.870417697290065, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.05812927645280333, + "ttft_s": 1.9499228069998935, + "end_to_end_latency_s": 9.242745498000659, + "request_output_throughput_token_per_s": 16.337137058752024, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + 
"error_msg": "", + "inter_token_latency_s": 0.05780857240747368, + "ttft_s": 2.07311288100027, + "end_to_end_latency_s": 9.365266263999729, + "request_output_throughput_token_per_s": 16.123407038670862, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.060170025699994766, + "ttft_s": 0.5891366790001484, + "end_to_end_latency_s": 9.627427463999993, + "request_output_throughput_token_per_s": 15.684356030168695, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..ce709b0 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_4/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 4, + "results_inter_token_latency_s_quantiles_p25": 0.05836835654113417, + "results_inter_token_latency_s_quantiles_p50": 0.05963793325923285, + "results_inter_token_latency_s_quantiles_p75": 0.060647559917482496, + "results_inter_token_latency_s_quantiles_p90": 0.061711203539628015, + "results_inter_token_latency_s_quantiles_p95": 0.06193614683918202, + "results_inter_token_latency_s_quantiles_p99": 0.06313410291112653, + "results_inter_token_latency_s_mean": 0.05955418782851389, + "results_inter_token_latency_s_min": 0.05651141863313973, + "results_inter_token_latency_s_max": 0.06351978905927592, + "results_inter_token_latency_s_stddev": 
0.0015422369688723777, + "results_ttft_s_quantiles_p25": 1.0575844344994039, + "results_ttft_s_quantiles_p50": 2.035751768499722, + "results_ttft_s_quantiles_p75": 2.150540910750351, + "results_ttft_s_quantiles_p90": 2.2284323670004595, + "results_ttft_s_quantiles_p95": 2.2354277158496187, + "results_ttft_s_quantiles_p99": 2.246308106680317, + "results_ttft_s_mean": 1.7137107673198806, + "results_ttft_s_min": 0.5817398839990346, + "results_ttft_s_max": 2.246868711999923, + "results_ttft_s_stddev": 0.6706939569652745, + "results_end_to_end_latency_s_quantiles_p25": 9.363810280749476, + "results_end_to_end_latency_s_quantiles_p50": 9.485367598999801, + "results_end_to_end_latency_s_quantiles_p75": 9.597945731999516, + "results_end_to_end_latency_s_quantiles_p90": 9.65551701349923, + "results_end_to_end_latency_s_quantiles_p95": 9.661221329599448, + "results_end_to_end_latency_s_quantiles_p99": 9.680142993831051, + "results_end_to_end_latency_s_mean": 9.475706080719974, + "results_end_to_end_latency_s_min": 8.981673782000144, + "results_end_to_end_latency_s_max": 9.70280793799975, + "results_end_to_end_latency_s_stddev": 0.14084633113173364, + "results_request_output_throughput_token_per_s_quantiles_p25": 15.732545697071938, + "results_request_output_throughput_token_per_s_quantiles_p50": 15.919263824400435, + "results_request_output_throughput_token_per_s_quantiles_p75": 16.125915244796303, + "results_request_output_throughput_token_per_s_quantiles_p90": 16.229539974947198, + "results_request_output_throughput_token_per_s_quantiles_p95": 16.28105133190631, + "results_request_output_throughput_token_per_s_quantiles_p99": 16.59100234077756, + "results_request_output_throughput_token_per_s_mean": 15.939005488633883, + "results_request_output_throughput_token_per_s_min": 15.562505304122189, + "results_request_output_throughput_token_per_s_max": 16.81201117575811, + "results_request_output_throughput_token_per_s_stddev": 0.23909384202291645, + 
"results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 157.0, + "results_number_output_tokens_quantiles_p50": 159.0, + "results_number_output_tokens_quantiles_p75": 161.25, + "results_number_output_tokens_quantiles_p90": 164.0, + "results_number_output_tokens_quantiles_p95": 165.0, + "results_number_output_tokens_quantiles_p99": 168.01, + "results_number_output_tokens_mean": 159.18, + "results_number_output_tokens_min": "152", + "results_number_output_tokens_max": "169", + "results_number_output_tokens_stddev": 3.5742626561786883, + "results_num_requests_started": 100, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 64.53540931804359, + "results_num_completed_requests": 100, + "results_num_completed_requests_per_min": 24.32544640710275, + "timestamp": 1718208921 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..259b15a --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1410 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.04786341221152048, + "ttft_s": 0.607711055999971, + 
"end_to_end_latency_s": 7.466937597000651, + "request_output_throughput_token_per_s": 20.222480506687813, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32125405494842846, + "ttft_s": 0.7684905400001298, + "end_to_end_latency_s": 49.7946176789992, + "request_output_throughput_token_per_s": 3.032456258092408, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30858757657600255, + "ttft_s": 12.795462375999705, + "end_to_end_latency_s": 48.760831858999154, + "request_output_throughput_token_per_s": 3.0967478249067626, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3044279786792354, + "ttft_s": 21.508217861999583, + "end_to_end_latency_s": 48.404272162999405, + "request_output_throughput_token_per_s": 3.119559354007301, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30910967396182065, + "ttft_s": 21.63337335199867, + "end_to_end_latency_s": 48.53054782699837, + "request_output_throughput_token_per_s": 3.111442313371046, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3204527810921838, + "ttft_s": 12.740517844000351, + "end_to_end_latency_s": 48.70910833900052, + "request_output_throughput_token_per_s": 3.100036218053636, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29861432824846074, + "ttft_s": 25.722811827999976, + "end_to_end_latency_s": 
48.07720998399964, + "request_output_throughput_token_per_s": 3.1407812568627347, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3111854362422101, + "ttft_s": 5.01691036900047, + "end_to_end_latency_s": 50.10109599600037, + "request_output_throughput_token_per_s": 3.0139061231725255, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.324712338754597, + "ttft_s": 1.9706720630001655, + "end_to_end_latency_s": 51.62949088700043, + "request_output_throughput_token_per_s": 2.9246850473596218, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3032873805154935, + "ttft_s": 17.376643056999455, + "end_to_end_latency_s": 48.82960859800005, + "request_output_throughput_token_per_s": 3.0923860406733756, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30958065811320246, + "ttft_s": 8.706844180998814, + "end_to_end_latency_s": 49.22354993299996, + "request_output_throughput_token_per_s": 3.067637344432326, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2969756480245233, + "ttft_s": 26.049320670999805, + "end_to_end_latency_s": 48.40727645199877, + "request_output_throughput_token_per_s": 3.119365745555493, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30728924201275154, + "ttft_s": 21.64971967999918, + "end_to_end_latency_s": 48.551933879998614, + 
"request_output_throughput_token_per_s": 3.1100717918510297, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29728940356970157, + "ttft_s": 13.07931229300084, + "end_to_end_latency_s": 49.05301207299999, + "request_output_throughput_token_per_s": 3.0783023023190497, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3091124269102086, + "ttft_s": 25.86211285799982, + "end_to_end_latency_s": 48.22177473099873, + "request_output_throughput_token_per_s": 3.1313654638872435, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3019005457546458, + "ttft_s": 13.234779930000514, + "end_to_end_latency_s": 49.21001532900118, + "request_output_throughput_token_per_s": 3.068481060013213, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30624209462261, + "ttft_s": 17.23461793199931, + "end_to_end_latency_s": 48.69274253900039, + "request_output_throughput_token_per_s": 3.1010781510007726, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2965860999938741, + "ttft_s": 12.960767488999409, + "end_to_end_latency_s": 48.93694782999955, + "request_output_throughput_token_per_s": 3.0856031423241577, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31779891730571286, + "ttft_s": 4.80370895499982, + "end_to_end_latency_s": 49.89465222800027, + "request_output_throughput_token_per_s": 
3.0263764403043707, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3002539625950854, + "ttft_s": 12.9646128280001, + "end_to_end_latency_s": 48.94180939200123, + "request_output_throughput_token_per_s": 3.085296638515342, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29551940976833113, + "ttft_s": 21.55844547400011, + "end_to_end_latency_s": 48.465390434001165, + "request_output_throughput_token_per_s": 3.1156253699354313, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32825155981652043, + "ttft_s": 2.197424919999321, + "end_to_end_latency_s": 51.86420913599977, + "request_output_throughput_token_per_s": 2.9114490033781024, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30407732601211085, + "ttft_s": 5.383298340000692, + "end_to_end_latency_s": 50.47704850000082, + "request_output_throughput_token_per_s": 2.9914585833994938, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30164545169325585, + "ttft_s": 13.188807171998633, + "end_to_end_latency_s": 49.168497767999725, + "request_output_throughput_token_per_s": 3.071072065542648, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3227075616625143, + "ttft_s": 1.9641782990001957, + "end_to_end_latency_s": 51.6334515669987, + "request_output_throughput_token_per_s": 2.9244607016841577, + 
"number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30954077210555614, + "ttft_s": 9.310149899998578, + "end_to_end_latency_s": 49.8362759419997, + "request_output_throughput_token_per_s": 3.02992142060808, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30554524047246745, + "ttft_s": 9.277881477000847, + "end_to_end_latency_s": 49.804101101000924, + "request_output_throughput_token_per_s": 3.0318788345115886, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31369420905086987, + "ttft_s": 8.723236178999286, + "end_to_end_latency_s": 49.25024915299946, + "request_output_throughput_token_per_s": 3.0659743371227948, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29586556210973336, + "ttft_s": 21.61031607699988, + "end_to_end_latency_s": 48.522196069998245, + "request_output_throughput_token_per_s": 3.111977862299699, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29288228690427265, + "ttft_s": 12.928593737999108, + "end_to_end_latency_s": 48.91157664599996, + "request_output_throughput_token_per_s": 3.0872036919371917, + "number_total_tokens": 717, + "number_output_tokens": 167, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2980627479815901, + "ttft_s": 17.118262825000784, + "end_to_end_latency_s": 48.584449875001155, + "request_output_throughput_token_per_s": 3.107990321769521, + "number_total_tokens": 713, + 
"number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31311759376098647, + "ttft_s": 9.256274520999796, + "end_to_end_latency_s": 49.78592982900045, + "request_output_throughput_token_per_s": 3.032985434210813, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3021014836500058, + "ttft_s": 25.964891211999202, + "end_to_end_latency_s": 48.33646226599922, + "request_output_throughput_token_per_s": 3.1239356982527093, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31797902328751204, + "ttft_s": 5.775352230000863, + "end_to_end_latency_s": 50.876851168999565, + "request_output_throughput_token_per_s": 2.9679509743717745, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3076177878641335, + "ttft_s": 9.3024896120005, + "end_to_end_latency_s": 49.834358978001546, + "request_output_throughput_token_per_s": 3.0300379717266184, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29331843991563933, + "ttft_s": 17.220557791999454, + "end_to_end_latency_s": 48.691061707999324, + "request_output_throughput_token_per_s": 3.1011852012089647, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.311312692660277, + "ttft_s": 17.09438554999906, + "end_to_end_latency_s": 48.56510311799866, + "request_output_throughput_token_per_s": 3.1092284439943474, + "number_total_tokens": 706, + "number_output_tokens": 156, + 
"number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3308139242948388, + "ttft_s": 6.5034880340008385, + "end_to_end_latency_s": 51.60726538199924, + "request_output_throughput_token_per_s": 2.9259446103623468, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3120944698489982, + "ttft_s": 9.089277627999763, + "end_to_end_latency_s": 49.62325545100066, + "request_output_throughput_token_per_s": 3.04292813173254, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3199325175350759, + "ttft_s": 5.124818557000253, + "end_to_end_latency_s": 50.2296757790009, + "request_output_throughput_token_per_s": 3.0061910147372943, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3151651248064398, + "ttft_s": 17.377714369999012, + "end_to_end_latency_s": 48.85081779900065, + "request_output_throughput_token_per_s": 3.0910434421240955, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30334247312504203, + "ttft_s": 21.63773741699879, + "end_to_end_latency_s": 48.53503701399859, + "request_output_throughput_token_per_s": 3.11115452444073, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2964722387750385, + "ttft_s": 29.618724122001368, + "end_to_end_latency_s": 47.43584850800107, + "request_output_throughput_token_per_s": 3.183246526612265, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + 
"error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31363547467746333, + "ttft_s": 21.692604788999233, + "end_to_end_latency_s": 48.61374323999917, + "request_output_throughput_token_per_s": 3.1061175284226596, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30412545428201226, + "ttft_s": 29.62287763599852, + "end_to_end_latency_s": 47.44379851999838, + "request_output_throughput_token_per_s": 3.182713119742107, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2992076418087082, + "ttft_s": 26.120374948999597, + "end_to_end_latency_s": 48.47216047099937, + "request_output_throughput_token_per_s": 3.1151902150171846, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32306408853903074, + "ttft_s": 4.659045895999952, + "end_to_end_latency_s": 49.752082066001094, + "request_output_throughput_token_per_s": 3.03504886086342, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29684270609151076, + "ttft_s": 17.22042556999986, + "end_to_end_latency_s": 48.68242439700043, + "request_output_throughput_token_per_s": 3.101735418281754, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2860012586807317, + "ttft_s": 29.661901624000166, + "end_to_end_latency_s": 47.47660138900028, + "request_output_throughput_token_per_s": 3.1805140970976233, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", 
+ "inter_token_latency_s": 0.2914041356136317, + "ttft_s": 29.667518576001385, + "end_to_end_latency_s": 47.49912371700157, + "request_output_throughput_token_per_s": 3.1790060149246906, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2964865820312411, + "ttft_s": 30.768073249000736, + "end_to_end_latency_s": 47.43816233800135, + "request_output_throughput_token_per_s": 3.1830912615061027, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2889595612485154, + "ttft_s": 29.859827263000625, + "end_to_end_latency_s": 47.678555259999484, + "request_output_throughput_token_per_s": 3.1670422724969463, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3060573988902756, + "ttft_s": 30.75070710699947, + "end_to_end_latency_s": 47.43913851099933, + "request_output_throughput_token_per_s": 3.183025761839854, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29756340913664636, + "ttft_s": 30.08705989700138, + "end_to_end_latency_s": 47.90797527200084, + "request_output_throughput_token_per_s": 3.1518760528427068, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3190816196564604, + "ttft_s": 0.592172317999939, + "end_to_end_latency_s": 52.01058554699921, + "request_output_throughput_token_per_s": 2.9032551433890186, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.282911764709276, + "ttft_s": 21.757014322000032, + "end_to_end_latency_s": 48.66102871400108, + "request_output_throughput_token_per_s": 3.1030992149278847, + "number_total_tokens": 722, + "number_output_tokens": 172, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.305859705165719, + "ttft_s": 25.6415835300013, + "end_to_end_latency_s": 48.020728522000354, + "request_output_throughput_token_per_s": 3.1444754098393246, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.28990934811517943, + "ttft_s": 30.008654045001094, + "end_to_end_latency_s": 47.83525094200013, + "request_output_throughput_token_per_s": 3.1566678762297355, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30624715585994416, + "ttft_s": 25.712654797998766, + "end_to_end_latency_s": 48.08101211199937, + "request_output_throughput_token_per_s": 3.140532891617637, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2950339396666426, + "ttft_s": 17.2141808209999, + "end_to_end_latency_s": 48.680814246999944, + "request_output_throughput_token_per_s": 3.1018380102240317, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3138128350062475, + "ttft_s": 5.424358095000571, + "end_to_end_latency_s": 50.524135851999745, + "request_output_throughput_token_per_s": 2.9886706116523003, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30641753833333496, + "ttft_s": 
9.1090891799995, + "end_to_end_latency_s": 49.639868141999614, + "request_output_throughput_token_per_s": 3.041909772363818, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3068591472420403, + "ttft_s": 25.804715854999813, + "end_to_end_latency_s": 48.177093869999226, + "request_output_throughput_token_per_s": 3.134269584783538, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29015550628052617, + "ttft_s": 29.747978354000224, + "end_to_end_latency_s": 47.58572545400057, + "request_output_throughput_token_per_s": 3.1732205101289535, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.60507138681174, + "ttft_s": 6.7401579699999274, + "end_to_end_latency_s": 41.86243495900089, + "request_output_throughput_token_per_s": 1.4810414172209851, + "number_total_tokens": 619, + "number_output_tokens": 69, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31901350680742346, + "ttft_s": 10.869214331998592, + "end_to_end_latency_s": 51.36141511099959, + "request_output_throughput_token_per_s": 2.9399501488357895, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3343704402849181, + "ttft_s": 3.2323390140008996, + "end_to_end_latency_s": 52.83081468800083, + "request_output_throughput_token_per_s": 2.8581804178442054, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30890257825627715, + "ttft_s": 18.035427056000117, + 
"end_to_end_latency_s": 49.42461337200075, + "request_output_throughput_token_per_s": 3.0551579405078795, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29616643208635335, + "ttft_s": 25.685127025999464, + "end_to_end_latency_s": 47.97924712200074, + "request_output_throughput_token_per_s": 3.147194027785388, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32183826852859426, + "ttft_s": 14.609609307999563, + "end_to_end_latency_s": 50.528808912000386, + "request_output_throughput_token_per_s": 2.988394210181711, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.28846489506678624, + "ttft_s": 25.30149243500091, + "end_to_end_latency_s": 47.59693446700112, + "request_output_throughput_token_per_s": 3.172473221036705, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32034064121382827, + "ttft_s": 10.43481010100004, + "end_to_end_latency_s": 50.934378514000855, + "request_output_throughput_token_per_s": 2.9645988506268526, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29434695150313184, + "ttft_s": 25.094039373998385, + "end_to_end_latency_s": 47.39008346399896, + "request_output_throughput_token_per_s": 3.186320617365252, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29574008615378633, + "ttft_s": 28.379571317998852, + "end_to_end_latency_s": 
46.13571359399975, + "request_output_throughput_token_per_s": 3.2729525184940136, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3016903024321616, + "ttft_s": 22.032162026998776, + "end_to_end_latency_s": 48.874042645999, + "request_output_throughput_token_per_s": 3.0895745844826568, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.28912453267480626, + "ttft_s": 29.37028614799965, + "end_to_end_latency_s": 47.1276591299993, + "request_output_throughput_token_per_s": 3.2040632356356595, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30983594438507484, + "ttft_s": 18.48939624599916, + "end_to_end_latency_s": 49.883811318999506, + "request_output_throughput_token_per_s": 3.0270341420862494, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31451934902503353, + "ttft_s": 14.399673586000063, + "end_to_end_latency_s": 50.32329436600048, + "request_output_throughput_token_per_s": 3.0005984684106632, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32120472413374096, + "ttft_s": 14.505428487000245, + "end_to_end_latency_s": 50.429362041000786, + "request_output_throughput_token_per_s": 2.9942873335822067, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29899538917726215, + "ttft_s": 24.941454243000408, + "end_to_end_latency_s": 47.24150818400085, + 
"request_output_throughput_token_per_s": 3.196341645399432, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.28881243885096536, + "ttft_s": 28.739500867999595, + "end_to_end_latency_s": 46.499098767999385, + "request_output_throughput_token_per_s": 3.247374766409838, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3145040593913589, + "ttft_s": 14.710124543000347, + "end_to_end_latency_s": 50.6354435939993, + "request_output_throughput_token_per_s": 2.9821008622089904, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30009817231090957, + "ttft_s": 17.819682969000496, + "end_to_end_latency_s": 49.21630306999941, + "request_output_throughput_token_per_s": 3.068089039220105, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2859480394416576, + "ttft_s": 28.848643482000625, + "end_to_end_latency_s": 46.60971945200072, + "request_output_throughput_token_per_s": 3.2396676439020777, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.28413951758016026, + "ttft_s": 28.26928175899957, + "end_to_end_latency_s": 46.030807053999524, + "request_output_throughput_token_per_s": 3.2804117430063595, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3100760666772853, + "ttft_s": 22.14398147599968, + "end_to_end_latency_s": 48.992231616000936, + 
"request_output_throughput_token_per_s": 3.0821212877896986, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30922728456527515, + "ttft_s": 18.386673994000375, + "end_to_end_latency_s": 49.78577689700069, + "request_output_throughput_token_per_s": 3.0329947509385335, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3198547150645379, + "ttft_s": 18.177303447999293, + "end_to_end_latency_s": 49.57771423799932, + "request_output_throughput_token_per_s": 3.0457233117912605, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3200508110131451, + "ttft_s": 21.798390750998806, + "end_to_end_latency_s": 48.64794144699954, + "request_output_throughput_token_per_s": 3.103934010537937, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32954908620628204, + "ttft_s": 3.1179533730010007, + "end_to_end_latency_s": 52.72812077600065, + "request_output_throughput_token_per_s": 2.8637470438492865, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3198741147707146, + "ttft_s": 14.290257328000735, + "end_to_end_latency_s": 50.22043589100031, + "request_output_throughput_token_per_s": 3.0067441136459703, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3268634325964028, + "ttft_s": 3.0143725689995335, + "end_to_end_latency_s": 52.625216239999645, + 
"request_output_throughput_token_per_s": 2.8693468794761388, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31321134487492375, + "ttft_s": 14.18182478199924, + "end_to_end_latency_s": 50.11401146099888, + "request_output_throughput_token_per_s": 3.013129374357018, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.33619492761295516, + "ttft_s": 7.046742147000259, + "end_to_end_latency_s": 52.11041811200084, + "request_output_throughput_token_per_s": 2.8976931191658437, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31572386213579584, + "ttft_s": 10.638382002998696, + "end_to_end_latency_s": 51.147498798000015, + "request_output_throughput_token_per_s": 2.9522460247050133, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3125464687499516, + "ttft_s": 14.073357895998925, + "end_to_end_latency_s": 50.00766887200007, + "request_output_throughput_token_per_s": 3.019536871164711, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3089325193208047, + "ttft_s": 17.71425323099902, + "end_to_end_latency_s": 49.120484674998806, + "request_output_throughput_token_per_s": 3.0740739021424095, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.293943901713347, + "ttft_s": 21.35148137000033, + "end_to_end_latency_s": 48.20699001200046, + 
"request_output_throughput_token_per_s": 3.13232582997633, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3078518201483436, + "ttft_s": 25.405467513000985, + "end_to_end_latency_s": 47.71726144400054, + "request_output_throughput_token_per_s": 3.164473304429023, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3123472025519094, + "ttft_s": 25.789564832000906, + "end_to_end_latency_s": 48.10180164800113, + "request_output_throughput_token_per_s": 3.1391755573935933, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32429103103797524, + "ttft_s": 10.742698305000886, + "end_to_end_latency_s": 51.23831159800102, + "request_output_throughput_token_per_s": 2.947013578134589, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.28839653472774046, + "ttft_s": 21.903567605000717, + "end_to_end_latency_s": 48.73933896500057, + "request_output_throughput_token_per_s": 3.098113417345118, + "number_total_tokens": 719, + "number_output_tokens": 169, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32090034914099475, + "ttft_s": 7.259173211999951, + "end_to_end_latency_s": 52.30704454499937, + "request_output_throughput_token_per_s": 2.8868004551489386, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31571805508701234, + "ttft_s": 10.330284455001674, + "end_to_end_latency_s": 50.83083403100136, + 
"request_output_throughput_token_per_s": 2.9706378594517293, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29455503914639447, + "ttft_s": 21.46492925899838, + "end_to_end_latency_s": 48.3072867840001, + "request_output_throughput_token_per_s": 3.125822418368834, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32510689716553454, + "ttft_s": 10.534590396000567, + "end_to_end_latency_s": 51.041987668999354, + "request_output_throughput_token_per_s": 2.9583487418087113, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29391360290625246, + "ttft_s": 29.264322053000797, + "end_to_end_latency_s": 47.026368135000666, + "request_output_throughput_token_per_s": 3.210964528804726, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30726677222783916, + "ttft_s": 21.689553449001323, + "end_to_end_latency_s": 48.54833690400119, + "request_output_throughput_token_per_s": 3.110302218973748, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30271552318118894, + "ttft_s": 21.583624007000253, + "end_to_end_latency_s": 48.43468550199941, + "request_output_throughput_token_per_s": 3.117600505401581, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32682590904938413, + "ttft_s": 3.331158566999875, + "end_to_end_latency_s": 52.946005759999025, + 
"request_output_throughput_token_per_s": 2.8519620664960765, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.28005274894520904, + "ttft_s": 29.30444059599904, + "end_to_end_latency_s": 45.92883506099861, + "request_output_throughput_token_per_s": 3.2876949698257136, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29002561583125724, + "ttft_s": 28.632165856000938, + "end_to_end_latency_s": 46.4043698280002, + "request_output_throughput_token_per_s": 3.2540038914371214, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2875432924969833, + "ttft_s": 28.521764694998637, + "end_to_end_latency_s": 46.29471629699947, + "request_output_throughput_token_per_s": 3.2617113156342397, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.31852542793217536, + "ttft_s": 11.085836830001426, + "end_to_end_latency_s": 51.601328857001135, + "request_output_throughput_token_per_s": 2.926281228501981, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29681262368751504, + "ttft_s": 25.198041132998696, + "end_to_end_latency_s": 47.49040946599962, + "request_output_throughput_token_per_s": 3.179589346520738, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32324741915292443, + "ttft_s": 14.808791514000404, + "end_to_end_latency_s": 50.75006270900121, + 
"request_output_throughput_token_per_s": 2.975365781631204, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32306155068748693, + "ttft_s": 6.641968054000245, + "end_to_end_latency_s": 51.69004253999992, + "request_output_throughput_token_per_s": 2.921258961687831, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.30269909870551104, + "ttft_s": 17.926841717999196, + "end_to_end_latency_s": 49.340302224998595, + "request_output_throughput_token_per_s": 3.0603784977120556, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3010895442545227, + "ttft_s": 18.283178341000166, + "end_to_end_latency_s": 49.67995775500094, + "request_output_throughput_token_per_s": 3.039455080551067, + "number_total_tokens": 715, + "number_output_tokens": 165, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3197365805961973, + "ttft_s": 10.966653977000533, + "end_to_end_latency_s": 51.47791090999999, + "request_output_throughput_token_per_s": 2.9332969681694494, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.33573901638596276, + "ttft_s": 0.588565160998769, + "end_to_end_latency_s": 53.0469622699984, + "request_output_throughput_token_per_s": 2.8465343450099985, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32969947645912306, + "ttft_s": 7.357727601000079, + "end_to_end_latency_s": 52.422445229000004, + 
"request_output_throughput_token_per_s": 2.880445567549891, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.29954450498038554, + "ttft_s": 29.198856207000063, + "end_to_end_latency_s": 45.83054709499993, + "request_output_throughput_token_per_s": 3.2947457442958163, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3355559768580861, + "ttft_s": 6.945683676000044, + "end_to_end_latency_s": 52.01137436699901, + "request_output_throughput_token_per_s": 2.9032111117565247, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.2975214189627086, + "ttft_s": 25.58051069999965, + "end_to_end_latency_s": 47.901211774998956, + "request_output_throughput_token_per_s": 3.152321087601615, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3432434865228857, + "ttft_s": 2.915465285001119, + "end_to_end_latency_s": 52.516475111000545, + "request_output_throughput_token_per_s": 2.8752881772975325, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.3224510642298725, + "ttft_s": 6.843737492999935, + "end_to_end_latency_s": 51.91481676299918, + "request_output_throughput_token_per_s": 2.908610863240511, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.32847665386165, + "ttft_s": 7.154329020999285, + "end_to_end_latency_s": 52.22799493400089, + "request_output_throughput_token_per_s": 
2.8911697680681527, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_summary.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..6a7c41e --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_64/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 64, + "results_inter_token_latency_s_quantiles_p25": 0.2969424125412702, + "results_inter_token_latency_s_quantiles_p50": 0.30745351493844253, + "results_inter_token_latency_s_quantiles_p75": 0.31924535989139463, + "results_inter_token_latency_s_quantiles_p90": 0.32562260073068944, + "results_inter_token_latency_s_quantiles_p95": 0.33042386755233827, + "results_inter_token_latency_s_quantiles_p99": 0.3413403756172045, + "results_inter_token_latency_s_mean": 0.30833836372349244, + "results_inter_token_latency_s_min": 0.04786341221152048, + "results_inter_token_latency_s_max": 0.60507138681174, + "results_inter_token_latency_s_stddev": 0.037472401936725075, + "results_ttft_s_quantiles_p25": 9.219478185749722, + "results_ttft_s_quantiles_p50": 17.220491680999658, + "results_ttft_s_quantiles_p75": 25.32748620450093, + "results_ttft_s_quantiles_p90": 29.218495960800283, + "results_ttft_s_quantiles_p95": 29.665552642800957, + "results_ttft_s_quantiles_p99": 30.57152236029999, + "results_ttft_s_mean": 16.630715770742114, + "results_ttft_s_min": 0.588565160998769, + "results_ttft_s_max": 30.768073249000736, + "results_ttft_s_stddev": 8.97254874585545, + 
"results_end_to_end_latency_s_quantiles_p25": 48.09660426400069, + "results_end_to_end_latency_s_quantiles_p50": 48.924262237999756, + "results_end_to_end_latency_s_quantiles_p75": 50.441283655750794, + "results_end_to_end_latency_s_quantiles_p90": 51.879391424099595, + "results_end_to_end_latency_s_quantiles_p95": 52.38205498959978, + "results_end_to_end_latency_s_quantiles_p99": 52.91490417055951, + "results_end_to_end_latency_s_mean": 48.92121551664843, + "results_end_to_end_latency_s_min": 7.466937597000651, + "results_end_to_end_latency_s_max": 53.0469622699984, + "results_end_to_end_latency_s_stddev": 4.119586037076952, + "results_request_output_throughput_token_per_s_quantiles_p25": 2.9907615904626956, + "results_request_output_throughput_token_per_s_quantiles_p50": 3.08544989041975, + "results_request_output_throughput_token_per_s_quantiles_p75": 3.1354960779360517, + "results_request_output_throughput_token_per_s_quantiles_p90": 3.184168753838161, + "results_request_output_throughput_token_per_s_quantiles_p95": 3.251683697677572, + "results_request_output_throughput_token_per_s_quantiles_p99": 3.2928420351888885, + "results_request_output_throughput_token_per_s_mean": 3.187798465074813, + "results_request_output_throughput_token_per_s_min": 1.4810414172209851, + "results_request_output_throughput_token_per_s_max": 20.222480506687813, + "results_request_output_throughput_token_per_s_stddev": 1.5275954750845664, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 157.0, 
+ "results_number_output_tokens_quantiles_p50": 160.0, + "results_number_output_tokens_quantiles_p75": 163.0, + "results_number_output_tokens_quantiles_p90": 164.3, + "results_number_output_tokens_quantiles_p95": 165.0, + "results_number_output_tokens_quantiles_p99": 168.46, + "results_number_output_tokens_mean": 159.40625, + "results_number_output_tokens_min": "69", + "results_number_output_tokens_max": "172", + "results_number_output_tokens_stddev": 8.798208031578314, + "results_num_requests_started": 128, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 135.38240586364088, + "results_num_completed_requests": 128, + "results_num_completed_requests_per_min": 50.95750230507558, + "timestamp": 1718209612 +} \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json new file mode 100644 index 0000000..86a58de --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_individual_responses.json @@ -0,0 +1,1146 @@ +[ + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0807645126429492, + "ttft_s": 4.292722194999442, + "end_to_end_latency_s": 12.438038329999472, + "request_output_throughput_token_per_s": 12.140178056518854, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08052833228847679, + "ttft_s": 4.416717618001712, + "end_to_end_latency_s": 12.562660051000421, + "request_output_throughput_token_per_s": 12.019747361385871, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + 
"error_msg": "", + "inter_token_latency_s": 0.07905064652867663, + "ttft_s": 4.266398198000388, + "end_to_end_latency_s": 12.411156366000796, + "request_output_throughput_token_per_s": 12.166473094614327, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07520565717078814, + "ttft_s": 4.189252332000251, + "end_to_end_latency_s": 12.333946168999319, + "request_output_throughput_token_per_s": 12.242634914325313, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08309716781049145, + "ttft_s": 0.5885932310011412, + "end_to_end_latency_s": 12.71410589700099, + "request_output_throughput_token_per_s": 11.876572463944786, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0775920611374886, + "ttft_s": 4.267829985999924, + "end_to_end_latency_s": 12.415019064999797, + "request_output_throughput_token_per_s": 12.162687726005718, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07752183372049218, + "ttft_s": 4.333442275001289, + "end_to_end_latency_s": 12.48122659100045, + "request_output_throughput_token_per_s": 12.098169911351349, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07809064788056756, + "ttft_s": 4.267655943998761, + "end_to_end_latency_s": 12.416618674998972, + "request_output_throughput_token_per_s": 12.161120829460643, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + 
"inter_token_latency_s": 0.07915242396125143, + "ttft_s": 2.9476286660010373, + "end_to_end_latency_s": 12.268827197000064, + "request_output_throughput_token_per_s": 12.307614866148091, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08030923327046738, + "ttft_s": 0.5869000699985918, + "end_to_end_latency_s": 12.76938537299975, + "request_output_throughput_token_per_s": 11.8251580314337, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07815772859880837, + "ttft_s": 3.3405255469988333, + "end_to_end_latency_s": 12.661737567999808, + "request_output_throughput_token_per_s": 11.925693388372263, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07633469727781271, + "ttft_s": 3.0441719600003125, + "end_to_end_latency_s": 12.36676825900031, + "request_output_throughput_token_per_s": 12.210142281117376, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07792209859995865, + "ttft_s": 3.1442330489990127, + "end_to_end_latency_s": 12.467729488998884, + "request_output_throughput_token_per_s": 12.111266941846745, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07535136912499638, + "ttft_s": 3.8804980870008876, + "end_to_end_latency_s": 12.058837864000452, + "request_output_throughput_token_per_s": 12.521936334411134, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 
0.07660742671345272, + "ttft_s": 3.2417503239994403, + "end_to_end_latency_s": 12.56383613700018, + "request_output_throughput_token_per_s": 12.018622206899755, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07604467310615064, + "ttft_s": 3.9887671300002694, + "end_to_end_latency_s": 12.167599283000527, + "request_output_throughput_token_per_s": 12.410007634863813, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0789054444379206, + "ttft_s": 3.844192013000793, + "end_to_end_latency_s": 12.073489091000738, + "request_output_throughput_token_per_s": 12.506740914898531, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07471728287728072, + "ttft_s": 3.9484945069998503, + "end_to_end_latency_s": 12.17910909100101, + "request_output_throughput_token_per_s": 12.39827961731388, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07930633573709528, + "ttft_s": 2.380700761999833, + "end_to_end_latency_s": 12.371981055001015, + "request_output_throughput_token_per_s": 12.204997674076022, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07955311919102512, + "ttft_s": 2.4987748809999175, + "end_to_end_latency_s": 12.490020523999192, + "request_output_throughput_token_per_s": 12.089651871256587, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08016328259496801, + 
"ttft_s": 2.6727229040006932, + "end_to_end_latency_s": 12.665984773000673, + "request_output_throughput_token_per_s": 11.921694420624737, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08302073082467797, + "ttft_s": 2.7937217190010415, + "end_to_end_latency_s": 12.785501169000781, + "request_output_throughput_token_per_s": 11.81025272330416, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08075835104611877, + "ttft_s": 4.044984458998442, + "end_to_end_latency_s": 12.275577482998415, + "request_output_throughput_token_per_s": 12.300846962933834, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07955005505561002, + "ttft_s": 0.590160970999932, + "end_to_end_latency_s": 12.88769611600037, + "request_output_throughput_token_per_s": 11.716601527601977, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08135283147465641, + "ttft_s": 0.5815162910002982, + "end_to_end_latency_s": 12.853970735000985, + "request_output_throughput_token_per_s": 11.747342756027244, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07670885142500765, + "ttft_s": 4.039859233000243, + "end_to_end_latency_s": 12.273631969001144, + "request_output_throughput_token_per_s": 12.302796790825456, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07640350446293011, + "ttft_s": 2.380776492000223, + 
"end_to_end_latency_s": 12.377598057000796, + "request_output_throughput_token_per_s": 12.199458998799374, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07731309518407706, + "ttft_s": 2.606205203999707, + "end_to_end_latency_s": 12.602437231000295, + "request_output_throughput_token_per_s": 11.981809330385744, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07686770773252183, + "ttft_s": 3.8339429320003546, + "end_to_end_latency_s": 12.06846897700052, + "request_output_throughput_token_per_s": 12.511943336621089, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07560647197515322, + "ttft_s": 3.9387323210012255, + "end_to_end_latency_s": 12.174546612000995, + "request_output_throughput_token_per_s": 12.40292594150098, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0794593058687724, + "ttft_s": 2.7164430619995983, + "end_to_end_latency_s": 12.713850743999501, + "request_output_throughput_token_per_s": 11.876810813692051, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0752065680181297, + "ttft_s": 2.485262178000994, + "end_to_end_latency_s": 12.484519984000144, + "request_output_throughput_token_per_s": 12.094978436777538, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07719719551267742, + "ttft_s": 3.970576098001402, + "end_to_end_latency_s": 
12.19735660400147, + "request_output_throughput_token_per_s": 12.379731519078721, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07777493693170101, + "ttft_s": 3.1416475149999314, + "end_to_end_latency_s": 12.521974561999741, + "request_output_throughput_token_per_s": 12.058801050294221, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07940489286163298, + "ttft_s": 3.24532870099938, + "end_to_end_latency_s": 12.625569166000787, + "request_output_throughput_token_per_s": 11.959856859889193, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07704145924842246, + "ttft_s": 3.8682252500002505, + "end_to_end_latency_s": 12.095734654998523, + "request_output_throughput_token_per_s": 12.48373945914891, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08066785815580343, + "ttft_s": 3.041533135999998, + "end_to_end_latency_s": 12.423037307000413, + "request_output_throughput_token_per_s": 12.15483752229506, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07652232790058418, + "ttft_s": 2.9379526720003923, + "end_to_end_latency_s": 12.320367275000535, + "request_output_throughput_token_per_s": 12.256128135595166, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07916804769742336, + "ttft_s": 0.591940964000969, + "end_to_end_latency_s": 12.825435320000906, + 
"request_output_throughput_token_per_s": 11.773479514143254, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07857076038270648, + "ttft_s": 3.3448137720006343, + "end_to_end_latency_s": 12.728677204000633, + "request_output_throughput_token_per_s": 11.862976614140281, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07442334930253568, + "ttft_s": 3.8369607900003757, + "end_to_end_latency_s": 12.056777567999234, + "request_output_throughput_token_per_s": 12.524076118048328, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07944327275165015, + "ttft_s": 3.9354649289998633, + "end_to_end_latency_s": 12.155026432999875, + "request_output_throughput_token_per_s": 12.422844230930481, + "number_total_tokens": 703, + "number_output_tokens": 153, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07671760771945185, + "ttft_s": 3.216805595999176, + "end_to_end_latency_s": 12.582032288999471, + "request_output_throughput_token_per_s": 12.001240859318091, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07614541413670889, + "ttft_s": 2.895454420000533, + "end_to_end_latency_s": 12.259592850001354, + "request_output_throughput_token_per_s": 12.316885384981061, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08097770875948196, + "ttft_s": 0.585287548999986, + "end_to_end_latency_s": 12.794837557999927, + 
"request_output_throughput_token_per_s": 11.801634785553631, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07799380564998729, + "ttft_s": 3.11426482600109, + "end_to_end_latency_s": 12.480058586001178, + "request_output_throughput_token_per_s": 12.099302175502283, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07836924871522179, + "ttft_s": 3.0146997029987688, + "end_to_end_latency_s": 12.382608850999532, + "request_output_throughput_token_per_s": 12.194522318922413, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0787996072423135, + "ttft_s": 3.3224797359998774, + "end_to_end_latency_s": 12.686957320000147, + "request_output_throughput_token_per_s": 11.901986913911857, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08029049865824243, + "ttft_s": 3.3231948259999626, + "end_to_end_latency_s": 12.68611349999992, + "request_output_throughput_token_per_s": 11.902778577536845, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07559768209936711, + "ttft_s": 3.9513667019982677, + "end_to_end_latency_s": 12.171432961999017, + "request_output_throughput_token_per_s": 12.406098811162494, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07715166893878207, + "ttft_s": 3.210729177999383, + "end_to_end_latency_s": 12.576046461999795, + 
"request_output_throughput_token_per_s": 12.006953095813273, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07947767026089197, + "ttft_s": 0.5815713810006855, + "end_to_end_latency_s": 12.796099605000563, + "request_output_throughput_token_per_s": 11.800470820107636, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07683227708697792, + "ttft_s": 3.007223177999549, + "end_to_end_latency_s": 12.370183602999532, + "request_output_throughput_token_per_s": 12.20677112370308, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07718588241516285, + "ttft_s": 2.9088439700008166, + "end_to_end_latency_s": 12.272755927000617, + "request_output_throughput_token_per_s": 12.303674977173888, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07346859547558823, + "ttft_s": 3.8298705900015193, + "end_to_end_latency_s": 12.049056849000408, + "request_output_throughput_token_per_s": 12.53210121691201, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0794309614394496, + "ttft_s": 3.1082363780005835, + "end_to_end_latency_s": 12.470899129000827, + "request_output_throughput_token_per_s": 12.108188707007702, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07351610570731282, + "ttft_s": 3.84920746699936, + "end_to_end_latency_s": 12.056841858000553, + 
"request_output_throughput_token_per_s": 12.524009336640756, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08028310102537621, + "ttft_s": 3.325809190000655, + "end_to_end_latency_s": 12.685066136000387, + "request_output_throughput_token_per_s": 11.903761350637344, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07714050624527549, + "ttft_s": 2.9067434139997204, + "end_to_end_latency_s": 12.265976489999957, + "request_output_throughput_token_per_s": 12.310475250226126, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07913460204402344, + "ttft_s": 3.2242949920000683, + "end_to_end_latency_s": 12.582622936999542, + "request_output_throughput_token_per_s": 12.000677502302038, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07746848403731289, + "ttft_s": 3.1136195870003576, + "end_to_end_latency_s": 12.472614050999255, + "request_output_throughput_token_per_s": 12.106523891669886, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0789653946233545, + "ttft_s": 3.9529013819992542, + "end_to_end_latency_s": 12.16085377699892, + "request_output_throughput_token_per_s": 12.416891344060227, + "number_total_tokens": 704, + "number_output_tokens": 154, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08257123267741859, + "ttft_s": 0.5925735629989504, + "end_to_end_latency_s": 12.798756139000034, + 
"request_output_throughput_token_per_s": 11.798021492094593, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07634836160496765, + "ttft_s": 3.0103270979998342, + "end_to_end_latency_s": 12.372100067001156, + "request_output_throughput_token_per_s": 12.204880269498219, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07864194274530203, + "ttft_s": 3.326735541000744, + "end_to_end_latency_s": 12.661554356000124, + "request_output_throughput_token_per_s": 11.925865952503953, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07663099513658425, + "ttft_s": 3.0030537040001946, + "end_to_end_latency_s": 12.337784147999628, + "request_output_throughput_token_per_s": 12.238826533894436, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07772083744875766, + "ttft_s": 3.9295386740013782, + "end_to_end_latency_s": 12.124657595000826, + "request_output_throughput_token_per_s": 12.453959942114944, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07903688207231521, + "ttft_s": 3.8191615839987207, + "end_to_end_latency_s": 12.01381923900044, + "request_output_throughput_token_per_s": 12.568858994466053, + "number_total_tokens": 702, + "number_output_tokens": 152, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07725132224855227, + "ttft_s": 3.1035452880005323, + "end_to_end_latency_s": 12.438088119000895, + 
"request_output_throughput_token_per_s": 12.140129460035476, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07658908673173999, + "ttft_s": 3.226304126999821, + "end_to_end_latency_s": 12.562244857001133, + "request_output_throughput_token_per_s": 12.020144625333057, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07600474931056643, + "ttft_s": 2.900021907998962, + "end_to_end_latency_s": 12.236960779999208, + "request_output_throughput_token_per_s": 12.339665274305943, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07876494729005594, + "ttft_s": 0.5831043220005085, + "end_to_end_latency_s": 12.760159065001062, + "request_output_throughput_token_per_s": 11.83370828144041, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0774141437785239, + "ttft_s": 2.893235961000755, + "end_to_end_latency_s": 12.23167917200044, + "request_output_throughput_token_per_s": 12.34499351042941, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07872785444299012, + "ttft_s": 3.10087955400013, + "end_to_end_latency_s": 12.439185082999757, + "request_output_throughput_token_per_s": 12.139058868604419, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07435147401842733, + "ttft_s": 3.9392345569995086, + "end_to_end_latency_s": 12.121451442999387, + 
"request_output_throughput_token_per_s": 12.457254043385078, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08135841275156529, + "ttft_s": 0.5906123670010857, + "end_to_end_latency_s": 12.773504486000093, + "request_output_throughput_token_per_s": 11.821344734759183, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0760530878607732, + "ttft_s": 3.8343936370001757, + "end_to_end_latency_s": 12.016578653001488, + "request_output_throughput_token_per_s": 12.565972758168014, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07804901868987628, + "ttft_s": 2.991450416999214, + "end_to_end_latency_s": 12.332560871998794, + "request_output_throughput_token_per_s": 12.244010110085656, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07747154811728744, + "ttft_s": 3.208974846000274, + "end_to_end_latency_s": 12.550747908999256, + "request_output_throughput_token_per_s": 12.031155521156517, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07815198669750653, + "ttft_s": 3.321813127000496, + "end_to_end_latency_s": 12.661107019001065, + "request_output_throughput_token_per_s": 11.926287312269602, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07414088403005603, + "ttft_s": 3.0098486520000733, + "end_to_end_latency_s": 12.307570041999497, + 
"request_output_throughput_token_per_s": 12.268871880047284, + "number_total_tokens": 716, + "number_output_tokens": 166, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0762398590127739, + "ttft_s": 3.808354765000331, + "end_to_end_latency_s": 11.969845517000067, + "request_output_throughput_token_per_s": 12.6150333173092, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07455277058022743, + "ttft_s": 3.915208740001617, + "end_to_end_latency_s": 12.077739215001202, + "request_output_throughput_token_per_s": 12.502339826351763, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07781335065845281, + "ttft_s": 3.228851609999765, + "end_to_end_latency_s": 12.528260822000448, + "request_output_throughput_token_per_s": 12.052750349420736, + "number_total_tokens": 711, + "number_output_tokens": 161, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08004356685537545, + "ttft_s": 0.5848513230012031, + "end_to_end_latency_s": 12.727277514999514, + "request_output_throughput_token_per_s": 11.864281251197795, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0789274011313978, + "ttft_s": 3.3278398849997757, + "end_to_end_latency_s": 12.628572034000172, + "request_output_throughput_token_per_s": 11.957013001427201, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07481322765640354, + "ttft_s": 2.8962130390009406, + "end_to_end_latency_s": 12.194742551000672, + 
"request_output_throughput_token_per_s": 12.382385226132493, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07862469748737037, + "ttft_s": 3.12259909200111, + "end_to_end_latency_s": 12.42362216500078, + "request_output_throughput_token_per_s": 12.154265317677627, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07870585498749737, + "ttft_s": 2.5040930089999165, + "end_to_end_latency_s": 12.514420986000914, + "request_output_throughput_token_per_s": 12.066079618778534, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07845192893680028, + "ttft_s": 2.3858973080004944, + "end_to_end_latency_s": 12.396114314000442, + "request_output_throughput_token_per_s": 12.18123648871625, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0753914121437674, + "ttft_s": 3.7979515620008897, + "end_to_end_latency_s": 12.062812344000122, + "request_output_throughput_token_per_s": 12.517810581303234, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07827290263703385, + "ttft_s": 4.0259433050014195, + "end_to_end_latency_s": 12.289054446000591, + "request_output_throughput_token_per_s": 12.287357067503445, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08037692225782007, + "ttft_s": 2.7702112700008, + "end_to_end_latency_s": 12.780112861000816, + 
"request_output_throughput_token_per_s": 11.815232122150064, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0754146436308929, + "ttft_s": 2.659065838999595, + "end_to_end_latency_s": 12.669869032000861, + "request_output_throughput_token_per_s": 11.918039532895918, + "number_total_tokens": 718, + "number_output_tokens": 168, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07651469001882202, + "ttft_s": 3.9029117230002157, + "end_to_end_latency_s": 12.168781049000245, + "request_output_throughput_token_per_s": 12.408802442246733, + "number_total_tokens": 709, + "number_output_tokens": 159, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07910034165034391, + "ttft_s": 0.5877848020008969, + "end_to_end_latency_s": 12.893543471000157, + "request_output_throughput_token_per_s": 11.711287927917217, + "number_total_tokens": 713, + "number_output_tokens": 163, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07891344653160536, + "ttft_s": 3.1204149240002153, + "end_to_end_latency_s": 12.468538469000123, + "request_output_throughput_token_per_s": 12.110481142230377, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08008235822283721, + "ttft_s": 3.223524451999765, + "end_to_end_latency_s": 12.573121445000652, + "request_output_throughput_token_per_s": 12.009746399136302, + "number_total_tokens": 707, + "number_output_tokens": 157, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.0782174190061815, + "ttft_s": 3.009283774999858, + "end_to_end_latency_s": 12.35893882900018, + 
"request_output_throughput_token_per_s": 12.217877447995726, + "number_total_tokens": 708, + "number_output_tokens": 158, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.08201954205124377, + "ttft_s": 0.5897332360000291, + "end_to_end_latency_s": 12.796147676001056, + "request_output_throughput_token_per_s": 11.80042648954402, + "number_total_tokens": 706, + "number_output_tokens": 156, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07826254014183594, + "ttft_s": 3.3267620709993935, + "end_to_end_latency_s": 12.678737574999104, + "request_output_throughput_token_per_s": 11.909703084142482, + "number_total_tokens": 712, + "number_output_tokens": 162, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07663169248743315, + "ttft_s": 2.910473151001497, + "end_to_end_latency_s": 12.261264740000115, + "request_output_throughput_token_per_s": 12.31520591080546, + "number_total_tokens": 710, + "number_output_tokens": 160, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07831386209037455, + "ttft_s": 3.923613838000165, + "end_to_end_latency_s": 12.138856515999578, + "request_output_throughput_token_per_s": 12.43939244202903, + "number_total_tokens": 705, + "number_output_tokens": 155, + "number_input_tokens": 550 + }, + { + "error_code": null, + "error_msg": "", + "inter_token_latency_s": 0.07331145139639886, + "ttft_s": 3.8076719870005036, + "end_to_end_latency_s": 12.02453506600068, + "request_output_throughput_token_per_s": 12.557658085837499, + "number_total_tokens": 714, + "number_output_tokens": 164, + "number_input_tokens": 550 + } +] \ No newline at end of file diff --git a/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_summary.json 
b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_summary.json new file mode 100644 index 0000000..5b9d8c1 --- /dev/null +++ b/benchmarks/results/yi/4xa10g/vllm/result_01-ai_Yi-1.5-34B-Chat_8/01-ai-Yi-1-5-34B-Chat_550_150_summary.json @@ -0,0 +1,78 @@ +{ + "version": "2023-08-31", + "name": "01-ai-Yi-1-5-34B-Chat_550_150_summary", + "model": "01-ai/Yi-1.5-34B-Chat", + "mean_input_tokens": 550, + "stddev_input_tokens": 0, + "mean_output_tokens": 150, + "stddev_output_tokens": 0, + "num_concurrent_requests": 8, + "results_inter_token_latency_s_quantiles_p25": 0.07657239702395104, + "results_inter_token_latency_s_quantiles_p50": 0.07806983328522192, + "results_inter_token_latency_s_quantiles_p75": 0.07920261970734134, + "results_inter_token_latency_s_quantiles_p90": 0.08048290927927977, + "results_inter_token_latency_s_quantiles_p95": 0.08129656306738024, + "results_inter_token_latency_s_quantiles_p99": 0.08300724588026018, + "results_inter_token_latency_s_mean": 0.07794287519221489, + "results_inter_token_latency_s_min": 0.07331145139639886, + "results_inter_token_latency_s_max": 0.08309716781049145, + "results_inter_token_latency_s_stddev": 0.002087756318470657, + "results_ttft_s_quantiles_p25": 2.8948998052505885, + "results_ttft_s_quantiles_p50": 3.1766039474996433, + "results_ttft_s_quantiles_p75": 3.83876859575048, + "results_ttft_s_quantiles_p90": 3.9833098204006094, + "results_ttft_s_quantiles_p95": 4.254826318100367, + "results_ttft_s_quantiles_p99": 4.332220672601234, + "results_ttft_s_mean": 3.0223527698655674, + "results_ttft_s_min": 0.5815162910002982, + "results_ttft_s_max": 4.416717618001712, + "results_ttft_s_stddev": 1.0503665795084827, + "results_end_to_end_latency_s_quantiles_p25": 12.235640377999516, + "results_end_to_end_latency_s_quantiles_p50": 12.423329736000596, + "results_end_to_end_latency_s_quantiles_p75": 12.636705780250395, + "results_end_to_end_latency_s_quantiles_p90": 12.77226875209999, + 
"results_end_to_end_latency_s_quantiles_p95": 12.796140465350982, + "results_end_to_end_latency_s_quantiles_p99": 12.886684354570388, + "results_end_to_end_latency_s_mean": 12.427825965779062, + "results_end_to_end_latency_s_min": 11.969845517000067, + "results_end_to_end_latency_s_max": 12.893543471000157, + "results_end_to_end_latency_s_stddev": 0.2453431809637068, + "results_request_output_throughput_token_per_s_quantiles_p25": 11.949331579137802, + "results_request_output_throughput_token_per_s_quantiles_p50": 12.154551419986344, + "results_request_output_throughput_token_per_s_quantiles_p75": 12.34099733333681, + "results_request_output_throughput_token_per_s_quantiles_p90": 12.5054205883345, + "results_request_output_throughput_token_per_s_quantiles_p95": 12.524066100837192, + "results_request_output_throughput_token_per_s_quantiles_p99": 12.568772407377113, + "results_request_output_throughput_token_per_s_mean": 12.154846079455641, + "results_request_output_throughput_token_per_s_min": 11.711287927917217, + "results_request_output_throughput_token_per_s_max": 12.6150333173092, + "results_request_output_throughput_token_per_s_stddev": 0.24002533569777737, + "results_number_input_tokens_quantiles_p25": 550.0, + "results_number_input_tokens_quantiles_p50": 550.0, + "results_number_input_tokens_quantiles_p75": 550.0, + "results_number_input_tokens_quantiles_p90": 550.0, + "results_number_input_tokens_quantiles_p95": 550.0, + "results_number_input_tokens_quantiles_p99": 550.0, + "results_number_input_tokens_mean": 550.0, + "results_number_input_tokens_min": "550", + "results_number_input_tokens_max": "550", + "results_number_input_tokens_stddev": 0.0, + "results_number_output_tokens_quantiles_p25": 158.0, + "results_number_output_tokens_quantiles_p50": 160.0, + "results_number_output_tokens_quantiles_p75": 162.0, + "results_number_output_tokens_quantiles_p90": 163.0, + "results_number_output_tokens_quantiles_p95": 164.0, + 
"results_number_output_tokens_quantiles_p99": 166.0, + "results_number_output_tokens_mean": 159.5, + "results_number_output_tokens_min": "152", + "results_number_output_tokens_max": "168", + "results_number_output_tokens_stddev": 3.1958711227159506, + "results_num_requests_started": 104, + "results_error_rate": 0.0, + "results_number_errors": 0, + "results_error_code_frequency": "{}", + "results_mean_output_throughput_token_per_s": 95.81719381923115, + "results_num_completed_requests": 104, + "results_num_completed_requests_per_min": 36.044085449240555, + "timestamp": 1718209114 +} \ No newline at end of file diff --git a/benchmarks/yi_1_5_34b_chat_4xa10g.md b/benchmarks/yi_1_5_34b_chat_4xa10g.md new file mode 100644 index 0000000..8fbecbd --- /dev/null +++ b/benchmarks/yi_1_5_34b_chat_4xa10g.md @@ -0,0 +1,103 @@ +# Benchmark: Yi 1.5 34B Chat on NVIDIA A10G + +Benchmarking the performance of the Yi 1.5 34B Chat model on NVIDIA A10G GPUs using `llmperf`. The engines tested are vLLM and Hugging Face TGI, both measured via HTTP using their OpenAI-compatible API implementations. The tests were run on an Amazon EC2 g5.12xlarge instance equipped with 4x NVIDIA A10G GPUs. + +## Test Environment +- **Instance Type**: Amazon EC2 g5.12xlarge +- **GPU**: 4x NVIDIA A10G +- **Setup**: Requests and containers were run on the same machine via localhost. +- **Engines Tested**: + - [vLLM](https://docs.vllm.ai/en/stable/) + - [Hugging Face TGI](https://huggingface.co/docs/text-generation-inference/en/index) +- **Model**: [01-ai/Yi-1.5-34B-Chat](https://huggingface.co/01-ai/Yi-1.5-34B-Chat) +- **Scenario**: + - Expected Input: 550 tokens (mean) + - Expected Output: 150 tokens (mean) + - Concurrent Requests: 2, 4, 8, 16, 32, 64 +- **Metrics**: + - Throughput: Measures how many tokens can be processed in a given time frame. + - First Time to Token: Tracks the time taken to generate the first token in response to a request.
+ - Latency (Inter-Token Latency): Measures the time elapsed between generating successive tokens. + +The benchmarking was performed using `llmperf`, a tool designed to evaluate the performance of LLMs across different frameworks and hardware configurations. + +## Benchmark Results + +The benchmark tested the Yi 1.5 34B Chat model on 4x NVIDIA A10G GPUs using llmperf, comparing vLLM, Hugging Face TGI, and NVIDIA NIM on an Amazon EC2 g5.12xlarge instance. Metrics included throughput, first time to token, and inter-token latency under varying levels of concurrency. + + +### Concurrency User 1 + +| Engine | vLLM | TGI | NVIDIA NIM | +| ------------------------------ | ----------- | ----------- | ----------- | +| First Time To Token (ms) | 137.2919661 | 138.9137787 | 135.4107646 | +| Throughput (token/sec) | 31.92462559 | 32.78526142 | 32.2123514 | +| Inter Token Latency (ms/token) | 30.65149844 | 29.86407376 | 30.3319248 | + +- For First Time To Token (ms), TGI is 1.18% slower than vLLM and NVIDIA NIM is 1.37% faster than vLLM. Compared to TGI, NVIDIA NIM is 2.52% faster. +- For Throughput (token/sec), TGI is 2.70% faster than vLLM and NVIDIA NIM is 0.90% faster than vLLM. Compared to TGI, NVIDIA NIM is 1.75% slower. +- For Inter Token Latency (ms/token), TGI is 2.57% faster than vLLM and NVIDIA NIM is 1.04% faster than vLLM. Compared to TGI, NVIDIA NIM is 1.57% slower. + + +### Concurrency User 4 + + + +### Concurrency User 16 + + + +### Concurrency User 64 + + + +## Steps to Run Each Benchmark + +Make sure to log in to Hugging Face with `huggingface-cli login` so that you have access to the Yi 1.5 34B Chat model. We are going to use the [benchmark.py](../scripts/benchmark.py) script to run the benchmarks. The script will run the benchmark for 2, 4, 8, 16, 32, 64, and 128 concurrent requests using the same configuration for each engine. + +### vLLM + +1.
Start the vLLM Container: +```bash +docker run --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -e "HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token)" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:v0.4.3 \ + --model 01-ai/Yi-1.5-34B-Chat \ + --tensor-parallel-size 4 +``` + +2. Run the benchmark: + +```bash +# pwd +# >/home/ubuntu/llmperf +python scripts/benchmark.py --model-id "01-ai/Yi-1.5-34B-Chat" +``` + +### Hugging Face TGI + +1. Start the TGI Container: + +```bash +docker run --gpus all -ti -p 8000:80 \ + -e MODEL_ID="01-ai/Yi-1.5-34B-Chat" \ + -v ~/.cache/huggingface/hub:/data \ + --shm-size 1g \ + -e HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token) \ + -e MAX_INPUT_LENGTH=4000 \ + -e NUM_SHARD=4 \ + -e MAX_TOTAL_TOKENS=4096 \ + -e MAX_BATCH_PREFILL_TOKENS=8192 \ + ghcr.io/huggingface/text-generation-inference:latest +``` + +1. Run the benchmark: + +```bash +# pwd +# >/home/ubuntu/llmperf +python scripts/benchmark.py --model-id "01-ai/Yi-1.5-34B-Chat" +``` diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 5ff4857..f455cf1 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -28,7 +28,7 @@ def benchmark(config): print(f"Running test with concurrency: {concurrency}") os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1" os.environ["OPENAI_API_KEY"] = "none" - output_dir = f"result_outputs_{concurrency}" + output_dir = f'result_{config.model_id.replace("/","_")}_{concurrency}' cmd = [ "python", script_file_path, @@ -62,15 +62,36 @@ def benchmark(config): "concurrency": concurrency, "mean_input_token_length": data["results_number_input_tokens_mean"], "mean_output_token_length": data["results_number_output_tokens_mean"], - "first-time-to-token_mean_in_ms_(ttft)": data["results_ttft_s_mean"] * 1000, - "throughput_token_per_s_(token/sec)": data[ - "results_mean_output_throughput_token_per_s" + "time_to_first_token_in_ms_(ttft)_p50": data["results_ttft_s_quantiles_p50"] + * 1000, + 
"time_to_first_token_in_ms_(ttft)_p75": data["results_ttft_s_quantiles_p75"] + * 1000, + "time_to_first_token_in_ms_(ttft)_p95": data["results_ttft_s_quantiles_p95"] + * 1000, + "throughput_token_per_s_(token/sec)_p50": data[ + "results_request_output_throughput_token_per_s_quantiles_p50" + ], + "throughput_token_per_s_(token/sec)_p75": data[ + "results_request_output_throughput_token_per_s_quantiles_p75" + ], + "throughput_token_per_s_(token/sec)_p95": data[ + "results_request_output_throughput_token_per_s_quantiles_p95" ], - "latency_ms_per_token_(inter_token_latency)": data[ - "results_inter_token_latency_s_mean" + "latency_ms_per_token_(inter_token_latency)_p50": data[ + "results_inter_token_latency_s_quantiles_p50" + ] + * 1000, + "latency_ms_per_token_(inter_token_latency)_p75": data[ + "results_inter_token_latency_s_quantiles_p75" + ] + * 1000, + "latency_ms_per_token_(inter_token_latency)_p95": data[ + "results_inter_token_latency_s_quantiles_p95" ] * 1000, "requests_per_minute_(qpm)": data["results_num_completed_requests_per_min"], + "results_number_errors": data["results_number_errors"], + "results_num_completed_requests": data["results_num_completed_requests"], } # append results results[concurrency] = data @@ -80,7 +101,7 @@ def benchmark(config): ) as file: json.dump(detailed_results[concurrency], file, indent=2) # remove the output directory - subprocess.run(["rm", "-rf", output_dir]) + # subprocess.run(["rm", "-rf", output_dir]) return results, detailed_results From bd517baf8453f2ce35af6160ad89bf6e2c6dc1c2 Mon Sep 17 00:00:00 2001 From: philschmid Date: Thu, 13 Jun 2024 06:57:44 +0000 Subject: [PATCH 17/18] updated csv --- .../results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv | 11 +++++++++++ .../results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/benchmarks/results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv b/benchmarks/results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv index a9e40dd..2984a06 
100644 --- a/benchmarks/results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv +++ b/benchmarks/results/yi/4xa10g/tgi-01-ai_Yi-1.5-34B-Chat.csv @@ -14,3 +14,14 @@ latency_ms_per_token_(inter_token_latency)_p95,47.012874357545535,52.76172237063 requests_per_minute_(qpm),8.211894857783436,14.57747226052002,24.663603308027646,36.74115036623516,47.693122662953904,42.71346832611164,27.129583372411076 results_number_errors,0.0,0.0,0.0,0.0,0.0,44.0,86.0 results_num_completed_requests,83.0,100.0,100.0,104.0,112.0,84.0,42.0 + + +concurrency,1.0,2.0,4.0,8.0,16.0,32.0,64.0 +mean_input_token_length,550.0,550.0,550.0,550.0,550.0,550.0,550.0 +mean_output_token_length,158.33734939759037,157.22,158.22,156.83653846153845,158.78571428571428,158.3452380952381,159.04761904761904 +time_to_first_token_in_ms_(ttft)_p75,591.8130939999173,1067.9737397492772,2167.6567652496033,3851.6287092497805,8107.615061500383,10712.515758000109,10294.223657249859 +throughput_token_per_s_(token/sec)_p75,20.88080751297862,18.816694929061082,16.238827688643383,12.6543303317669,8.698153944788167,6.767838652716483,6.908107654299118 +latency_ms_per_token_(inter_token_latency)_p75,45.79195392042707,51.184840881309285,59.91128747742228,78.08761739927169,113.83001986670706,146.0550590310391,144.78203158558753 +requests_per_minute_(qpm),8.211894857783436,14.57747226052002,24.663603308027646,36.74115036623516,47.693122662953904,42.71346832611164,27.129583372411076 +results_number_errors,0.0,0.0,0.0,0.0,0.0,44.0,86.0 +results_num_completed_requests,83.0,100.0,100.0,104.0,112.0,84.0,42.0 diff --git a/benchmarks/results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv b/benchmarks/results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv index 359ff3c..d26adfa 100644 --- a/benchmarks/results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv +++ b/benchmarks/results/yi/4xa10g/vllm-01-ai_Yi-1.5-34B-Chat.csv @@ -14,3 +14,13 @@ latency_ms_per_token_(inter_token_latency)_p95,48.62765116823143,52.996850070530 
requests_per_minute_(qpm),7.915935703584854,14.13572641706053,24.32544640710275,36.044085449240555,46.821420363866935,52.57208219530792,50.95750230507558 results_number_errors,0.0,0.0,0.0,0.0,0.0,0.0,0.0 results_num_completed_requests,80.0,100.0,100.0,104.0,112.0,128.0,128.0 + +concurrency,1.0,2.0,4.0,8.0,16.0,32.0,64.0 +mean_input_token_length,550.0,550.0,550.0,550.0,550.0,550.0,550.0 +mean_output_token_length,159.0375,159.22,159.18,159.5,159.27678571428572,160.0,159.40625 +time_to_first_token_in_ms_(ttft)_p75,588.4995797496231,1060.5307812497813,2150.540910750351,3838.76859575048,7219.509921999816,13748.74519900004,25327.48620450093 +throughput_token_per_s_(token/sec)_p75,20.350613901547643,18.62048349387721,16.125915244796303,12.34099733333681,8.560894932024599,5.273474722315811,3.1354960779360517 +latency_ms_per_token_(inter_token_latency)_p75,47.63683723238133,52.004817064281525,60.647559917482496,79.20261970734134,116.40027259278233,188.72039812495933,319.2453598913946 +requests_per_minute_(qpm),7.915935703584854,14.13572641706053,24.32544640710275,36.044085449240555,46.821420363866935,52.57208219530792,50.95750230507558 +results_number_errors,0.0,0.0,0.0,0.0,0.0,0.0,0.0 +results_num_completed_requests,80.0,100.0,100.0,104.0,112.0,128.0,128.0 From 5c1c321e95061fc20304eb882cb9653a88bc7b9c Mon Sep 17 00:00:00 2001 From: philschmid Date: Mon, 17 Jun 2024 14:27:21 +0000 Subject: [PATCH 18/18] inf2 --- benchmarks/llama_3_8b_instruct_inf2.md | 71 ++++++++++++++++++++++++++ pyproject.toml | 1 + scripts/benchmark.py | 4 +- 3 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 benchmarks/llama_3_8b_instruct_inf2.md diff --git a/benchmarks/llama_3_8b_instruct_inf2.md b/benchmarks/llama_3_8b_instruct_inf2.md new file mode 100644 index 0000000..0960abd --- /dev/null +++ b/benchmarks/llama_3_8b_instruct_inf2.md @@ -0,0 +1,71 @@ +# Benchmark: Llama 3 8b Instruct on AWS Inferentia2 + +Benchmarking the performance of LLMs on the Llama 3 8b Instruct model using 
the AWS Inferentia2 using `llmperf`. + +## Test Environment +- **Instance Type**: Amazon EC2 inf2.xlarge +- **Accelerator**: AWS Inferentia2 +- **Setup**: Requests and containers were run on the same machine via localhost. +- **Engines Tested**: + - [Hugging Face TGI](https://huggingface.co/docs/text-generation-inference/en/index) +- **Model**: [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) +- **Scenario**: + - Expected Input: 550 tokens (mean) + - Expected Output: 150 tokens (mean) + - Concurrent Requests: 1, 2, 4, 8, 16 +- **Metrics**: + - Throughput: Measures how many tokens can be processed in a given time frame. + - First Time to Token: Tracks the time taken to generate the first token in response to a request. + - Latency (Inter-Token Latency): Measures the time elapsed between generating successive tokens. + +The benchmarking was performed using `llmperf`, a tool designed to evaluate the performance of LLMs across different frameworks and hardware configurations. + +## Benchmark Results + +The first time to token (ms) at the 50th percentile increases significantly from 1001 ms for one user to 7503 ms for 16 users, indicating higher latency with more users. + +Throughput (tokens per second at the 50th percentile) improves with concurrency, peaking at 142.72 tokens per second for 16 users. Note that the 16-user run recorded 56 errors, so its figures only reflect the requests that completed. + +Inter-token latency (ms/token) at the 50th percentile rises from 52.18 ms for one user to 97.17 ms for 16 users, showing longer intervals between tokens as user count increases.
+ + +| Concurrent Users | 1 | 2 | 4 | 8 | 16 | +|--------------------------------|-------|-------|--------|-------|--------| +| Mean Input Tokens | 550 | 550 | 550 | 550 | 550 | +| Mean Output Tokens | 177 | 175 | 174 | 176 | 175 | +| First Time To Token (ms) p50 | 1001 | 1419 | 3732.78| 7539 | 7503 | +| Throughput (token/sec) p50 | 16.23 | 29.2 | 48.08 | 72.16 | 142.72 | +| Inter Token Latency (ms/token) p50 | 52.18 | 58.29 | 69.93 | 93.95 | 97.17 | +| Requests per minute | 6.4 | 11.45 | 18.81 | 27.78 | 26.9 | +| Errors | 0 | 0 | 0 | 0 | 56 | +| Cost per 1M tokens | $13.01| $7.23 | $4.39 | $2.93 | $1.48 | + + + +## Steps to Run Each Benchmark + +Make sure to log in to Hugging Face with `huggingface-cli login` so that you have access to the Llama 3 8B Instruct model. We are going to use the [benchmark.py](../scripts/benchmark.py) script to run the benchmarks. The script will run the benchmark for 2, 4, 8, 16, 32, 64, and 128 concurrent requests using the same configuration for each engine. + + +1. Start the TGI Container: + +```bash +docker run --privileged -ti -p 8000:80 \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -e MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" \ + -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \ + -e HF_AUTO_CAST_TYPE="fp16" \ + -e HF_NUM_CORES=2 \ + -e MAX_BATCH_SIZE=8 \ + -e MAX_INPUT_LENGTH=4000 \ + -e MAX_TOTAL_TOKENS=4096 \ + ghcr.io/huggingface/neuronx-tgi:0.0.23 +``` + +2.
Run the benchmark: + +```bash +# pwd +# >/home/ubuntu/llmperf +python scripts/benchmark.py --model-id "meta-llama/Meta-Llama-3-8B-Instruct" +``` diff --git a/pyproject.toml b/pyproject.toml index 142e52c..f968712 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,4 +23,5 @@ dependencies = [ "boto3", "google-cloud-aiplatform", "pandas", + "tabulate" ] diff --git a/scripts/benchmark.py b/scripts/benchmark.py index f455cf1..7f118ca 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -146,9 +146,9 @@ def main(): results, detailed_results = benchmark(config) # print the results in a nice markdown table using pandas df = pd.DataFrame(detailed_results) - print(df.to_markdown()) - # write to csv df.to_csv(f"{config.model_id.replace('/','_')}.csv") + # write to csv + print(df.to_markdown()) if __name__ == "__main__":