From d28d060e67945e5c129ff3e7f1fb2384bc4949b1 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 5 Dec 2024 22:05:13 +0000 Subject: [PATCH 01/10] first commit --- .../container/benchmark_serving.py | 59 +++++++++++++++---- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 9ebb26ad5..20466a05e 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -12,7 +12,7 @@ import random import requests import time -from typing import AsyncGenerator, List, Optional, Tuple, Dict +from typing import AsyncGenerator, List, NamedTuple, Optional, Tuple, Dict from prometheus_client import start_http_server, Histogram, Gauge import google.auth @@ -21,6 +21,8 @@ import aiohttp import numpy as np +from sympy import symbols +from sympy.parsing.sympy_parser import parse_expr from transformers import AutoTokenizer from transformers import PreTrainedTokenizerBase @@ -30,6 +32,15 @@ CLIENT_TIMEOUT_SEC = 3 * 60 * 60 NEW_TEXT_KEY = "\nOutput:\n" PROMETHEUS_PORT = 9090 +NS_IN_SEC = 1_000_000_000 + +class QueryStats(NamedTuple): + start_time: float + end_time: float + output_len: float + ttft: Optional[float] + +RESULTS_BUCKET : List[QueryStats] = [] # Prometheus Metrics prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) @@ -38,6 +49,7 @@ tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request (excluding first token)') ttft_metric = Histogram('LatencyProfileGenerator:time_to_first_token', 'Time to first token per request') active_requests_metric = Gauge('LatencyProfileGenerator:active_requests', 'How many requests actively being processed') +request_rate_metric = Gauge('LatencyProfileGenerator:request_rate', "The current request rate in seconds") # Add trace config for monitoring in flight requests async def on_request_start(session, trace_config_ctx, params): @@ -110,18 +122,26 @@ def get_filtered_dataset( async def generate_next_request( input_requests: List[Tuple[str, int, int]], - request_rate: float, + request_rate_expr: float, + start_time: float, ) -> AsyncGenerator[Tuple[str, int, int], None]: """Gets request async.""" while True: request = random.choice(input_requests) yield request - if request_rate == float("inf"): + if request_rate_expr == "oo": # If the request rate is infinity, then we don't need to wait. continue + + # Evaluate the request rate at this point in time + t = symbols('t') + expr_parsed = parse_expr(request_rate_expr, transformations="all", local_dict={"t": t}) + request_rate_at_t = expr_parsed.subs(t, ((time.time_ns() - start_time) / NS_IN_SEC)) + request_rate_metric.set(request_rate_at_t) + # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) + interval = np.random.exponential(1.0 / request_rate_at_t) # The next request will be sent after the interval. 
await asyncio.sleep(interval) @@ -408,7 +428,7 @@ async def benchmark( benchmark_start_time = time.time() tasks: List[asyncio.Task] = [] prompts_sent: int = 0 - async for request in generate_next_request(input_requests, args.request_rate): + async for request in generate_next_request(input_requests, args.request_rate, time.time_ns()): if args.num_prompts <= prompts_sent: break prompt, prompt_len, output_len = request @@ -876,16 +896,31 @@ async def main(args: argparse.Namespace): " LLaMA2 models." ), ) + # Input assertions + def is_expression_of_t(input_str): + if input_str == "inf": + return "oo" + # Check if expression uses variables other than 't' by attempting to evaluate with only 't' defined + try: + t = symbols('t') + expr_parsed = parse_expr(input_str, transformations="all", local_dict={"t": t}) + expr_parsed.subs(t, 1) + return input_str + except Exception: + raise ValueError(f"Request rate {input_str}, must be an expression of `t`") + parser.add_argument( "--request-rate", - type=float, - default=float("inf"), + type=is_expression_of_t, + default=None, help=( - "Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times." - ), + "Specifies the request rate as a function of time, f(t)." + " Example format: '1+1.05*t', where 't' represents seconds from" + " start. If set to 'inf', all requests are sent at time 0." + " Otherwise, the function is interpreted to generate a Poisson" + " process for request arrival times based on the provided rate" + " expression." + ), ) parser.add_argument("--seed", type=int, default=int(time.time())) parser.add_argument( From b13e34ae744370d2eac044c5cd03cdc33268af6c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 5 Dec 2024 22:06:39 +0000 Subject: [PATCH 02/10] remove non useful code --- .../profile-generator/container/benchmark_serving.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 20466a05e..460b6de97 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -34,13 +34,6 @@ PROMETHEUS_PORT = 9090 NS_IN_SEC = 1_000_000_000 -class QueryStats(NamedTuple): - start_time: float - end_time: float - output_len: float - ttft: Optional[float] - -RESULTS_BUCKET : List[QueryStats] = [] # Prometheus Metrics prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)]) @@ -907,7 +900,7 @@ def is_expression_of_t(input_str): expr_parsed.subs(t, 1) return input_str except Exception: - raise ValueError(f"Request rate {input_str}, must be an expression of `t`") + raise ValueError(f"Request rate {input_str}, must be numeric or an expression of `t`") parser.add_argument( "--request-rate", From 317ae419db1cfa26874353367913417a8bcc151c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 5 Dec 2024 22:07:47 +0000 Subject: [PATCH 03/10] more of the above --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 460b6de97..ebd088848 100644 --- 
a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -12,7 +12,7 @@ import random import requests import time -from typing import AsyncGenerator, List, NamedTuple, Optional, Tuple, Dict +from typing import AsyncGenerator, List, Tuple, Dict from prometheus_client import start_http_server, Histogram, Gauge import google.auth From 94f3c8a84bbc570014536ba7ced6635173d96505 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 5 Dec 2024 22:08:55 +0000 Subject: [PATCH 04/10] reset default --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index ebd088848..376366762 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -905,7 +905,7 @@ def is_expression_of_t(input_str): parser.add_argument( "--request-rate", type=is_expression_of_t, - default=None, + default="inf", help=( "Specifies the request rate as a function of time, f(t)." " Example format: '1+1.05*t', where 't' represents seconds from" From 8f9bce29afacd1b71ddfefb40a2e7d99e4d211f4 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 5 Dec 2024 22:41:23 +0000 Subject: [PATCH 05/10] revert --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 376366762..95d0aa1e7 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -12,7 +12,7 @@ import random import requests import time -from typing import AsyncGenerator, List, Tuple, Dict +from typing import AsyncGenerator, List, Optional, Tuple, Dict from prometheus_client import start_http_server, Histogram, Gauge import google.auth From f087486056bc216509ab438d6a76f372055b9068 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 5 Dec 2024 22:41:25 +0000 Subject: [PATCH 06/10] revert --- manifest.yaml | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 manifest.yaml diff --git a/manifest.yaml b/manifest.yaml new file mode 100644 index 000000000..bce90cfad --- /dev/null +++ b/manifest.yaml @@ -0,0 +1,93 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + annotations: + gkit.gke.io/generated: "true" + gkit.gke.io/inference-server: vllm + creationTimestamp: null + labels: + app: llama3-8b-vllm-inference-server + name: llama3-8b-vllm-deployment +spec: + replicas: 1 + selector: + matchLabels: + app: llama3-8b-vllm-inference-server + strategy: {} + template: + metadata: + creationTimestamp: null + labels: + ai.gke.io/inference-server: vllm + ai.gke.io/model: LLaMA3_8B + app: llama3-8b-vllm-inference-server + examples.ai.gke.io/source: blueprints + spec: + containers: + - args: + - --model=$(MODEL_ID) + - --max-num-seq=1024 + - --num-scheduler-steps=4 + command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + env: + - name: MODEL_ID + value: meta-llama/Meta-Llama-3-8B + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: 
+ key: hf_api_token + name: hf-secret + image: vllm/vllm-openai:latest + name: inference-server + ports: + - containerPort: 8000 + name: metrics + readinessProbe: + failureThreshold: 60 + httpGet: + path: /health + port: 8000 + periodSeconds: 10 + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /dev/shm + name: dshm + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-tesla-a100 + volumes: + - emptyDir: + medium: Memory + name: dshm +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + annotations: + gkit.gke.io/generated: "true" + creationTimestamp: null + name: llama3-8b-vllm-hpa +spec: + maxReplicas: 3 + metrics: + - pods: + metric: + name: prometheus.googleapis.com|vllm:gpu_cache_usage_perc|gauge + target: + averageValue: 512546u + type: AverageValue + type: Pods + minReplicas: 1 + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: llama3-8b-vllm-deployment +status: + currentMetrics: null + desiredReplicas: 0 \ No newline at end of file From 15c1950dd607b447688ee5198121010f45c75c0c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 5 Dec 2024 22:41:45 +0000 Subject: [PATCH 07/10] revert --- manifest.yaml | 93 --------------------------------------------------- 1 file changed, 93 deletions(-) delete mode 100644 manifest.yaml diff --git a/manifest.yaml b/manifest.yaml deleted file mode 100644 index bce90cfad..000000000 --- a/manifest.yaml +++ /dev/null @@ -1,93 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - annotations: - gkit.gke.io/generated: "true" - gkit.gke.io/inference-server: vllm - creationTimestamp: null - labels: - app: llama3-8b-vllm-inference-server - name: llama3-8b-vllm-deployment -spec: - replicas: 1 - selector: - matchLabels: - app: llama3-8b-vllm-inference-server - strategy: {} - template: - metadata: - creationTimestamp: null - labels: - ai.gke.io/inference-server: vllm - ai.gke.io/model: LLaMA3_8B - app: llama3-8b-vllm-inference-server - examples.ai.gke.io/source: blueprints - spec: - containers: - - args: - - --model=$(MODEL_ID) - - --max-num-seq=1024 - - --num-scheduler-steps=4 - command: - - python3 - - -m - - vllm.entrypoints.openai.api_server - env: - - name: MODEL_ID - value: meta-llama/Meta-Llama-3-8B - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - key: hf_api_token - name: hf-secret - image: vllm/vllm-openai:latest - name: inference-server - ports: - - containerPort: 8000 - name: metrics - readinessProbe: - failureThreshold: 60 - httpGet: - path: /health - port: 8000 - periodSeconds: 10 - resources: - limits: - nvidia.com/gpu: "1" - requests: - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /dev/shm - name: dshm - nodeSelector: - cloud.google.com/gke-accelerator: nvidia-tesla-a100 - volumes: - - emptyDir: - medium: Memory - name: dshm ---- -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - annotations: - gkit.gke.io/generated: "true" - creationTimestamp: null - name: llama3-8b-vllm-hpa -spec: - maxReplicas: 3 - metrics: - - pods: - metric: - name: prometheus.googleapis.com|vllm:gpu_cache_usage_perc|gauge - target: - averageValue: 512546u - type: AverageValue - type: Pods - minReplicas: 1 - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: llama3-8b-vllm-deployment -status: - currentMetrics: null - desiredReplicas: 0 \ No newline at end of file From 5b11894fdb4d9b07821df28bec6abef5b2d6b6dd Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 5 Dec 2024 22:50:55 +0000 Subject: [PATCH 
08/10] requirements --- .../tools/profile-generator/container/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt index c3bfdaca3..6ab15ecec 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt +++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt @@ -36,4 +36,5 @@ accelerate aiohttp google-auth google-cloud-storage >= 2.18.2 -prometheus_client >= 0.21.0 \ No newline at end of file +prometheus_client >= 0.21.0 +sympy >= 1.13 \ No newline at end of file From 3e7db9893db618f7f9164bb33a653ff368698125 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 6 Dec 2024 17:51:35 +0000 Subject: [PATCH 09/10] request rate function --- .../tools/profile-generator/container/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index 95d0aa1e7..f380a22e0 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -561,7 +561,7 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics # Save to file model_without_slash = model.replace("/","-") file_name = ( - f"{args.file_prefix}-{args.backend}-{args.request_rate}qps-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" + f"{args.file_prefix}-{args.backend}-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json" ) with open(file_name, "w", encoding="utf-8") as outfile: json.dump(final_json, outfile) From b36df468ca641caa135ce5be7541ccdfe8cce89a Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 6 Dec 2024 18:09:03 +0000 Subject: [PATCH 10/10] update description --- .../tools/profile-generator/container/benchmark_serving.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index f380a22e0..e0cab7a27 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -912,7 +912,8 @@ def is_expression_of_t(input_str): " start. If set to 'inf', all requests are sent at time 0." " Otherwise, the function is interpreted to generate a Poisson" " process for request arrival times based on the provided rate" - " expression." + " expression. Current value emitted as the following prometheus" + " metric: 'LatencyProfileGenerator:request_rate'" ), ) parser.add_argument("--seed", type=int, default=int(time.time()))
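
The series above replaces the fixed float `--request-rate` with a rate expression f(t) that is parsed with sympy and evaluated at the elapsed benchmark time, so request arrivals follow a non-homogeneous Poisson process whose instantaneous rate tracks f(t). Below is a minimal, self-contained sketch of that sampling idea; it is illustrative only — the helper name `sample_arrival_times`, the simulated clock, and the positivity check are not part of the patch — and it assumes the sympy and numpy dependencies pinned in requirements.txt.

# Illustrative sketch only -- not part of the patch series above.
# Shows how a --request-rate expression such as "1+1.05*t" can drive a
# non-homogeneous Poisson arrival process: parse f(t) once with sympy,
# evaluate it at the current elapsed time, and draw an exponential
# inter-arrival gap from the rate at that instant.
import numpy as np
from sympy import symbols
from sympy.parsing.sympy_parser import parse_expr


def sample_arrival_times(rate_expr: str, num_requests: int) -> list:
    """Simulate send times (seconds from start) for num_requests requests
    whose instantaneous rate follows f(t) given by rate_expr."""
    t = symbols("t")
    expr = parse_expr(rate_expr, local_dict={"t": t})  # f(t), e.g. 1 + 1.05*t
    elapsed = 0.0                                      # simulated seconds since start
    send_times = []
    for _ in range(num_requests):
        rate = float(expr.subs(t, elapsed))            # requests/sec right now
        if rate <= 0:
            raise ValueError("request rate expression must stay positive")
        elapsed += np.random.exponential(1.0 / rate)   # Poisson inter-arrival gap
        send_times.append(elapsed)
    return send_times


if __name__ == "__main__":
    # A rate that starts at 1 QPS and ramps up by roughly 1 QPS per second.
    print(sample_arrival_times("1 + 1.0*t", 5))

Note that the benchmark script in the patch evaluates f(t) against wall-clock elapsed nanoseconds and re-parses the expression on every iteration of the request generator; the parse could be hoisted out of the loop, but the sampled arrival process is the same either way.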