Skip to content

Commit

Permalink
feat(loadtest): add loadtest tools (#906)
Browse files Browse the repository at this point in the history
* add loadtest tools

* update

* [autofix.ci] apply automated fixes

* add readme

* cleanup legacy loadtest

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
  • Loading branch information
wsxiaoys and autofix-ci[bot] authored Nov 28, 2023
1 parent b47abc6 commit edd33a3
Show file tree
Hide file tree
Showing 7 changed files with 317 additions and 37 deletions.
7 changes: 0 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
loadtest:
ifdef TABBY_API_HOST
k6 run tests/*.loadtest.js
else
$(error TABBY_API_HOST is undefined)
endif

# Auto-fix lint issues: strip unused Cargo dependencies (best-effort, hence
# `|| true` so a missing/failing cargo-machete does not abort) then format
# with the nightly rustfmt toolchain.
fix:
	cargo machete --fix || true
	cargo +nightly fmt
Expand Down
6 changes: 6 additions & 0 deletions python/tabby-loadtest/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Run a load test of Tabby on Modal GPUs

Steps:
1. Adjust `TABBY_API_HOST` in `run.sh` to match your Modal deployment URL.
2. Add the models you want to benchmark at the end of `run.sh`.
3. Run `run.sh`; output will be appended to `record.csv`.
48 changes: 48 additions & 0 deletions python/tabby-loadtest/loadtest.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import http from "k6/http";
import { check, sleep } from "k6";
import { textSummary } from "https://jslib.k6.io/k6-utils/1.4.0/index.js";

// Number of concurrent virtual users, supplied via `k6 run -e PARALLELISM=<n>`.
// Explicit radix 10 so zero-padded or malformed values are never parsed in an
// unexpected base (parseInt without a radix is a classic footgun).
const PARALLELISM = parseInt(__ENV.PARALLELISM, 10);

export const options = {
  stages: [
    // Ramp to full parallelism almost immediately, then hold it for 30s.
    { duration: "1s", target: PARALLELISM },
    { duration: "30s", target: PARALLELISM },
  ],
  // Below thresholds are tested against TabbyML/StarCoder-1B served by NVIDIA T4 GPU.
  thresholds: {
    http_req_failed: ['rate<0.001'],
    http_req_duration: ["med<1800", "avg<1800", "p(90)<2500", "p(95)<3000"],
  },
};

// One virtual-user iteration: POST a code-completion request to the Tabby
// server and record whether it answered HTTP 200.
export default () => {
  const body = JSON.stringify({
    language: "python",
    segments: {
      prefix: "def binarySearch(arr, left, right, x):\n mid = (left +"
    },
  });
  const res = http.post(`${__ENV.TABBY_API_HOST}/v1/completions`, body, {
    headers: { "Content-Type": "application/json" },
  });
  check(res, { success: (r) => r.status === 200 });
  // Short think-time between requests, so each VU issues at most ~2 req/s.
  sleep(0.5);
};

// End-of-run summary hook: convert latency stats to seconds, derive QPS,
// and emit a single CSV row (qps,avg,med,p90,p95) into metrics.txt for
// run.sh to pick up.
export function handleSummary(data) {
  const stats = data.metrics.http_req_duration.values;
  const toSeconds = (ms) => ms / 1000;

  const avg = toSeconds(stats.avg);
  const med = toSeconds(stats.med);
  const p90 = toSeconds(stats["p(90)"]);
  const p95 = toSeconds(stats["p(95)"]);
  // Throughput estimate: PARALLELISM concurrent users each completing
  // roughly 1/avg requests per second.
  const qps = PARALLELISM / avg;

  const row = [qps, avg, med, p90, p95].map(rounded).join(",");
  return { "metrics.txt": row };
}

// Round to two decimal places for compact CSV output.
function rounded(x) {
  const scaled = x * 100;
  return Math.round(scaled) / 100;
}
59 changes: 59 additions & 0 deletions python/tabby-loadtest/record.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
GPU,Model,Parallelism,QPS,Latency (Avg),Latency (Med),Latency (p90),Latency (p95),Passed
T4,TabbyML/StarCoder-1B,7,4.14,1.69,"1.58","2","2.05","SUCCESS"
T4,TabbyML/StarCoder-1B,10,4.85,2.06,"1.98","2.49","2.63","FAILED"
T4,TabbyML/StarCoder-1B,8,4.22,1.9,"1.85","2.29","2.38","FAILED"
A10G,TabbyML/StarCoder-1B,17,12.01,1.42,"1.35","1.66","1.9","SUCCESS"
A10G,TabbyML/StarCoder-1B,25,14.61,1.71,"1.68","1.97","2.07","SUCCESS"
A10G,TabbyML/StarCoder-1B,29,15.94,1.82,"1.79","2.11","2.17","FAILED"
A10G,TabbyML/StarCoder-1B,27,14.89,1.81,"1.79","2.03","2.37","FAILED"
A10G,TabbyML/StarCoder-1B,26,14.79,1.76,"1.72","2.08","2.22","SUCCESS"
A100,TabbyML/StarCoder-1B,33,13.16,2.51,"1.57","2.58","11.52","FAILED"
A100,TabbyML/StarCoder-1B,17,12.94,1.31,"1.29","1.46","1.53","SUCCESS"
A100,TabbyML/StarCoder-1B,25,16.98,1.47,"1.43","1.69","1.86","SUCCESS"
A100,TabbyML/StarCoder-1B,29,10.46,2.77,"2.8","3.11","3.18","FAILED"
A100,TabbyML/StarCoder-1B,27,13.58,1.99,"1.96","2.37","2.42","FAILED"
A100,TabbyML/StarCoder-1B,26,16.52,1.57,"1.47","1.96","2.19","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,7,3.82,1.83,"1.86","1.94","1.96","FAILED"
T4,TabbyML/DeepseekCoder-1.3B,4,3.05,1.31,"1.32","1.45","1.49","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,5,3.18,1.57,"1.54","1.92","1.97","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,6,3.33,1.8,"1.84","1.92","1.95","FAILED"
A10G,TabbyML/DeepseekCoder-1.3B,17,11.66,1.46,1.46,1.65,1.77,SUCCESS
A10G,TabbyML/DeepseekCoder-1.3B,21,6.83,3.07,1.63,14.32,14.56,FAILED
A10G,TabbyML/DeepseekCoder-1.3B,19,12.63,1.5,1.5,1.69,1.78,SUCCESS
A10G,TabbyML/DeepseekCoder-1.3B,20,12.88,1.55,1.53,1.79,1.92,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,33,20.78,1.59,1.55,1.84,1.9,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,49,18.2,2.69,2.62,3.15,3.38,FAILED
A100,TabbyML/DeepseekCoder-1.3B,41,16.44,2.49,2.36,2.9,3.49,FAILED
A100,TabbyML/DeepseekCoder-1.3B,37,21.15,1.75,1.67,2.07,2.18,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,39,14.78,2.64,2.6,3.02,3.13,FAILED
A100,TabbyML/DeepseekCoder-1.3B,38,20.88,1.82,1.76,2.08,2.19,FAILED
T4,TabbyML/StarCoder-3B,7,1.89,3.7,3.68,3.89,3.96,FAILED
T4,TabbyML/StarCoder-3B,4,1.53,2.62,2.62,2.75,2.78,FAILED
T4,TabbyML/StarCoder-3B,2,0.86,2.32,2.34,2.43,2.47,FAILED
A10G,TabbyML/StarCoder-3B,17,5.42,3.14,3.16,3.47,3.84,FAILED
A10G,TabbyML/StarCoder-3B,9,4.31,2.09,2.05,2.39,2.69,FAILED
A10G,TabbyML/StarCoder-3B,5,2.85,1.75,1.73,1.95,2.28,SUCCESS
A10G,TabbyML/StarCoder-3B,7,3.61,1.94,1.94,2.18,2.29,FAILED
A10G,TabbyML/StarCoder-3B,6,3.17,1.89,1.91,2.04,2.06,FAILED
A100,TabbyML/StarCoder-3B,33,8.12,4.07,4.12,4.53,4.61,FAILED
A100,TabbyML/StarCoder-3B,17,7.9,2.15,2.1,2.34,2.81,FAILED
A100,TabbyML/StarCoder-3B,9,4.47,2.01,1.99,2.19,2.26,FAILED
A100,TabbyML/StarCoder-3B,5,3.21,1.56,1.56,1.68,1.72,SUCCESS
A100,TabbyML/StarCoder-3B,7,4.29,1.63,1.61,1.78,1.8,SUCCESS
A100,TabbyML/StarCoder-3B,8,4.63,1.73,1.73,1.92,2.03,SUCCESS
A10G,TabbyML/DeepseekCoder-6.7B,5,1.3,3.85,3.83,4.25,4.31,FAILED
A10G,TabbyML/DeepseekCoder-6.7B,3,1.14,2.63,2.6,2.81,2.86,FAILED
A10G,TabbyML/DeepseekCoder-6.7B,2,0.83,2.4,2.4,2.48,2.5,FAILED
A100,TabbyML/DeepseekCoder-6.7B,9,3.14,2.87,2.85,3.08,3.13,FAILED
A100,TabbyML/DeepseekCoder-6.7B,5,2.08,2.4,2.46,2.58,2.63,FAILED
A100,TabbyML/DeepseekCoder-6.7B,3,1.32,2.27,2.3,2.54,2.69,FAILED
A100,TabbyML/DeepseekCoder-6.7B,2,1.2,1.67,1.66,1.84,1.93,SUCCESS
A100,TabbyML/CodeLlama-7B,9,3.69,2.44,2.45,2.59,2.63,FAILED
A100,TabbyML/CodeLlama-7B,5,2.14,2.34,2.31,2.61,3.26,FAILED
A100,TabbyML/CodeLlama-7B,3,1.52,1.97,2.02,2.3,2.37,FAILED
A100,TabbyML/CodeLlama-7B,5,2.37,2.11,2.13,2.24,2.26,FAILED
A100,TabbyML/CodeLlama-7B,3,1.59,1.89,1.95,2.04,2.07,FAILED
A100,TabbyML/CodeLlama-7B,2,1.45,1.38,1.39,1.54,1.56,SUCCESS
A100,TabbyML/CodeLlama-13B,5,1.21,4.14,4.15,4.38,4.5,FAILED
A100,TabbyML/CodeLlama-13B,3,0.89,3.36,3.4,3.71,3.73,FAILED
A100,TabbyML/CodeLlama-13B,2,0.73,2.75,2.73,2.92,3.06,FAILED
107 changes: 107 additions & 0 deletions python/tabby-loadtest/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/bash

# Append one result row to record.csv.
#   Columns: GPU,model,parallelism,<metrics...>,<SUCCESS|FAILED>
#   $1 - comma-joined metrics string (plus status) produced by loadtest().
# Expansions are quoted (shellcheck SC2086) so empty or unusual values still
# yield a single well-formed row instead of being word-split by echo.
record() {
    echo "$GPU_CONFIG,$MODEL_ID,$PARALLELISM,$1" >> record.csv
}

# Stop the deployed modal loadtest app, if one is currently running.
cleanup() {
    MODAL_APP_ID=$(modal app list | grep tabby-server-loadtest | grep deployed | awk '{print $2}')

    # BUG FIX: the original tested `[ -z $MODAL_APP_ID ]` (true when EMPTY),
    # so the app was "stopped" only when no id was found — i.e. never.
    # `-n` with a quoted expansion stops the app exactly when an id exists.
    if [ -n "$MODAL_APP_ID" ]; then
        modal app stop "$MODAL_APP_ID"
    fi
}

# Deploy a tabby server on modal with the given GPU/model/parallelism, wait
# for it to become healthy, run the k6 load test, and append the result row
# (metrics + SUCCESS/FAILED) to record.csv.
#   $1 - GPU config (T4 / A10G / A100)
#   $2 - model id (e.g. TabbyML/StarCoder-1B)
#   $3 - number of concurrent virtual users
# Returns k6's exit status (0 = thresholds passed) so dichotomic_search can
# use this function as its predicate.
loadtest() {
export GPU_CONFIG=$1
export MODEL_ID=$2
export PARALLELISM=$3

# Deploy on stderr so stdout stays clean for callers.
>&2 modal deploy server.py

# NOTE(review): $! is the PID of the last *background* job, but the deploy
# above ran in the foreground — this value is likely stale/empty and is
# never read below. Confirm before relying on it.
export MODAL_PROCESS_ID=$!
export TABBY_API_HOST=https://wsxiaoys--tabby-server-loadtest-app.modal.run

# wait for warmup
>&2 echo "Waiting for warmup..."


# Poll the health endpoint every 10s, up to 5 retries (~50s total).
n=0
while [[ "$(curl -s -o /dev/null -w ''%{http_code}'' $TABBY_API_HOST/v1/health)" != "200" ]]; do
if [ "$n" -ge 5 ]; then
# error after 5 retries.
# NOTE(review): returning here skips cleanup, leaving the deployed app
# running — confirm whether that is intended.
return 1
fi

sleep 10;
n=$((n+1))
done

>&2 echo "Start load testing..."

# k6's handleSummary() writes the qps/latency columns into metrics.txt.
>&2 k6 run loadtest.js
SUCCESS=$?
METRICS=$(cat metrics.txt)
rm metrics.txt

# Record the row even on failure so threshold misses are kept in the CSV.
if [ $SUCCESS -ne 0 ]; then
record $METRICS,FAILED
else
record $METRICS,SUCCESS
fi

cleanup

return $SUCCESS
}

# Binary-search the largest value in [$1, $2] for which "$3 <value>" exits 0,
# assuming the predicate is monotone (passes below some threshold, fails
# above it). The answer is left in $min when the loop terminates.
function dichotomic_search {
    min=$1
    max=$2
    command=$3

    while (( min < max )); do
        # Midpoint rounded UP, so progress is guaranteed even when the
        # interval shrinks to two adjacent values.
        current=$(( (min + max + 1) / 2 ))

        if $command $current; then
            min=$current
        else
            max=$((current - 1))
        fi
    done
}

# Predicates for dichotomic_search: run the load test at parallelism $1 on a
# fixed GPU type, using the currently exported $MODEL_ID. Each returns
# loadtest's exit status (0 = thresholds passed at this parallelism).
test_t4() {
loadtest T4 $MODEL_ID $1
}

test_a10g() {
loadtest A10G $MODEL_ID $1
}

test_a100() {
loadtest A100 $MODEL_ID $1
}

# Benchmark suite for 1B-3B parameter models: binary-search the maximum
# sustainable parallelism on each GPU tier, with search ranges sized to the
# expected capacity of each GPU.
#   $1 - model id to export for the test_* predicates.
test_1b3b_model() {
export MODEL_ID="$1"

dichotomic_search 1 12 test_t4
dichotomic_search 1 32 test_a10g
dichotomic_search 1 64 test_a100
}

# 7B models only fit comfortably on A100; smaller search range.
test_7b_model() {
export MODEL_ID="$1"

dichotomic_search 1 8 test_a100
}

# 13B models likewise are tested on A100 only.
test_13b_model() {
export MODEL_ID="$1"

dichotomic_search 1 8 test_a100
}

# Entry point: pick the suite(s) to run for this invocation.
# test_7b_model TabbyML/CodeLlama-7B
test_13b_model TabbyML/CodeLlama-13B
97 changes: 97 additions & 0 deletions python/tabby-loadtest/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Usage:
modal serve app.py
"""

import os
from modal import Image, Stub, asgi_app

GPU_CONFIG = os.environ.get("GPU_CONFIG", "T4")
IMAGE_NAME = "tabbyml/tabby:0.6.0"
MODEL_ID = os.environ.get("MODEL_ID", "TabbyML/StarCoder-1B")
PARALLELISM = os.environ.get("PARALLELISM", "4")


def download_model():
    """Pre-download the model weights during the image build.

    Runs inside the image build container (via ``.run_function`` below), so
    cold-started serving containers skip the download. Reads MODEL_ID from
    the environment baked into the image by ``.env(...)``.

    Raises:
        subprocess.CalledProcessError: if the download command fails.
    """
    import os
    import subprocess

    model_id = os.environ.get("MODEL_ID")
    # check=True: fail the image build loudly if the download fails, instead
    # of silently producing an image without model weights (the original
    # ignored the return code).
    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            model_id,
        ],
        check=True,
    )


# Build the serving image: start from the official tabby release image, bake
# the model weights in at build time, and install the ASGI proxy that app()
# uses to forward traffic to the tabby subprocess.
image = (
    Image.from_registry(
        IMAGE_NAME,
        # The tabby image ships no Python; modal needs one for its runtime.
        add_python="3.11",
    )
    # MODEL_ID must be set BEFORE run_function so download_model sees it.
    .env({"MODEL_ID": MODEL_ID})
    # Clear the image's ENTRYPOINT so modal controls the container process.
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .pip_install("asgi-proxy-lib")
    .env({"PARALLELISM": PARALLELISM})
)

stub = Stub("tabby-server-loadtest", image=image)


@stub.function(
    gpu=GPU_CONFIG,
    # Let modal route up to PARALLELISM requests into one container at once,
    # matching the tabby server's own --parallelism setting.
    allow_concurrent_inputs=int(PARALLELISM),
    container_idle_timeout=120,
    timeout=360,
)
@asgi_app()
def app():
    """Start a tabby server subprocess and proxy ASGI traffic to it.

    Blocks until the server accepts TCP connections on port 8000, then
    returns an ASGI app that forwards all requests to it.

    Raises:
        RuntimeError: if the tabby subprocess exits before becoming ready.
    """
    import os
    import socket
    import subprocess
    import time
    from asgi_proxy import asgi_proxy

    model_id = os.environ.get("MODEL_ID")
    parallelism = os.environ.get("PARALLELISM")

    env = os.environ.copy()
    # Keep load-test traffic out of tabby's usage analytics.
    env["TABBY_DISABLE_USAGE_COLLECTION"] = "1"

    launcher = subprocess.Popen(
        [
            "/opt/tabby/bin/tabby",
            "serve",
            "--model",
            model_id,
            "--port",
            "8000",
            "--device",
            "cuda",
            "--parallelism",
            parallelism,
        ],
        env=env
    )

    # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
    def tabby_ready():
        try:
            socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
            return True
        except (socket.timeout, ConnectionRefusedError):
            # Check if launcher webserving process has exited.
            # If so, a connection can never be made.
            retcode = launcher.poll()
            if retcode is not None:
                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
            return False

    while not tabby_ready():
        time.sleep(1.0)

    print("Tabby server ready!")
    return asgi_proxy("http://localhost:8000")
30 changes: 0 additions & 30 deletions tests/default.loadtest.js

This file was deleted.

0 comments on commit edd33a3

Please sign in to comment.