diff --git a/Makefile b/Makefile
index 6b987b9b6d1a..b8e207ac2bc3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,3 @@
-loadtest:
-ifdef TABBY_API_HOST
-	k6 run tests/*.loadtest.js
-else
-	$(error TABBY_API_HOST is undefined)
-endif
-
 fix:
 	cargo machete --fix || true
 	cargo +nightly fmt
diff --git a/python/tabby-loadtest/README.md b/python/tabby-loadtest/README.md
new file mode 100644
index 000000000000..1fd64d1cc987
--- /dev/null
+++ b/python/tabby-loadtest/README.md
@@ -0,0 +1,6 @@
+# Run loadtest with tabby on modal GPUs
+
+Steps:
+1. Adjust `TABBY_API_HOST` in `run.sh` to match your modal deployment url.
+2. Add models you're interested in to benchmark at end of `run.sh`
+3. Run `run.sh`, output will be appended to `record.csv`
\ No newline at end of file
diff --git a/python/tabby-loadtest/loadtest.js b/python/tabby-loadtest/loadtest.js
new file mode 100644
index 000000000000..d1c95dcaee6f
--- /dev/null
+++ b/python/tabby-loadtest/loadtest.js
@@ -0,0 +1,48 @@
+import http from "k6/http";
+import { check, sleep } from "k6";
+import { textSummary } from "https://jslib.k6.io/k6-utils/1.4.0/index.js";
+
+const PARALLELISM = parseInt(__ENV.PARALLELISM, 10);
+
+export const options = {
+  stages: [
+    { duration: "1s", target: PARALLELISM },
+    { duration: "30s", target: PARALLELISM },
+  ],
+  // Below thresholds are tested against TabbyML/StarCoder-1B served by NVIDIA T4 GPU.
+  thresholds: {
+    http_req_failed: ['rate<0.001'],
+    http_req_duration: ["med<1800", "avg<1800", "p(90)<2500", "p(95)<3000"],
+  },
+};
+
+export default () => {
+  const payload = JSON.stringify({
+    language: "python",
+    segments: {
+      prefix: "def binarySearch(arr, left, right, x):\n    mid = (left +"
+    },
+  });
+  const headers = { "Content-Type": "application/json" };
+  const res = http.post(`${__ENV.TABBY_API_HOST}/v1/completions`, payload, {
+    headers,
+  });
+  check(res, { success: (r) => r.status === 200 });
+  sleep(0.5);
+};
+
+export function handleSummary(data) {
+  const avg_latency = data.metrics.http_req_duration.values.avg / 1000;
+  const med_latency = data.metrics.http_req_duration.values.med / 1000;
+  const p90_latency = data.metrics.http_req_duration.values["p(90)"] / 1000;
+  const p95_latency = data.metrics.http_req_duration.values["p(95)"] / 1000;
+  const qps = PARALLELISM / avg_latency;
+
+  return {
+    "metrics.txt": `${rounded(qps)},${rounded(avg_latency)},${rounded(med_latency)},${rounded(p90_latency)},${rounded(p95_latency)}`
+  };
+}
+
+function rounded(x) {
+  return Math.round(x * 100) / 100;
+}
\ No newline at end of file
diff --git a/python/tabby-loadtest/record.csv b/python/tabby-loadtest/record.csv
new file mode 100644
index 000000000000..b5e83e63ab9f
--- /dev/null
+++ b/python/tabby-loadtest/record.csv
@@ -0,0 +1,59 @@
+GPU,Model,Parallelism,QPS,Latency (Avg)," Latency (Med)"," Latency (p90)"," Latency (p95)"," Passed"
+T4,TabbyML/StarCoder-1B,7,4.14,1.69,"1.58","2","2.05","SUCCESS"
+T4,TabbyML/StarCoder-1B,10,4.85,2.06,"1.98","2.49","2.63","FAILED"
+T4,TabbyML/StarCoder-1B,8,4.22,1.9,"1.85","2.29","2.38","FAILED"
+A10G,TabbyML/StarCoder-1B,17,12.01,1.42,"1.35","1.66","1.9","SUCCESS"
+A10G,TabbyML/StarCoder-1B,25,14.61,1.71,"1.68","1.97","2.07","SUCCESS"
+A10G,TabbyML/StarCoder-1B,29,15.94,1.82,"1.79","2.11","2.17","FAILED"
+A10G,TabbyML/StarCoder-1B,27,14.89,1.81,"1.79","2.03","2.37","FAILED"
+A10G,TabbyML/StarCoder-1B,26,14.79,1.76,"1.72","2.08","2.22","SUCCESS"
+A100,TabbyML/StarCoder-1B,33,13.16,2.51,"1.57","2.58","11.52","FAILED"
+A100,TabbyML/StarCoder-1B,17,12.94,1.31,"1.29","1.46","1.53","SUCCESS"
+A100,TabbyML/StarCoder-1B,25,16.98,1.47,"1.43","1.69","1.86","SUCCESS"
+A100,TabbyML/StarCoder-1B,29,10.46,2.77,"2.8","3.11","3.18","FAILED"
+A100,TabbyML/StarCoder-1B,27,13.58,1.99,"1.96","2.37","2.42","FAILED"
+A100,TabbyML/StarCoder-1B,26,16.52,1.57,"1.47","1.96","2.19","SUCCESS"
+T4,TabbyML/DeepseekCoder-1.3B,7,3.82,1.83,"1.86","1.94","1.96","FAILED"
+T4,TabbyML/DeepseekCoder-1.3B,4,3.05,1.31,"1.32","1.45","1.49","SUCCESS"
+T4,TabbyML/DeepseekCoder-1.3B,5,3.18,1.57,"1.54","1.92","1.97","SUCCESS"
+T4,TabbyML/DeepseekCoder-1.3B,6,3.33,1.8,"1.84","1.92","1.95","FAILED"
+A10G,TabbyML/DeepseekCoder-1.3B,17,11.66,1.46,1.46,1.65,1.77,SUCCESS
+A10G,TabbyML/DeepseekCoder-1.3B,21,6.83,3.07,1.63,14.32,14.56,FAILED
+A10G,TabbyML/DeepseekCoder-1.3B,19,12.63,1.5,1.5,1.69,1.78,SUCCESS
+A10G,TabbyML/DeepseekCoder-1.3B,20,12.88,1.55,1.53,1.79,1.92,SUCCESS
+A100,TabbyML/DeepseekCoder-1.3B,33,20.78,1.59,1.55,1.84,1.9,SUCCESS
+A100,TabbyML/DeepseekCoder-1.3B,49,18.2,2.69,2.62,3.15,3.38,FAILED
+A100,TabbyML/DeepseekCoder-1.3B,41,16.44,2.49,2.36,2.9,3.49,FAILED
+A100,TabbyML/DeepseekCoder-1.3B,37,21.15,1.75,1.67,2.07,2.18,SUCCESS
+A100,TabbyML/DeepseekCoder-1.3B,39,14.78,2.64,2.6,3.02,3.13,FAILED
+A100,TabbyML/DeepseekCoder-1.3B,38,20.88,1.82,1.76,2.08,2.19,FAILED
+T4,TabbyML/StarCoder-3B,7,1.89,3.7,3.68,3.89,3.96,FAILED
+T4,TabbyML/StarCoder-3B,4,1.53,2.62,2.62,2.75,2.78,FAILED
+T4,TabbyML/StarCoder-3B,2,0.86,2.32,2.34,2.43,2.47,FAILED
+A10G,TabbyML/StarCoder-3B,17,5.42,3.14,3.16,3.47,3.84,FAILED
+A10G,TabbyML/StarCoder-3B,9,4.31,2.09,2.05,2.39,2.69,FAILED
+A10G,TabbyML/StarCoder-3B,5,2.85,1.75,1.73,1.95,2.28,SUCCESS
+A10G,TabbyML/StarCoder-3B,7,3.61,1.94,1.94,2.18,2.29,FAILED
+A10G,TabbyML/StarCoder-3B,6,3.17,1.89,1.91,2.04,2.06,FAILED
+A100,TabbyML/StarCoder-3B,33,8.12,4.07,4.12,4.53,4.61,FAILED
+A100,TabbyML/StarCoder-3B,17,7.9,2.15,2.1,2.34,2.81,FAILED
+A100,TabbyML/StarCoder-3B,9,4.47,2.01,1.99,2.19,2.26,FAILED
+A100,TabbyML/StarCoder-3B,5,3.21,1.56,1.56,1.68,1.72,SUCCESS
+A100,TabbyML/StarCoder-3B,7,4.29,1.63,1.61,1.78,1.8,SUCCESS
+A100,TabbyML/StarCoder-3B,8,4.63,1.73,1.73,1.92,2.03,SUCCESS
+A10G,TabbyML/DeepseekCoder-6.7B,5,1.3,3.85,3.83,4.25,4.31,FAILED
+A10G,TabbyML/DeepseekCoder-6.7B,3,1.14,2.63,2.6,2.81,2.86,FAILED
+A10G,TabbyML/DeepseekCoder-6.7B,2,0.83,2.4,2.4,2.48,2.5,FAILED
+A100,TabbyML/DeepseekCoder-6.7B,9,3.14,2.87,2.85,3.08,3.13,FAILED
+A100,TabbyML/DeepseekCoder-6.7B,5,2.08,2.4,2.46,2.58,2.63,FAILED
+A100,TabbyML/DeepseekCoder-6.7B,3,1.32,2.27,2.3,2.54,2.69,FAILED
+A100,TabbyML/DeepseekCoder-6.7B,2,1.2,1.67,1.66,1.84,1.93,SUCCESS
+A100,TabbyML/CodeLlama-7B,9,3.69,2.44,2.45,2.59,2.63,FAILED
+A100,TabbyML/CodeLlama-7B,5,2.14,2.34,2.31,2.61,3.26,FAILED
+A100,TabbyML/CodeLlama-7B,3,1.52,1.97,2.02,2.3,2.37,FAILED
+A100,TabbyML/CodeLlama-7B,5,2.37,2.11,2.13,2.24,2.26,FAILED
+A100,TabbyML/CodeLlama-7B,3,1.59,1.89,1.95,2.04,2.07,FAILED
+A100,TabbyML/CodeLlama-7B,2,1.45,1.38,1.39,1.54,1.56,SUCCESS
+A100,TabbyML/CodeLlama-13B,5,1.21,4.14,4.15,4.38,4.5,FAILED
+A100,TabbyML/CodeLlama-13B,3,0.89,3.36,3.4,3.71,3.73,FAILED
+A100,TabbyML/CodeLlama-13B,2,0.73,2.75,2.73,2.92,3.06,FAILED
diff --git a/python/tabby-loadtest/run.sh b/python/tabby-loadtest/run.sh
new file mode 100755
index 000000000000..9c5d38cd7bf4
--- /dev/null
+++ b/python/tabby-loadtest/run.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+
+record() {
+    echo $GPU_CONFIG,$MODEL_ID,$PARALLELISM,$1 >> record.csv
+}
+
+cleanup() {
+MODAL_APP_ID=$(modal app list | grep tabby-server-loadtest | grep deployed | awk '{print $2}')
+
+if [ -n "$MODAL_APP_ID" ]; then
+    modal app stop $MODAL_APP_ID
+fi
+}
+
+loadtest() {
+export GPU_CONFIG=$1
+export MODEL_ID=$2
+export PARALLELISM=$3
+
+>&2 modal deploy server.py
+
+export MODAL_PROCESS_ID=$!
+export TABBY_API_HOST=https://wsxiaoys--tabby-server-loadtest-app.modal.run
+
+# wait for warmup
+>&2 echo "Waiting for warmup..."
+
+
+n=0
+while [[ "$(curl -s -o /dev/null -w ''%{http_code}'' "$TABBY_API_HOST/v1/health")" != "200" ]]; do
+  if [ "$n" -ge 5 ]; then
+    # error after 5 retries.
+    cleanup; return 1
+  fi
+
+  sleep 10;
+  n=$((n+1))
+done
+
+>&2 echo "Start load testing..."
+
+>&2 k6 run loadtest.js
+SUCCESS=$?
+METRICS=$(cat metrics.txt)
+rm metrics.txt
+
+if [ $SUCCESS -ne 0 ]; then
+  record $METRICS,FAILED
+else
+  record $METRICS,SUCCESS
+fi
+
+cleanup
+
+return $SUCCESS
+}
+
+function dichotomic_search {
+  min=$1
+  max=$2
+  command=$3
+
+  while (( $min < $max )); do
+    # Compute the mean between min and max, rounded up to the superior unit
+    current=$(( (min + max + 1 ) / 2 ))
+
+    if $command $current
+    then min=$current
+    else max=$((current - 1))
+    fi
+  done
+}
+
+test_t4() {
+  loadtest T4 $MODEL_ID $1
+}
+
+test_a10g() {
+  loadtest A10G $MODEL_ID $1
+}
+
+test_a100() {
+  loadtest A100 $MODEL_ID $1
+}
+
+test_1b3b_model() {
+  export MODEL_ID="$1"
+
+  dichotomic_search 1 12 test_t4
+  dichotomic_search 1 32 test_a10g
+  dichotomic_search 1 64 test_a100
+}
+
+test_7b_model() {
+  export MODEL_ID="$1"
+
+  dichotomic_search 1 8 test_a100
+}
+
+test_13b_model() {
+  export MODEL_ID="$1"
+
+  dichotomic_search 1 8 test_a100
+}
+
+# test_7b_model TabbyML/CodeLlama-7B
+test_13b_model TabbyML/CodeLlama-13B
\ No newline at end of file
diff --git a/python/tabby-loadtest/server.py b/python/tabby-loadtest/server.py
new file mode 100644
index 000000000000..0455d471ea5e
--- /dev/null
+++ b/python/tabby-loadtest/server.py
@@ -0,0 +1,97 @@
+"""Usage:
+modal deploy server.py
+"""
+
+import os
+from modal import Image, Stub, asgi_app
+
+GPU_CONFIG = os.environ.get("GPU_CONFIG", "T4")
+IMAGE_NAME = "tabbyml/tabby:0.6.0"
+MODEL_ID = os.environ.get("MODEL_ID", "TabbyML/StarCoder-1B")
+PARALLELISM = os.environ.get("PARALLELISM", "4")
+
+
+def download_model():
+    import os
+    import subprocess
+
+    model_id = os.environ.get("MODEL_ID")
+    subprocess.run(
+        [
+            "/opt/tabby/bin/tabby",
+            "download",
+            "--model",
+            model_id,
+        ], check=True
+    )
+
+
+image = (
+    Image.from_registry(
+        IMAGE_NAME,
+        add_python="3.11",
+    )
+    .env({"MODEL_ID": MODEL_ID})
+    .dockerfile_commands("ENTRYPOINT []")
+    .run_function(download_model)
+    .pip_install("asgi-proxy-lib")
+    .env({"PARALLELISM": PARALLELISM})
+)
+
+stub = Stub("tabby-server-loadtest", image=image)
+
+
+@stub.function(
+    gpu=GPU_CONFIG,
+    allow_concurrent_inputs=int(PARALLELISM),
+    container_idle_timeout=120,
+    timeout=360,
+)
+@asgi_app()
+def app():
+    import os
+    import socket
+    import subprocess
+    import time
+    from asgi_proxy import asgi_proxy
+
+    model_id = os.environ.get("MODEL_ID")
+    parallelism = os.environ.get("PARALLELISM")
+
+    env = os.environ.copy()
+    env["TABBY_DISABLE_USAGE_COLLECTION"] = "1"
+
+    launcher = subprocess.Popen(
+        [
+            "/opt/tabby/bin/tabby",
+            "serve",
+            "--model",
+            model_id,
+            "--port",
+            "8000",
+            "--device",
+            "cuda",
+            "--parallelism",
+            parallelism,
+        ],
+        env=env
+    )
+
+    # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
+    def tabby_ready():
+        try:
+            socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
+            return True
+        except (socket.timeout, ConnectionRefusedError):
+            # Check if launcher webserving process has exited.
+            # If so, a connection can never be made.
+            retcode = launcher.poll()
+            if retcode is not None:
+                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
+            return False
+
+    while not tabby_ready():
+        time.sleep(1.0)
+
+    print("Tabby server ready!")
+    return asgi_proxy("http://localhost:8000")
diff --git a/tests/default.loadtest.js b/tests/default.loadtest.js
deleted file mode 100644
index 415d2ebf7b2e..000000000000
--- a/tests/default.loadtest.js
+++ /dev/null
@@ -1,30 +0,0 @@
-import http from "k6/http";
-import { check, group, sleep, abortTest } from "k6";
-
-export const options = {
-  stages: [
-    { duration: "5s", target: 8 },
-    { duration: "20s", target: 8 },
-    { duration: "5s", target: 0 },
-  ],
-  // Below thresholds are tested against TabbyML/StarCoder-1B served by NVIDIA T4 GPU.
-  thresholds: {
-    http_req_failed: ['rate<0.01'], // http errors should be less than 1%
-    http_req_duration: ["med<1800", "avg<1800", "p(95)<2000"],
-  },
-};
-
-export default () => {
-  const payload = JSON.stringify({
-    language: "python",
-    segments: {
-      prefix: "def binarySearch(arr, left, right, x):\n    mid = (left +"
-    },
-  });
-  const headers = { "Content-Type": "application/json" };
-  const res = http.post(`${__ENV.TABBY_API_HOST}/v1/completions`, payload, {
-    headers,
-  });
-  check(res, { success: (r) => r.status === 200 });
-  sleep(0.5);
-};