-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
279 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import http from "k6/http"; | ||
import { check, sleep } from "k6"; | ||
import { textSummary } from "https://jslib.k6.io/k6-utils/1.4.0/index.js"; | ||
|
||
// Number of virtual users to sustain, taken from the PARALLELISM environment
// variable. Parsed as base 10 and validated up front so that a missing or
// malformed value aborts the run instead of propagating NaN into the stage
// targets and the QPS figure.
const PARALLELISM = Number.parseInt(__ENV.PARALLELISM, 10);
if (!Number.isInteger(PARALLELISM) || PARALLELISM <= 0) {
  throw new Error(
    `PARALLELISM must be a positive integer, got "${__ENV.PARALLELISM}"`
  );
}
|
||
// Load profile: reach PARALLELISM virtual users within one second, then hold
// that level for thirty seconds.
const rampUpStage = { duration: "1s", target: PARALLELISM };
const steadyStage = { duration: "30s", target: PARALLELISM };

export const options = {
  stages: [rampUpStage, steadyStage],
  // These thresholds were tuned against TabbyML/StarCoder-1B served by an
  // NVIDIA T4 GPU.
  thresholds: {
    http_req_failed: ["rate<0.001"],
    http_req_duration: ["med<1800", "avg<1800", "p(90)<2500", "p(95)<3000"],
  },
};
|
||
/**
 * One virtual-user iteration: POST a fixed Python completion request to
 * `${TABBY_API_HOST}/v1/completions`, record whether the response status was
 * 200, then pause for half a second before the next iteration.
 */
export default () => {
  const url = `${__ENV.TABBY_API_HOST}/v1/completions`;
  const body = JSON.stringify({
    language: "python",
    segments: {
      prefix: "def binarySearch(arr, left, right, x):\n mid = (left +",
    },
  });
  const params = { headers: { "Content-Type": "application/json" } };

  const response = http.post(url, body, params);
  check(response, { success: (r) => r.status === 200 });
  sleep(0.5);
};
|
||
/**
 * k6 end-of-test hook.
 *
 * Writes one comma-separated line to `metrics.txt` in the order
 * `qps,avg,med,p90,p95` (latencies divided by 1000 and rounded to two
 * decimals), and also prints the regular text summary to stdout. Exporting
 * `handleSummary` replaces k6's built-in summary, so without the `stdout`
 * entry the run would finish without showing any results.
 *
 * @param {object} data - Summary object supplied by k6.
 * @returns {object} Map of output destination to content.
 */
export function handleSummary(data) {
  const {
    avg,
    med,
    "p(90)": p90,
    "p(95)": p95,
  } = data.metrics.http_req_duration.values;

  const avg_latency = avg / 1000;
  const med_latency = med / 1000;
  const p90_latency = p90 / 1000;
  const p95_latency = p95 / 1000;

  // Estimate only: concurrent users divided by average request latency.
  // It does not account for the pause each iteration takes between requests.
  const qps = PARALLELISM / avg_latency;

  const row = [qps, avg_latency, med_latency, p90_latency, p95_latency]
    .map((value) => rounded(value))
    .join(",");

  return {
    stdout: textSummary(data, { indent: " ", enableColors: false }),
    "metrics.txt": row,
  };
}
|
||
/**
 * Round a number to two decimal places.
 *
 * @param {number} x - Value to round.
 * @returns {number} `x` scaled by 100, rounded with Math.round, scaled back.
 */
function rounded(x) {
  const hundredths = Math.round(x * 100);
  return hundredths / 100;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
GPU,Model,Parallelism,QPS,Latency (Avg)," Latency (Med)"," Latency (p90)"," Latency (p95)"," Passed" | ||
T4,TabbyML/StarCoder-1B,7,4.14,1.69,"1.58","2","2.05","SUCCESS" | ||
T4,TabbyML/StarCoder-1B,10,4.85,2.06,"1.98","2.49","2.63","FAILED" | ||
T4,TabbyML/StarCoder-1B,8,4.22,1.9,"1.85","2.29","2.38","FAILED" | ||
A10G,TabbyML/StarCoder-1B,17,12.01,1.42,"1.35","1.66","1.9","SUCCESS" | ||
A10G,TabbyML/StarCoder-1B,25,14.61,1.71,"1.68","1.97","2.07","SUCCESS" | ||
A10G,TabbyML/StarCoder-1B,29,15.94,1.82,"1.79","2.11","2.17","FAILED" | ||
A10G,TabbyML/StarCoder-1B,27,14.89,1.81,"1.79","2.03","2.37","FAILED" | ||
A10G,TabbyML/StarCoder-1B,26,14.79,1.76,"1.72","2.08","2.22","SUCCESS" | ||
A100,TabbyML/StarCoder-1B,33,13.16,2.51,"1.57","2.58","11.52","FAILED" | ||
A100,TabbyML/StarCoder-1B,17,12.94,1.31,"1.29","1.46","1.53","SUCCESS" | ||
A100,TabbyML/StarCoder-1B,25,16.98,1.47,"1.43","1.69","1.86","SUCCESS" | ||
A100,TabbyML/StarCoder-1B,29,10.46,2.77,"2.8","3.11","3.18","FAILED" | ||
A100,TabbyML/StarCoder-1B,27,13.58,1.99,"1.96","2.37","2.42","FAILED" | ||
A100,TabbyML/StarCoder-1B,26,16.52,1.57,"1.47","1.96","2.19","SUCCESS" | ||
T4,TabbyML/DeepseekCoder-1.3B,7,3.82,1.83,"1.86","1.94","1.96","FAILED" | ||
T4,TabbyML/DeepseekCoder-1.3B,4,3.05,1.31,"1.32","1.45","1.49","SUCCESS" | ||
T4,TabbyML/DeepseekCoder-1.3B,5,3.18,1.57,"1.54","1.92","1.97","SUCCESS" | ||
T4,TabbyML/DeepseekCoder-1.3B,6,3.33,1.8,"1.84","1.92","1.95","FAILED" | ||
A10G,TabbyML/DeepseekCoder-1.3B,17,11.66,1.46,1.46,1.65,1.77,SUCCESS | ||
A10G,TabbyML/DeepseekCoder-1.3B,21,6.83,3.07,1.63,14.32,14.56,FAILED | ||
A10G,TabbyML/DeepseekCoder-1.3B,19,12.63,1.5,1.5,1.69,1.78,SUCCESS | ||
A10G,TabbyML/DeepseekCoder-1.3B,20,12.88,1.55,1.53,1.79,1.92,SUCCESS | ||
A100,TabbyML/DeepseekCoder-1.3B,33,20.78,1.59,1.55,1.84,1.9,SUCCESS | ||
A100,TabbyML/DeepseekCoder-1.3B,49,18.2,2.69,2.62,3.15,3.38,FAILED | ||
A100,TabbyML/DeepseekCoder-1.3B,41,16.44,2.49,2.36,2.9,3.49,FAILED | ||
A100,TabbyML/DeepseekCoder-1.3B,37,21.15,1.75,1.67,2.07,2.18,SUCCESS | ||
A100,TabbyML/DeepseekCoder-1.3B,39,14.78,2.64,2.6,3.02,3.13,FAILED | ||
A100,TabbyML/DeepseekCoder-1.3B,38,20.88,1.82,1.76,2.08,2.19,FAILED | ||
T4,TabbyML/StarCoder-3B,7,1.89,3.7,3.68,3.89,3.96,FAILED | ||
T4,TabbyML/StarCoder-3B,4,1.53,2.62,2.62,2.75,2.78,FAILED | ||
T4,TabbyML/StarCoder-3B,2,0.86,2.32,2.34,2.43,2.47,FAILED | ||
A10G,TabbyML/StarCoder-3B,17,5.42,3.14,3.16,3.47,3.84,FAILED | ||
A10G,TabbyML/StarCoder-3B,9,4.31,2.09,2.05,2.39,2.69,FAILED | ||
A10G,TabbyML/StarCoder-3B,5,2.85,1.75,1.73,1.95,2.28,SUCCESS | ||
A10G,TabbyML/StarCoder-3B,7,3.61,1.94,1.94,2.18,2.29,FAILED | ||
A10G,TabbyML/StarCoder-3B,6,3.17,1.89,1.91,2.04,2.06,FAILED | ||
A100,TabbyML/StarCoder-3B,33,8.12,4.07,4.12,4.53,4.61,FAILED | ||
A100,TabbyML/StarCoder-3B,17,7.9,2.15,2.1,2.34,2.81,FAILED | ||
A100,TabbyML/StarCoder-3B,9,4.47,2.01,1.99,2.19,2.26,FAILED |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#!/bin/bash | ||
|
||
# Append one row to record.csv in the current directory.
#   $1 - comma-separated tail of the row (metrics followed by the status)
# The leading columns come from the GPU_CONFIG, MODEL_ID and PARALLELISM
# variables. The row is quoted so that whitespace or glob characters in any
# field are written verbatim instead of being split or expanded by the shell.
record() {
  printf '%s\n' "$GPU_CONFIG,$MODEL_ID,$PARALLELISM,$1" >> record.csv
}
|
||
# Stop the deployed tabby-server-loadtest Modal app, if one exists.
# The app id is taken from the second whitespace-separated column of the
# matching `modal app list` rows and is also left in MODAL_APP_ID.
cleanup() {
  MODAL_APP_ID=$(modal app list | grep tabby-server-loadtest | grep deployed | awk '{print $2}')

  # Only call `modal app stop` when an id was actually found (the previous
  # `-z` test did the opposite). Iterating also copes with several matches.
  local app_id
  for app_id in $MODAL_APP_ID; do
    modal app stop "$app_id"
  done
}
|
||
# Deploy the server with the requested configuration, wait until its health
# endpoint answers 200, run the k6 load test and append the outcome to
# record.csv.
#   $1 - GPU type     (exported as GPU_CONFIG)
#   $2 - model id     (exported as MODEL_ID)
#   $3 - parallelism  (exported as PARALLELISM)
# Returns the exit status of k6, or 1 when the deployment fails or the server
# never becomes healthy.
loadtest() {
  export GPU_CONFIG=$1
  export MODEL_ID=$2
  export PARALLELISM=$3

  # Abort early if the deployment itself fails; otherwise the health check
  # below could succeed against a stale deployment with a different config.
  if ! modal deploy server.py >&2; then
    >&2 echo "modal deploy failed"
    return 1
  fi

  # NOTE(review): `modal deploy` runs in the foreground, so $! does not refer
  # to it. Nothing in this script reads MODAL_PROCESS_ID; kept for compatibility.
  export MODAL_PROCESS_ID=$!
  export TABBY_API_HOST=https://wsxiaoys--tabby-server-loadtest-app.modal.run

  # wait for warmup
  >&2 echo "Waiting for warmup..."

  local n=0
  while [[ "$(curl -s -o /dev/null -w '%{http_code}' "$TABBY_API_HOST/v1/health")" != "200" ]]; do
    if [ "$n" -ge 5 ]; then
      # error after 5 retries; stop the app so it is not left running.
      >&2 echo "Server did not become healthy, giving up."
      cleanup
      return 1
    fi

    sleep 10
    n=$((n+1))
  done

  >&2 echo "Start load testing..."

  k6 run loadtest.js >&2
  local k6_status=$?

  # metrics.txt is written by the k6 summary hook; it may be absent if k6
  # aborted before producing a summary.
  local metrics=""
  if [ -f metrics.txt ]; then
    metrics=$(cat metrics.txt)
    rm -f metrics.txt
  fi

  if [ "$k6_status" -ne 0 ]; then
    record "$metrics,FAILED"
  else
    record "$metrics,SUCCESS"
  fi

  cleanup

  return "$k6_status"
}
|
||
# Binary search over the integer range [$1, $2] for the largest value that a
# command accepts.
#   $1 - lower bound (never probed itself)
#   $2 - upper bound
#   $3 - command to run with the candidate value appended; exit 0 = accepted
# Nothing is printed; the bounds are tracked in the global variables `min`
# and `max`, which are equal when the function returns.
dichotomic_search() {
  min=$1
  max=$2
  command=$3

  while (( $min < $max )); do
    # Midpoint rounded up, so an accepted candidate always moves `min` forward.
    current=$(( (min + max + 1) / 2 ))

    if ! $command $current; then
      max=$((current - 1))
    else
      min=$current
    fi
  done
}
|
||
# Per-GPU wrappers around loadtest, shaped for dichotomic_search: each takes
# the candidate parallelism as $1 and uses the model in MODEL_ID.
# Arguments are quoted so an unusual model id or empty value is passed through
# as a single argument instead of being split or dropped.
test_t4() {
  loadtest T4 "$MODEL_ID" "$1"
}

test_a10g() {
  loadtest A10G "$MODEL_ID" "$1"
}

test_a100() {
  loadtest A100 "$MODEL_ID" "$1"
}
|
||
# Run the parallelism search for one model on every GPU type.
#   $1 - model id (exported as MODEL_ID for the per-GPU wrappers)
# Search ranges: T4 1..12, A10G 1..32, A100 1..64.
test_model() {
  export MODEL_ID="$1"

  local -a limits=(12 32 64)
  local -a runners=(test_t4 test_a10g test_a100)
  local i
  for i in 0 1 2; do
    dichotomic_search 1 "${limits[i]}" "${runners[i]}"
  done
}
|
||
# Model to benchmark: first script argument, defaulting to the previously
# hard-coded TabbyML/StarCoder-3B.
test_model "${1:-TabbyML/StarCoder-3B}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
"""Usage: | ||
modal serve server.py | ||
""" | ||
|
||
import os | ||
from modal import Image, Stub, asgi_app, gpu | ||
|
||
# Deployment settings. Everything except the image tag can be overridden
# through an environment variable of the same name; values stay strings.
IMAGE_NAME = "tabbyml/tabby:0.6.0"
GPU_CONFIG = os.getenv("GPU_CONFIG", "T4")
MODEL_ID = os.getenv("MODEL_ID", "TabbyML/StarCoder-1B")
PARALLELISM = os.getenv("PARALLELISM", "4")
|
||
|
||
def download_model():
    """Download the model named by the MODEL_ID environment variable.

    Runs ``/opt/tabby/bin/tabby download --model <MODEL_ID>``.

    Raises:
        subprocess.CalledProcessError: if the download command exits with a
            non-zero status, so a failed download is reported instead of
            being silently ignored.
    """
    import os
    import subprocess

    model_id = os.environ.get("MODEL_ID")
    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            model_id,
        ],
        # Fail loudly on a non-zero exit status.
        check=True,
    )
|
||
|
||
# Container image, built in this order:
#   1. start from the Tabby registry image and add Python 3.11,
#   2. expose MODEL_ID to the build,
#   3. clear the image's ENTRYPOINT,
#   4. run download_model during the build,
#   5. install the ASGI proxy library,
#   6. set PARALLELISM last.
# NOTE(review): PARALLELISM is presumably set after the download step so that
# changing it does not invalidate the model layer -- confirm.
_base_image = Image.from_registry(IMAGE_NAME, add_python="3.11")

image = (
    _base_image
    .env({"MODEL_ID": MODEL_ID})
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .pip_install("asgi-proxy-lib")
    .env({"PARALLELISM": PARALLELISM})
)

# Modal app definition; functions registered on it run in the image above.
stub = Stub("tabby-server-loadtest", image=image)
|
||
|
||
@stub.function(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=int(PARALLELISM),
    container_idle_timeout=120,
    timeout=360,
)
@asgi_app()
def app():
    """Start ``tabby serve`` in the container and proxy ASGI traffic to it.

    The tabby process is launched on port 8000 with the model and parallelism
    read from the MODEL_ID and PARALLELISM environment variables. The function
    then polls once per second until 127.0.0.1:8000 accepts a TCP connection.

    Returns:
        The result of ``asgi_proxy("http://localhost:8000")``.

    Raises:
        RuntimeError: if the tabby process exits before the port opens.
    """
    import os
    import socket
    import subprocess
    import time
    from asgi_proxy import asgi_proxy

    serve_cmd = [
        "/opt/tabby/bin/tabby",
        "serve",
        "--model",
        os.environ.get("MODEL_ID"),
        "--port",
        "8000",
        "--device",
        "cuda",
        "--parallelism",
        os.environ.get("PARALLELISM"),
    ]
    # Same environment as this process, with usage collection switched off.
    child_env = {**os.environ, "TABBY_DISABLE_USAGE_COLLECTION": "1"}
    launcher = subprocess.Popen(serve_cmd, env=child_env)

    def _port_is_open():
        """Return True once 127.0.0.1:8000 accepts a connection."""
        try:
            probe = socket.create_connection(("127.0.0.1", 8000), timeout=1)
            probe.close()
        except (socket.timeout, ConnectionRefusedError):
            # If the server process has already exited, the port will never
            # open, so stop waiting and surface the exit code.
            retcode = launcher.poll()
            if retcode is not None:
                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
            return False
        else:
            return True

    # Block until the web server is reachable before handling any input.
    while True:
        if _port_is_open():
            break
        time.sleep(1.0)

    print("Tabby server ready!")
    return asgi_proxy("http://localhost:8000")