Commit 1206730

Update inference examples to manual input URLs, update flags for model launch script, update vllm version to 0.4.0 to support logits

XkunW committed Apr 9, 2024
1 parent 14a1985  commit 1206730

Showing 7 changed files with 1,281 additions and 1,266 deletions.
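Since the commit message cites vLLM 0.4.0 for logits support, here is a minimal sketch of requesting token log-probabilities through the server's OpenAI-compatible completions endpoint. The gpuXXX:XXXXX placeholder and model path follow the updated examples below; the logprobs field assumes standard OpenAI completions semantics.

    from openai import OpenAI

    client = OpenAI(base_url="http://gpuXXX:XXXXX/v1", api_key="EMPTY")

    completion = client.completions.create(
        model="/model-weights/Llama-2-70b-hf",
        prompt="Where is the capital of Canada?",
        max_tokens=20,
        logprobs=5,  # request log-probabilities for the 5 most likely tokens at each position
    )

    # One dict per generated token, mapping candidate tokens to their log-probabilities
    print(completion.choices[0].logprobs.top_logprobs)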
2 changes: 1 addition & 1 deletion .gitignore
@@ -142,6 +142,6 @@ dmypy.json
 *.err

 # Server url files
-.vllm_api_base_url
+.vllm*

 logs/
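The broadened glob covers every per-model server URL file that the updated launch scripts write, for example:

    .vllm_llama2-7b_url
    .vllm_mixtral_url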
11 changes: 3 additions & 8 deletions examples/inference.py
@@ -1,15 +1,10 @@
-import os
 from openai import OpenAI

-model_type = "llama2"
-vec_inf_dir = os.path.dirname(os.getcwd())
-with open(f"{vec_inf_dir}/models/{model_type}/.vllm_api_base_url", "r") as f:
-    base_url = f.read()
-
-client = OpenAI(base_url=base_url, api_key="EMPTY")
+# The url is located in the .vllm_model-variant_url file in the corresponding model directory.
+client = OpenAI(base_url="http://gpuXXX:XXXXX/v1", api_key="EMPTY")

 completion = client.completions.create(
-    model="/model-weights/Llama-2-7b-hf",
+    model="/model-weights/Llama-2-70b-hf",
     prompt="Where is the capital of Canada?",
     max_tokens=20,
 )
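The diff does not show how completion is used further down the file; a minimal usage line, assuming the standard OpenAI v1 response shape:

    # Print the generated text from the first (and only) choice
    print(completion.choices[0].text)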
11 changes: 3 additions & 8 deletions examples/inference.sh
@@ -1,15 +1,10 @@
-# Did you modify this line in openai_entrypoint.sh?
-# If so, modify it here accordingly.
-model_type="llama2"
-top_directory=$(dirname $(dirname $(realpath "$0")))
-VLLM_BASE_URL_FILENAME=${top_directory}/models/${model_type}/.vllm_api_base_url
-
-API_BASE_URL=$(cat ${VLLM_BASE_URL_FILENAME})
+# The url is located in the .vllm_model-variant_url file in the corresponding model directory.
+export API_BASE_URL=http://gpuXXX:XXXXX/v1

 curl ${API_BASE_URL}/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "/model-weights/Llama-2-7b-hf",
+    "model": "/model-weights/Llama-2-13b-hf",
     "prompt": "What is the capital of Canada?",
     "max_tokens": 20
   }'
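To avoid hard-coding the address, the URL can also be read from the .vllm_<model-name>-<model-variant>_url file that the launch scripts below write next to themselves; a sketch assuming the llama2 defaults (MODEL_NAME=llama2, MODEL_VARIANT=7b) and a working directory at the repository root:

    export API_BASE_URL=$(cat models/llama2/.vllm_llama2-7b_url)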
14 changes: 11 additions & 3 deletions models/llama2/launch_server.sh
@@ -6,7 +6,7 @@
 # SLURM job and are written to the file specified at VLLM_BASE_URL_FILENAME
 export MODEL_NAME="llama2"
 export MODEL_VARIANT="7b"
-export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_api_base_url"
+export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_${MODEL_NAME}-${MODEL_VARIANT}_url"

 # Variables specific to your working environment, below are examples for the Vector cluster
 export VENV_BASE=/projects/aieng/public/mixtral_vllm_env
@@ -21,42 +21,50 @@ export QOS="m3"

 # ======================================= Optional Settings ========================================

-while getopts p:n:q:t:e:v flag
-do
+while getopts "p:n:q:t:e:v:" flag; do
     case "${flag}" in
         p) partition=${OPTARG};;
         n) num_gpus=${OPTARG};;
         q) qos=${OPTARG};;
         t) data_type=${OPTARG};;
         e) virtual_env=${OPTARG};;
         v) model_variant=${OPTARG};;
+        *) echo "Invalid option: $flag" ;;
     esac
 done

 if [ -n "$partition" ]; then
     export JOB_PARTITION=$partition
+    echo "Partition set to: ${JOB_PARTITION}"
 fi

 if [ -n "$num_gpus" ]; then
     export NUM_GPUS=$num_gpus
+    echo "Number of GPUs set to: ${NUM_GPUS}"
 fi

 if [ -n "$qos" ]; then
     export QOS=$qos
+    echo "QOS set to: ${QOS}"
 fi

 if [ -n "$data_type" ]; then
     export VLLM_DATA_TYPE=$data_type
+    echo "Data type set to: ${VLLM_DATA_TYPE}"
 fi

 if [ -n "$virtual_env" ]; then
     export VENV_BASE=$virtual_env
+    echo "Virtual environment set to: ${VENV_BASE}"
 fi

 if [ -n "$model_variant" ]; then
     export MODEL_VARIANT=$model_variant
+    echo "Model variant set to: ${MODEL_VARIANT}"
+
     export VLLM_MODEL_WEIGHTS=/model-weights/Llama-2-${MODEL_VARIANT}-hf
     export JOB_NAME="vllm/${MODEL_NAME}-${MODEL_VARIANT}"
+    export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_${MODEL_NAME}-${MODEL_VARIANT}_url"
 fi

 # Set data type to fp16 instead of bf16 for non-Ampere GPUs
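In getopts, a trailing colon marks an option as taking an argument, so the quoted string "p:n:q:t:e:v:" lets every flag, including -v for the model variant, receive a value through ${OPTARG}. An illustrative invocation with placeholder values (choose a partition, GPU count, and variant valid on your cluster):

    bash models/llama2/launch_server.sh -p a40 -n 2 -v 13b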
11 changes: 8 additions & 3 deletions models/mixtral/launch_server.sh
@@ -5,7 +5,7 @@
 # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
 # SLURM job and are written to the file specified at VLLM_BASE_URL_FILENAME
 export MODEL_NAME="mixtral"
-export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_api_base_url"
+export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_mixtral_url"

 # Variables specific to your working environment, below are examples for the Vector cluster
 export VENV_BASE=/projects/aieng/public/mixtral_vllm_env
@@ -20,35 +20,40 @@ export QOS="m3"

 # ======================================= Optional Settings ========================================

-while getopts p:n:q:t:e flag
-do
+while getopts "p:n:q:t:e:v:" flag; do
     case "${flag}" in
         p) partition=${OPTARG};;
         n) num_gpus=${OPTARG};;
         q) qos=${OPTARG};;
         t) data_type=${OPTARG};;
         e) virtual_env=${OPTARG};;
+        *) echo "Invalid option: $flag" ;;
     esac
 done

 if [ -n "$partition" ]; then
     export JOB_PARTITION=$partition
+    echo "Partition set to: ${JOB_PARTITION}"
 fi

 if [ -n "$num_gpus" ]; then
     export NUM_GPUS=$num_gpus
+    echo "Number of GPUs set to: ${NUM_GPUS}"
 fi

 if [ -n "$qos" ]; then
     export QOS=$qos
+    echo "QOS set to: ${QOS}"
 fi

 if [ -n "$data_type" ]; then
     export VLLM_DATA_TYPE=$data_type
+    echo "Data type set to: ${VLLM_DATA_TYPE}"
 fi

 if [ -n "$virtual_env" ]; then
     export VENV_BASE=$virtual_env
+    echo "Virtual environment set to: ${VENV_BASE}"
 fi

 # Set data type to fp16 instead of bf16 for non-Ampere GPUs