Commit 1206730

Update inference examples to manual input URLs, update flags for model launch script, update vllm version to 0.4.0 to support logits

XkunW committed Apr 9, 2024
1 parent 14a1985  commit 1206730

Showing 7 changed files with 1,281 additions and 1,266 deletions.
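Since the commit message cites vLLM 0.4.0 for logits support, here is a minimal sketch of requesting token log-probabilities through the server's OpenAI-compatible completions endpoint. The gpuXXX:XXXXX placeholder and model path follow the updated examples below; the logprobs field assumes standard OpenAI completions semantics.

    from openai import OpenAI

    client = OpenAI(base_url="http://gpuXXX:XXXXX/v1", api_key="EMPTY")

    completion = client.completions.create(
        model="/model-weights/Llama-2-70b-hf",
        prompt="Where is the capital of Canada?",
        max_tokens=20,
        logprobs=5,  # request log-probabilities for the 5 most likely tokens at each position
    )

    # One dict per generated token, mapping candidate tokens to their log-probabilities
    print(completion.choices[0].logprobs.top_logprobs)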
2 changes: 1 addition & 1 deletion .gitignore
@@ -142,6 +142,6 @@ dmypy.json
 *.err

 # Server url files
-.vllm_api_base_url
+.vllm*

 logs/
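The broadened glob covers every per-model server URL file that the updated launch scripts write, for example:

    .vllm_llama2-7b_url
    .vllm_mixtral_url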
11 changes: 3 additions & 8 deletions examples/inference.py
@@ -1,15 +1,10 @@
-import os
 from openai import OpenAI

-model_type = "llama2"
-vec_inf_dir = os.path.dirname(os.getcwd())
-with open(f"{vec_inf_dir}/models/{model_type}/.vllm_api_base_url", "r") as f:
-    base_url = f.read()
-
-client = OpenAI(base_url=base_url, api_key="EMPTY")
+# The url is located in the .vllm_model-variant_url file in the corresponding model directory.
+client = OpenAI(base_url="http://gpuXXX:XXXXX/v1", api_key="EMPTY")

 completion = client.completions.create(
-    model="/model-weights/Llama-2-7b-hf",
+    model="/model-weights/Llama-2-70b-hf",
     prompt="Where is the capital of Canada?",
     max_tokens=20,
 )
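The diff does not show how completion is used further down the file; a minimal usage line, assuming the standard OpenAI v1 response shape:

    # Print the generated text from the first (and only) choice
    print(completion.choices[0].text)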
11 changes: 3 additions & 8 deletions examples/inference.sh
@@ -1,15 +1,10 @@
-# Did you modify this line in openai_entrypoint.sh?
-# If so, modify it here accordingly.
-model_type="llama2"
-top_directory=$(dirname $(dirname $(realpath "$0")))
-VLLM_BASE_URL_FILENAME=${top_directory}/models/${model_type}/.vllm_api_base_url
-
-API_BASE_URL=$(cat ${VLLM_BASE_URL_FILENAME})
+# The url is located in the .vllm_model-variant_url file in the corresponding model directory.
+export API_BASE_URL=http://gpuXXX:XXXXX/v1

 curl ${API_BASE_URL}/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "/model-weights/Llama-2-7b-hf",
+    "model": "/model-weights/Llama-2-13b-hf",
     "prompt": "What is the capital of Canada?",
     "max_tokens": 20
   }'
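To avoid hard-coding the address, the URL can also be read from the .vllm_<model-name>-<model-variant>_url file that the launch scripts below write next to themselves; a sketch assuming the llama2 defaults (MODEL_NAME=llama2, MODEL_VARIANT=7b) and a working directory at the repository root:

    export API_BASE_URL=$(cat models/llama2/.vllm_llama2-7b_url)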
14 changes: 11 additions & 3 deletions models/llama2/launch_server.sh
@@ -6,7 +6,7 @@
 # SLURM job and are written to the file specified at VLLM_BASE_URL_FILENAME
 export MODEL_NAME="llama2"
 export MODEL_VARIANT="7b"
-export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_api_base_url"
+export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_${MODEL_NAME}-${MODEL_VARIANT}_url"

 # Variables specific to your working environment, below are examples for the Vector cluster
 export VENV_BASE=/projects/aieng/public/mixtral_vllm_env
@@ -21,42 +21,50 @@ export QOS="m3"

 # ======================================= Optional Settings ========================================

-while getopts p:n:q:t:e:v flag
-do
+while getopts "p:n:q:t:e:v:" flag; do
     case "${flag}" in
         p) partition=${OPTARG};;
         n) num_gpus=${OPTARG};;
         q) qos=${OPTARG};;
         t) data_type=${OPTARG};;
         e) virtual_env=${OPTARG};;
         v) model_variant=${OPTARG};;
+        *) echo "Invalid option: $flag" ;;
     esac
 done

 if [ -n "$partition" ]; then
     export JOB_PARTITION=$partition
+    echo "Partition set to: ${JOB_PARTITION}"
 fi

 if [ -n "$num_gpus" ]; then
     export NUM_GPUS=$num_gpus
+    echo "Number of GPUs set to: ${NUM_GPUS}"
 fi

 if [ -n "$qos" ]; then
     export QOS=$qos
+    echo "QOS set to: ${QOS}"
 fi

 if [ -n "$data_type" ]; then
     export VLLM_DATA_TYPE=$data_type
+    echo "Data type set to: ${VLLM_DATA_TYPE}"
 fi

 if [ -n "$virtual_env" ]; then
     export VENV_BASE=$virtual_env
+    echo "Virtual environment set to: ${VENV_BASE}"
 fi

 if [ -n "$model_variant" ]; then
     export MODEL_VARIANT=$model_variant
+    echo "Model variant set to: ${MODEL_VARIANT}"
+
     export VLLM_MODEL_WEIGHTS=/model-weights/Llama-2-${MODEL_VARIANT}-hf
     export JOB_NAME="vllm/${MODEL_NAME}-${MODEL_VARIANT}"
+    export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_${MODEL_NAME}-${MODEL_VARIANT}_url"
 fi

 # Set data type to fp16 instead of bf16 for non-Ampere GPUs
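In getopts, a trailing colon marks an option as taking an argument, so the quoted string "p:n:q:t:e:v:" lets every flag, including -v for the model variant, receive a value through ${OPTARG}. An illustrative invocation with placeholder values (choose a partition, GPU count, and variant valid on your cluster):

    bash models/llama2/launch_server.sh -p a40 -n 2 -v 13b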
11 changes: 8 additions & 3 deletions models/mixtral/launch_server.sh
@@ -5,7 +5,7 @@
 # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
 # SLURM job and are written to the file specified at VLLM_BASE_URL_FILENAME
 export MODEL_NAME="mixtral"
-export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_api_base_url"
+export VLLM_BASE_URL_FILENAME="$(dirname $(realpath "$0"))/.vllm_mixtral_url"

 # Variables specific to your working environment, below are examples for the Vector cluster
 export VENV_BASE=/projects/aieng/public/mixtral_vllm_env
@@ -20,35 +20,40 @@ export QOS="m3"

 # ======================================= Optional Settings ========================================

-while getopts p:n:q:t:e flag
-do
+while getopts "p:n:q:t:e:v:" flag; do
     case "${flag}" in
         p) partition=${OPTARG};;
         n) num_gpus=${OPTARG};;
         q) qos=${OPTARG};;
         t) data_type=${OPTARG};;
         e) virtual_env=${OPTARG};;
+        *) echo "Invalid option: $flag" ;;
     esac
 done

 if [ -n "$partition" ]; then
     export JOB_PARTITION=$partition
+    echo "Partition set to: ${JOB_PARTITION}"
 fi

 if [ -n "$num_gpus" ]; then
     export NUM_GPUS=$num_gpus
+    echo "Number of GPUs set to: ${NUM_GPUS}"
 fi

 if [ -n "$qos" ]; then
     export QOS=$qos
+    echo "QOS set to: ${QOS}"
 fi

 if [ -n "$data_type" ]; then
     export VLLM_DATA_TYPE=$data_type
+    echo "Data type set to: ${VLLM_DATA_TYPE}"
 fi

 if [ -n "$virtual_env" ]; then
     export VENV_BASE=$virtual_env
+    echo "Virtual environment set to: ${VENV_BASE}"
 fi

 # Set data type to fp16 instead of bf16 for non-Ampere GPUs