From 7d16b5be1470cf034a5ad780082033891ff3a749 Mon Sep 17 00:00:00 2001
From: "pierre.delaunay"
Date: Wed, 4 Oct 2023 11:00:19 -0400
Subject: [PATCH] Tweaks

---
 docs/examples/llm/client.py           |  8 +++++---
 docs/examples/llm/inference_server.sh | 22 +++++++++++++---------
 docs/examples/llm/vllm.rst            | 20 ++++++++++++--------
 3 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py
index 86e3ce7e..756761ae 100644
--- a/docs/examples/llm/client.py
+++ b/docs/examples/llm/client.py
@@ -1,3 +1,5 @@
+import subprocess
+
 import openai
 
 
@@ -33,8 +35,8 @@ def get_job_comment(name="inference_server.sh"):
 
 # profit
 completion = openai.Completion.create(
-    model=server['model'],
-    prompt=args.prompt
+    model=server['model'],
+    prompt="What is the square root of 25 ?"
 )
 
-print(completion)
\ No newline at end of file
+print(completion)
diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh
index acf2a857..fdc1f69c 100644
--- a/docs/examples/llm/inference_server.sh
+++ b/docs/examples/llm/inference_server.sh
@@ -16,21 +16,22 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem=32G
 
 
-usage() {
-  echo "Usage: $0 [-m] [-p]
+function usage() {
+  echo "Usage: $0 [-m] [-p]"
   echo "  -h              Display this help message."
   echo "  -m MODEL        Specify a file to process."
   echo "  -p PATH         Specify a directory to work in."
+  echo "  -e ENV          Specify the conda environment to use."
   echo "  ARGUMENT        Any additional argument you want to process."
   exit 1
 }
 
 MODEL=""
-PATH=""
+MODEL_PATH=""
 ENV="./env"
 
-while getopts ":hf:d:" opt; do
+while getopts ":hm:p:e:" opt; do
   case $opt in
     h)
       usage
@@ -39,7 +40,7 @@ while getopts ":hf:d:" opt; do
       MODEL="$OPTARG"
       ;;
     p)
-      PATH="$OPTARG"
+      MODEL_PATH="$OPTARG"
       ;;
     e)
       ENV="$OPTARG"
@@ -55,9 +56,11 @@ while getopts ":hf:d:" opt; do
   esac
 done
 
+echo "model: $MODEL"
+echo " path: $MODEL_PATH"
+echo "  env: $ENV"
 
 export MILA_WEIGHTS="/network/weights/"
-
 cd $SLURM_TMPDIR
 
 #
@@ -65,12 +68,13 @@ cd $SLURM_TMPDIR
 #
 CONDA_EXEC="$(which conda)"
 CONDA_BASE=$(dirname $CONDA_EXEC)
+CONDA_ENVS="$CONDA_BASE/../envs"
 source $CONDA_BASE/../etc/profile.d/conda.sh
 
 #
 # Create a new environment
 #
-if [ ! -d "$ENV" ]; then
+if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then
     conda create --prefix $ENV python=3.9 -y
 fi
 conda activate $ENV
@@ -85,12 +89,12 @@ NAME="$WEIGHTS/$MODEL"
 #
 scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y"
 
-# 
+#
 # Launch Server
 #
 python -m vllm.entrypoints.openai.api_server \
     --host $HOST \
     --port $PORT \
-    --model "$MODEL" \
+    --model "$MODEL_PATH" \
     --tensor-parallel-size $SLURM_NTASKS_PER_NODE \
     --served-model-name "$MODEL"
diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst
index e55c1291..b6501969 100644
--- a/docs/examples/llm/vllm.rst
+++ b/docs/examples/llm/vllm.rst
@@ -9,7 +9,7 @@ Server
 It is very easy to setup and supports a wide range of models through Huggingfaces.
 
-.. code-block:: 
+.. code-block::
 
    # sbatch inference_server.sh -m MODEL_NAME -p WEIGHT_PATH -e CONDA_ENV_NAME_TO_USE
    sbatch inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base
 
@@ -19,7 +19,7 @@
 By default the script will launch the server on an rtx8000 for 15 minutes.
 You can override the defaults by specifying arguments to sbatch.
 
-.. code-block:: 
+.. code-block::
 
    sbatch --time=00:30:00 inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base
 
@@ -36,24 +36,28 @@ You can override the defaults by specifying arguments to sbatch.
 Client
 ------
 
-Becasue vLLM replicates OpenAI's API, the client side is quite straight forward.
-Own OpenAI's client can be reused.
+Because vLLM replicates OpenAI's API, the client side is quite straightforward:
+OpenAI's own client can be reused.
 
 .. warning::
-   
+
    The server takes a while to setup you might to have to wait a few minutes
    before the server is ready for inference.
 
-   You can check the job log of the server.
-   Look for 
+   You can check the job log of the server using ``tail -f slurm-.out`` to
+   see the log as it is written.
+
+   Look for
    ``Uvicorn running on http://... (Press CTRL+C to quit)``
+   to know when the server is ready to receive requests.
 
 .. note::
 
-   We use squeue to look for the inference server job to configure the
+   We use ``squeue`` to look for the inference server job to configure the
    url endpoint automatically.
    Make sure your job name is unique!
 
+
 .. literalinclude:: client.py
    :language: python
 
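Note (not part of the patch): a minimal client sketch for querying the server launched by
inference_server.sh, assuming the pre-1.0 openai Python package that client.py above uses.
The host, port and model name below are placeholders; in client.py the real values are
recovered from the Slurm job comment (model=...|host=...|port=...) set via scontrol.

    import openai

    # Placeholder endpoint; substitute the host and port advertised in the job comment.
    openai.api_base = "http://HOST:PORT/v1"
    # The openai SDK requires an API key to be set before any request is made.
    openai.api_key = "EMPTY"

    completion = openai.Completion.create(
        model="Llama-2-7b-chat-hf",  # must match --served-model-name passed to vLLM
        prompt="What is the square root of 25 ?",
    )
    print(completion)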