diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py
index 86e3ce7e..9fa5cfc5 100644
--- a/docs/examples/llm/client.py
+++ b/docs/examples/llm/client.py
@@ -1,3 +1,5 @@
+import subprocess
+
 import openai
 
 
@@ -34,7 +36,7 @@ def get_job_comment(name="inference_server.sh"):
     # profit
     completion = openai.Completion.create(
         model=server['model'],
-        prompt=args.prompt
+        prompt="What is the square root of 25 ?"
     )
 
     print(completion)
\ No newline at end of file
diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh
index acf2a857..fdc1f69c 100644
--- a/docs/examples/llm/inference_server.sh
+++ b/docs/examples/llm/inference_server.sh
@@ -16,21 +16,22 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --mem=32G
 
-usage() {
-  echo "Usage: $0 [-m] [-p]
+function usage() {
+  echo "Usage: $0 [-m] [-p] [-e]"
   echo "  -h              Display this help message."
   echo "  -m MODEL        Specify a file to process."
   echo "  -p PATH         Specify a directory to work in."
+  echo "  -e ENV          Specify the conda environment to use."
   echo "  ARGUMENT        Any additional argument you want to process."
   exit 1
 }
 
 MODEL=""
-PATH=""
+MODEL_PATH=""
 ENV="./env"
 
-while getopts ":hf:d:" opt; do
+while getopts ":hm:p:e:" opt; do
   case $opt in
     h)
       usage
@@ -39,7 +40,7 @@ while getopts ":hf:d:" opt; do
       MODEL="$OPTARG"
       ;;
     p)
-      PATH="$OPTARG"
+      MODEL_PATH="$OPTARG"
       ;;
     e)
       ENV="$OPTARG"
@@ -55,9 +56,11 @@ while getopts ":hf:d:" opt; do
   esac
 done
 
+echo "model: $MODEL"
+echo " path: $MODEL_PATH"
+echo "  env: $ENV"
 
 export MILA_WEIGHTS="/network/weights/"
-
 cd $SLURM_TMPDIR
 
 #
@@ -65,12 +68,13 @@ cd $SLURM_TMPDIR
 #
 CONDA_EXEC="$(which conda)"
 CONDA_BASE=$(dirname $CONDA_EXEC)
+CONDA_ENVS="$CONDA_BASE/../envs"
 source $CONDA_BASE/../etc/profile.d/conda.sh
 
 #
 # Create a new environment
 #
-if [ ! -d "$ENV" ]; then
+if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then
     conda create --prefix $ENV python=3.9 -y
 fi
 conda activate $ENV
@@ -85,12 +89,12 @@ NAME="$WEIGHTS/$MODEL"
 #
 scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y"
 
-# 
+#
 # Launch Server
 #
 python -m vllm.entrypoints.openai.api_server \
     --host $HOST \
     --port $PORT \
-    --model "$MODEL" \
+    --model "$MODEL_PATH" \
     --tensor-parallel-size $SLURM_NTASKS_PER_NODE \
     --served-model-name "$MODEL"
diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst
index e55c1291..0f7dfebb 100644
--- a/docs/examples/llm/vllm.rst
+++ b/docs/examples/llm/vllm.rst
@@ -36,24 +36,28 @@ You can override the defaults by specifying arguments to sbatch.
 Client
 ------
 
-Becasue vLLM replicates OpenAI's API, the client side is quite straight forward.
-Own OpenAI's client can be reused.
+Because vLLM replicates OpenAI's API, the client side is quite straightforward:
+OpenAI's own client can be reused.
 
 .. warning::
 
    The server takes a while to setup you might to have to wait a few minutes
    before the server is ready for inference.
 
-   You can check the job log of the server.
-   Look for 
+   You can check the job log of the server using ``tail -f slurm-.out`` to
+   see the log as it is written.
+
+   Look for ``Uvicorn running on http://... (Press CTRL+C to quit)``
+   to know when the server is ready to receive requests.
 
 .. note::
 
-   We use squeue to look for the inference server job to configure the
+   We use ``squeue`` to look for the inference server job to configure the
    url endpoint automatically. Make sure your job name is unique!
 
+
 .. literalinclude:: client.py
    :language: python
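
For context on what the new client code in this diff relies on: inference_server.sh stores "model=...|host=...|port=...|shared=y" in the job's Slurm comment (the scontrol line above), and vllm.rst notes that the client uses squeue to find that job and configure the URL endpoint automatically. The snippet below is a minimal sketch of that flow, not the client.py shipped in the docs: the squeue flags, the comment parsing, and the api_base/api_key assignments are assumptions made for illustration only.

# Illustrative sketch -- the real client is docs/examples/llm/client.py.
# Assumed: squeue's "--format %k" exposes the job comment, and the comment
# has the exact "model=...|host=...|port=...|shared=y" layout written by
# inference_server.sh.
import subprocess

import openai


def get_job_comment(name="inference_server.sh"):
    """Return the Slurm comment of the job running the inference server."""
    command = ["squeue", "--me", "--name", name, "--format", "%k", "--noheader"]
    return subprocess.check_output(command, text=True).strip()


# Parse "model=...|host=...|port=...|shared=y" into a dictionary.
server = dict(field.split("=", 1) for field in get_job_comment().split("|"))

# Point the (pre-1.0) OpenAI client at the vLLM server instead of api.openai.com.
openai.api_key = "EMPTY"  # vLLM does not require a real key unless configured to
openai.api_base = f"http://{server['host']}:{server['port']}/v1"

completion = openai.Completion.create(
    model=server["model"],
    prompt="What is the square root of 25 ?",
)
print(completion)

With the pre-1.0 openai package, redirecting openai.api_base to the vLLM server is enough for the standard Completion API to work; whether the key and base URL need to be set this way in practice depends on the actual client.py included via literalinclude.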