updated README.md
AyushSawant18588 committed Nov 3, 2023
1 parent 132cfb8 commit 744c61f
Showing 2 changed files with 7 additions and 6 deletions.
README.md (6 additions, 5 deletions)
@@ -80,7 +80,7 @@ python3 $WORK_DIR/llm/download.py --model_name llama2_7b --output /mnt/llm --hf_

Run the following command to start Kubeflow serving and run inference on the given input:
```
-bash run.sh -n <MODEL_NAME> -g <NUM_GPUS> -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <NFS_LOCAL_MOUNT_LOCATION> -e <KUBE_DEPLOYMENT_NAME> [OPTIONAL -d <INPUT_PATH> -v <REPO_COMMIT_ID>]
+bash run.sh -n <MODEL_NAME> -g <NUM_GPUS> -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <NFS_LOCAL_MOUNT_LOCATION> -e <KUBE_DEPLOYMENT_NAME> [OPTIONAL -d <INPUT_PATH> -v <REPO_COMMIT_ID> -t <Your_HuggingFace_Hub_Token>]
```
- n: Name of model
- d: Absolute path of input data folder (Optional)
@@ -89,9 +89,10 @@ bash run.sh -n <MODEL_NAME> -g <NUM_GPUS> -f <NFS_ADDRESS_WITH_SHARE_PATH> -m <
- m: Mount path to your NFS server, used in the kube PV where model files and the model archive will be stored
- e: Name of the deployment metadata
- v: Commit id of the model's HuggingFace repo (optional; if not provided, the default set in model_config is used)
+- t: Your HuggingFace token. Needed for the Llama2 model.

For model names, we support MPT-7B, Falcon-7B and Llama2-7B.
-Should print "Inference Run Successful" as a message at the end
+Should print "Inference Run Successful" once the inference server has started successfully

##### Examples

@@ -105,7 +106,7 @@ bash $WORK_DIR/llm/run.sh -n falcon_7b -d data/qa -g 1 -e llm-deploy -f '1.1.1.1
```
For 1-GPU inference with the official Llama2-7B model, keeping the inference server alive:
```
-bash $WORK_DIR/llm/run.sh -n llama2_7b -d data/summarize -g 1 -e llm-deploy -f '1.1.1.1:/llm' -m /mnt/llm
+bash $WORK_DIR/llm/run.sh -n llama2_7b -d data/summarize -g 1 -e llm-deploy -f '1.1.1.1:/llm' -m /mnt/llm -t <Your_HuggingFace_Hub_Token>
```

#### Inference Check
@@ -144,7 +145,7 @@ curl -v -H "Host: ${SERVICE_HOSTNAME}" -H "Content-Type: application/json" http:

#### Cleanup Inference deployment

-If keep alive flag was set in the bash script, then you can run the following command to stop the server and clean up temporary files
+Run the following command to stop the inference server and unmount the PV and PVC.

python3 $WORK_DIR/llm/cleanup.py --deploy_name <DEPLOYMENT_NAME>
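For instance, with the deployment name used in the examples above (llm-deploy is a placeholder; substitute your own):
```
python3 $WORK_DIR/llm/cleanup.py --deploy_name llm-deploy
```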

@@ -165,7 +166,7 @@ python3 download.py --no_download [--repo_version <REPO_COMMIT_ID> --handler <CUSTOM_HANDLER_PATH>]
- model_name: Name of the custom model; this name must not be present in model_config
- repo_version: Any model version, defaults to "1.0" (optional)
- model_path: Absolute path of custom model files (should be non-empty)
-- output: Mount path to your nfs server to be used in the kube PV where model files and model archive file be stored
+- output: Mount path to your NFS server, used in the kube PV where config.properties and the model archive will be stored
- handler: Path to custom handler, defaults to llm/handler.py (optional)<br />
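
As an illustrative sketch, a hypothetical invocation might look like the following (the model name and paths are placeholders, not values shipped with this repo):
```
python3 download.py --no_download --model_name my_custom_model --model_path /mnt/llm/model_files/my_custom_model --output /mnt/llm
```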

### Start Torchserve and run inference for custom model
llm/handler.py (1 addition, 1 deletion)
@@ -36,7 +36,7 @@ class LLMHandler(BaseHandler, ABC):
This method loads the Hugging Face model and tokenizer based on
the provided model name and the model files present in the MAR file.
preprocess(text: str) -> Tensor:
-This method tookenizes input text using the associated tokenizer.
+This method tokenizes input text using the associated tokenizer.
Args:
text (str): The input text to be tokenized.
Returns:
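For context, a minimal sketch of a preprocess method of this shape is shown below. It assumes self.tokenizer and self.device were set during initialize(); it illustrates the documented behavior and is not the repo's exact implementation:
```python
import torch

def preprocess(self, text: str) -> torch.Tensor:
    # Tokenize the raw input text into input IDs the model can consume.
    # self.tokenizer and self.device are assumed to be set in initialize().
    encoded = self.tokenizer(text, return_tensors="pt")
    return encoded["input_ids"].to(self.device)
```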
