feat: added inference and deploy scripts along with README instructions on how to use them.
1 parent 7d07aa4 · commit ad01020
Showing 6 changed files with 458 additions and 3 deletions.
.env.example
@@ -0,0 +1,2 @@
HF_TOKEN=hf_...
HF_ENDPOINT_URL=https://<YOUR_ENDPOINT_DETAILS>.endpoints.huggingface.cloud
README.md
@@ -1,2 +1,19 @@
# vllm-huggingface
Make the vllm-openai Docker container compatible with HuggingFace Inference Endpoints. Specifically, recent vLLM versions support vision language models such as Phi-3-vision that Text Generation Inference (TGI) does not yet support, so this repo is useful for deploying VLMs that TGI cannot serve.

This repo was heavily inspired by https://github.com/philschmid/vllm-huggingface, but is simpler because it does not fork vLLM.

# General Setup

1. Install dependencies with `poetry install`. If you use `poetry` as your environment manager, run `poetry shell` to activate the environment.
2. Add a `.env` file in the root directory with `HF_TOKEN` set to a read/write token from [huggingface](https://huggingface.co/settings/tokens). See `.env.example` for the format; the sketch below shows how the scripts read it.
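
A minimal sketch of the `python-dotenv` pattern both example scripts use to read these variables (the names match `.env.example`):

```python
import os

from dotenv import load_dotenv

load_dotenv()  # loads key=value pairs from .env into the process environment
hf_token = os.getenv("HF_TOKEN")             # read/write HuggingFace token
endpoint_url = os.getenv("HF_ENDPOINT_URL")  # set after deploying (see below)
assert hf_token is not None, "HF_TOKEN is missing from .env"
```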
# Deploy to HuggingFace Endpoint

1. View and edit the details in `examples/deploy.py`. It is set up to deploy a HuggingFace Inference Endpoint for the [Phi-3-vision model](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct). Once you have set the necessary variables, run `python examples/deploy.py`.
2. Open the link printed by `deploy.py` to watch the endpoint deployment status and to retrieve the inference base URL once deployment finishes (or fetch it programmatically, as in the sketch after this list).
3. Copy the endpoint URL from step 2 and set the env variable `HF_ENDPOINT_URL` to that value. Again, see `.env.example` for the format.
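
As an alternative to watching the UI, here is a sketch of fetching the endpoint programmatically with `huggingface_hub`; the endpoint name is an assumption, derived from how `examples/deploy.py` builds it (`os.path.basename(repo_id).lower()`):

```python
import os

from dotenv import load_dotenv
from huggingface_hub import get_inference_endpoint

load_dotenv()
# endpoint name assumed from deploy.py: os.path.basename(repo_id).lower()
endpoint = get_inference_endpoint(
    "phi-3-vision-128k-instruct", token=os.getenv("HF_TOKEN")
)
endpoint.wait()      # blocks until the endpoint reaches a running state
print(endpoint.url)  # copy this value into .env as HF_ENDPOINT_URL
```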
# Run inference
1. The endpoint deployed above is OpenAI API compatible, meaning you can call it with the OpenAI client library or any other tooling built against OpenAI's API. For an example of running inference against your new endpoint, see `examples/inference.py`.
2. Run it with `python examples/inference.py`.
examples/deploy.py
@@ -0,0 +1,38 @@
from huggingface_hub import create_inference_endpoint
import os
from dotenv import load_dotenv


VLLM_HF_IMAGE_URL = "hommayushi3/vllm-huggingface"


if __name__ == "__main__":
    load_dotenv()
    repo_id = "microsoft/Phi-3-vision-128k-instruct"
    env_vars = {
        "MAX_MODEL_LEN": "3072",
        "DISABLE_SLIDING_WINDOW": "true",
        "DTYPE": "bfloat16",
        "TRUST_REMOTE_CODE": "true",
    }

    endpoint = create_inference_endpoint(
        name=os.path.basename(repo_id).lower(),
        repository=repo_id,
        framework="pytorch",
        task="custom",
        accelerator="gpu",
        vendor="aws",
        region="us-east-1",
        type="protected",
        instance_size="x1",
        instance_type="nvidia-l4",
        custom_image={
            "health_route": "/health",
            "env": env_vars,
            "url": VLLM_HF_IMAGE_URL,
        },
        token=os.getenv("HF_TOKEN"),  # the read/write token from .env (see .env.example)
    )

    print(f"Go to https://ui.endpoints.huggingface.co/{endpoint.namespace}/endpoints/{endpoint.name} to see the endpoint status.")
examples/inference.py
@@ -1 +1,41 @@
from openai import OpenAI
import os
from dotenv import load_dotenv


if __name__ == "__main__":
    load_dotenv()
    ENDPOINT_URL = os.getenv("HF_ENDPOINT_URL") + "/v1/"  # if the endpoint object is not available, check the UI
    API_KEY = os.getenv("HF_TOKEN")
    STREAM = False

    # initialize the OpenAI client, but point it at the vLLM endpoint
    client = OpenAI(base_url=ENDPOINT_URL, api_key=API_KEY)

    chat_completion = client.chat.completions.create(
        model="/repository",  # must be /repository, which is where the endpoint stores the model artifacts
        messages=[
            {"role": "user", "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/0/05/Facebook_Logo_%282019%29.png"
                    }
                },
                {
                    "type": "text",
                    "text": "What is in the above image?"
                }
            ]},
        ],
        max_tokens=500,
        temperature=0.0,
        stream=STREAM
    )

    if STREAM:
        for message in chat_completion:
            if message.choices[0].delta.content:
                print(message.choices[0].delta.content, end="")
    else:
        print(chat_completion.choices[0].message.content)
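
Since the endpoint speaks the OpenAI chat-completions protocol, the OpenAI client is optional. A sketch of the equivalent raw HTTP call, assuming the `requests` package is available (the text-only prompt is illustrative):

```python
import os

import requests
from dotenv import load_dotenv

load_dotenv()
resp = requests.post(
    os.getenv("HF_ENDPOINT_URL") + "/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"},
    json={
        "model": "/repository",  # same model path the OpenAI client uses
        "messages": [{"role": "user", "content": "Describe vision language models in one sentence."}],
        "max_tokens": 100,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```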