feat: added inference and deploy scripts along with README instructions on how to use them.
1 parent 7d07aa4 · commit ad01020
Showing 6 changed files with 458 additions and 3 deletions.
.env.example
@@ -0,0 +1,2 @@
HF_TOKEN=hf_...
HF_ENDPOINT_URL=https://<YOUR_ENDPOINT_DETAILS>.endpoints.huggingface.cloud
README.md
@@ -1,2 +1,19 @@
# vllm-huggingface
Make the vllm-openai Docker container compatible with HuggingFace Inference Endpoints. Specifically, recent vLLM versions support vision language models such as Phi-3-vision that Text Generation Inference (TGI) does not yet support, so this repo is useful for deploying VLMs that TGI cannot serve.

This repo was heavily inspired by https://github.com/philschmid/vllm-huggingface, but is simpler because it does not fork vLLM.

# General Setup

1. Install dependencies with `poetry install`. If you use `poetry` as your environment manager, run `poetry shell` to activate the environment.
2. Add a `.env` file in the root directory with `HF_TOKEN` set to a read/write token from [huggingface](https://huggingface.co/settings/tokens). See `.env.example` for the format; the sketch below shows how the scripts read it.
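
A minimal sketch of the `python-dotenv` pattern both example scripts use to read these variables (the names match `.env.example`):

```python
import os

from dotenv import load_dotenv

load_dotenv()  # loads key=value pairs from .env into the process environment
hf_token = os.getenv("HF_TOKEN")             # read/write HuggingFace token
endpoint_url = os.getenv("HF_ENDPOINT_URL")  # set after deploying (see below)
assert hf_token is not None, "HF_TOKEN is missing from .env"
```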
# Deploy to HuggingFace Endpoint

1. View and edit the details in `examples/deploy.py`. It is set up to deploy a HuggingFace Inference Endpoint for the [Phi-3-vision model](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct). Once you have set the necessary variables, run `python examples/deploy.py`.
2. Open the link printed by `deploy.py` to watch the endpoint deployment status and to retrieve the inference base URL once deployment finishes (or fetch it programmatically, as in the sketch after this list).
3. Copy the endpoint URL from step 2 and set the env variable `HF_ENDPOINT_URL` to that value. Again, see `.env.example` for the format.
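
As an alternative to watching the UI, here is a sketch of fetching the endpoint programmatically with `huggingface_hub`; the endpoint name is an assumption, derived from how `examples/deploy.py` builds it (`os.path.basename(repo_id).lower()`):

```python
import os

from dotenv import load_dotenv
from huggingface_hub import get_inference_endpoint

load_dotenv()
# endpoint name assumed from deploy.py: os.path.basename(repo_id).lower()
endpoint = get_inference_endpoint(
    "phi-3-vision-128k-instruct", token=os.getenv("HF_TOKEN")
)
endpoint.wait()      # blocks until the endpoint reaches a running state
print(endpoint.url)  # copy this value into .env as HF_ENDPOINT_URL
```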
# Run inference
1. The endpoint deployed above is OpenAI API compatible, meaning you can call it with the OpenAI client library or any other tooling built against OpenAI's API. For an example of running inference against your new endpoint, see `examples/inference.py`.
2. Run it with `python examples/inference.py`.
examples/deploy.py
@@ -0,0 +1,38 @@
from huggingface_hub import create_inference_endpoint
import os
from dotenv import load_dotenv


VLLM_HF_IMAGE_URL = "hommayushi3/vllm-huggingface"


if __name__ == "__main__":
    load_dotenv()
    repo_id = "microsoft/Phi-3-vision-128k-instruct"
    env_vars = {
        "MAX_MODEL_LEN": "3072",
        "DISABLE_SLIDING_WINDOW": "true",
        "DTYPE": "bfloat16",
        "TRUST_REMOTE_CODE": "true",
    }

    endpoint = create_inference_endpoint(
        name=os.path.basename(repo_id).lower(),
        repository=repo_id,
        framework="pytorch",
        task="custom",
        accelerator="gpu",
        vendor="aws",
        region="us-east-1",
        type="protected",
        instance_size="x1",
        instance_type="nvidia-l4",
        custom_image={
            "health_route": "/health",
            "env": env_vars,
            "url": VLLM_HF_IMAGE_URL,
        },
        token=os.getenv("HF_TOKEN"),  # the read/write token from .env (see .env.example)
    )

    print(f"Go to https://ui.endpoints.huggingface.co/{endpoint.namespace}/endpoints/{endpoint.name} to see the endpoint status.")
examples/inference.py
@@ -1 +1,41 @@
from openai import OpenAI
import os
from dotenv import load_dotenv


if __name__ == "__main__":
    load_dotenv()
    ENDPOINT_URL = os.getenv("HF_ENDPOINT_URL") + "/v1/"  # if the endpoint object is not available, check the UI
    API_KEY = os.getenv("HF_TOKEN")
    STREAM = False

    # initialize the OpenAI client, but point it at the vLLM endpoint
    client = OpenAI(base_url=ENDPOINT_URL, api_key=API_KEY)

    chat_completion = client.chat.completions.create(
        model="/repository",  # must be /repository, which is where the endpoint stores the model artifacts
        messages=[
            {"role": "user", "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/0/05/Facebook_Logo_%282019%29.png"
                    }
                },
                {
                    "type": "text",
                    "text": "What is in the above image?"
                }
            ]},
        ],
        max_tokens=500,
        temperature=0.0,
        stream=STREAM
    )

    if STREAM:
        for message in chat_completion:
            if message.choices[0].delta.content:
                print(message.choices[0].delta.content, end="")
    else:
        print(chat_completion.choices[0].message.content)
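
Since the endpoint speaks the OpenAI chat-completions protocol, the OpenAI client is optional. A sketch of the equivalent raw HTTP call, assuming the `requests` package is available (the text-only prompt is illustrative):

```python
import os

import requests
from dotenv import load_dotenv

load_dotenv()
resp = requests.post(
    os.getenv("HF_ENDPOINT_URL") + "/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"},
    json={
        "model": "/repository",  # same model path the OpenAI client uses
        "messages": [{"role": "user", "content": "Describe vision language models in one sentence."}],
        "max_tokens": 100,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```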