-
-
Notifications
You must be signed in to change notification settings - Fork 5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'vllm-project:main' into main
- Loading branch information
Showing
352 changed files
with
17,878 additions
and
5,524 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import argparse | ||
import os | ||
|
||
template = """<!DOCTYPE html> | ||
<html> | ||
<body> | ||
<h1>Links for vLLM</h1/> | ||
<a href="../{wheel_html_escaped}">{wheel}</a><br/> | ||
</body> | ||
</html> | ||
""" | ||
|
||
parser = argparse.ArgumentParser() | ||
parser.add_argument("--wheel", help="The wheel path.", required=True) | ||
args = parser.parse_args() | ||
|
||
filename = os.path.basename(args.wheel) | ||
|
||
with open("index.html", "w") as f: | ||
print(f"Generated index.html for {args.wheel}") | ||
# cloudfront requires escaping the '+' character | ||
f.write( | ||
template.format(wheel=filename, | ||
wheel_html_escaped=filename.replace("+", "%2B"))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash

# This script builds the GH200 (arm64) docker image and runs the offline
# inference example inside the resulting container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Skip the new torch installation during build since we are using the
# specified version for arm64 in the Dockerfile.
python3 use_existing_torch.py

# Try building the docker image for the arm64 platform, targeting the
# vllm-openai stage and restricting compilation to Hopper (SM 9.0).
DOCKER_BUILDKIT=1 docker build . \
    --target vllm-openai \
    --platform "linux/arm64" \
    -t gh200-test \
    --build-arg max_jobs=66 \
    --build-arg nvcc_threads=2 \
    --build-arg torch_cuda_arch_list="9.0+PTX" \
    --build-arg vllm_fa_cmake_gpu_arches="90-real"

# Setup cleanup: remove the container on exit, and also up front in case a
# previous run left one behind ('|| true' keeps this best-effort).
remove_docker_container() { docker rm -f gh200-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and test offline inference.
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference.py
'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# Lints the vLLM Helm chart and performs an end-to-end deploy test:
# a local minio stands in for S3 model storage, the CPU image is built
# and loaded into a kind cluster, the chart is installed, and a
# completion request is curl'ed against the deployed service.
name: Lint and Deploy Charts

on: pull_request

jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # Full history so ct can diff charts against the target branch.
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
        with:
          version: v3.14.4

      # Python is required because ct lint runs Yamale and yamllint which require Python.
      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: '3.13'

      - name: Set up chart-testing
        uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
        with:
          version: v3.10.1

      - name: Run chart-testing (lint)
        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm

      # Stand up minio as an S3-compatible store and seed it with the
      # facebook/opt-125m model files the chart's init job will pull.
      - name: Setup minio
        run: |
          docker network create vllm-net
          docker run -d -p 9000:9000 --name minio --net vllm-net \
            -e "MINIO_ACCESS_KEY=minioadmin" \
            -e "MINIO_SECRET_KEY=minioadmin" \
            -v /tmp/data:/data \
            -v /tmp/config:/root/.minio \
            minio/minio server /data
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          export AWS_EC2_METADATA_DISABLED=true
          mkdir opt-125m
          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive

      - name: Create kind cluster
        uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0

      - name: Build the Docker image vllm cpu
        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

      # Load the required images into kind and bridge the kind control
      # plane onto the minio network so pods can reach it by name.
      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
          docker pull amazon/aws-cli:2.6.4
          kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
          kind load docker-image vllm-cpu-env:latest --name chart-testing
          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
          kubectl create ns ns-vllm

      - name: Run chart-testing (install)
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

      # NOTE(review): "$(curl ...):$CODE" references $CODE before it is ever
      # set, so the suffix is always just ":"; the step relies on curl's -f
      # flag to fail the job on a non-2xx response — confirm intent.
      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
          sleep 10
          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
            --header "Content-Type: application/json" \
            --data '{
              "model": "opt-125m",
              "prompt": "San Francisco is a",
              "max_tokens": 7,
              "temperature": 0
            }'):$CODE"
          echo "$CODE"
Oops, something went wrong.