Skip to content

Commit

Permalink
Add support to pull NIM profiles from GCS cache
Browse files Browse the repository at this point in the history
  • Loading branch information
pwschuurman committed Oct 1, 2024
1 parent 4c8cc24 commit 21b70b5
Show file tree
Hide file tree
Showing 12 changed files with 158 additions and 23 deletions.
10 changes: 6 additions & 4 deletions cloud-service-providers/google-cloud/gke/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,12 @@ cd nim-deploy/cloud-service-providers/google-cloud/gke

| Variable | Description | Default | Need update? |
|---|---|---|---|
| `registry_server` | NVIDIA Registry that hosts the images | `nvcr.io` | *No* |
| `ngc_registry_server` | NVIDIA Registry that hosts the images | `nvcr.io` | *No* |
| `ngc_api_key` | NGC API Key from NVIDIA | <> | *Yes* |
| `repository` | NIM image | `nvcr.io/nim/meta/llama3-8b-instruct` | *No* |
| `tag` | Tag of image | `1.0.0` | *No* |
| `ngc_nim_repository` | NIM image | `nvcr.io/nim/meta/llama3-8b-instruct` | *No* |
| `ngc_nim_tag` | Tag of NIM image | `1.0.0` | *No* |
| `ngc_transfer_repository` | NGC transfer image | `nvcr.io/nim/meta/llama3-8b-instruct` | *No* |
| `ngc_transfer_tag` | Tag of NGC transfer image | `1.0.0` | *No* |
| `model_name` | NIM Model name | `meta/llama3-8b-instruct` | *No* |
| `gpu_limits` | GPU Limits | `1` | *No* |

Expand All @@ -125,7 +127,7 @@ imagePullSecrets:
model:
name:
ngcAPISecret: ngc-api
nimCache: /.cache
nimCache: /opt/nim/llm/.cache
persistence:
enabled: true
existingClaim: "ngc-cache"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Small Debian-based helper image that provides the aria2 and pigz tools.
FROM debian:bookworm-slim
# Install in a single layer and remove the apt package lists afterwards so
# they are not baked into the image.
RUN apt-get update \
    && apt-get install -y aria2 pigz \
    && rm -rf /var/lib/apt/lists/*
# Start a shell by default; the caller supplies the script or command to run.
ENTRYPOINT ["/bin/sh"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/sh
#
# Request a signed download URL for a file in a GCS bucket and print it on
# stdout without a trailing newline.
#
# Environment:
#   NIM_GCS_BUCKET  bucket name sent as "bucket" in the request body
#   NGC_EULA_TEXT   EULA text, sent both verbatim and base64-encoded
#   SERVICE_FQDN    host name of the service that issues the URL
#   GCS_FILENAME    file name appended to the request path
#
# Requires gcloud, curl, base64, sed and mktemp on PATH.

# Abort on the first failing command or unset variable instead of emitting a
# bogus URL.
set -eu

# The request body embeds an identity token: keep it readable by the current
# user only and delete it on exit rather than leaving it in the working
# directory.
umask 077
REQ_FILE="$(mktemp)"
trap 'rm -f "$REQ_FILE"' EXIT

# Resolve the command substitutions up front so that a failure (for example
# gcloud not being logged in) stops the script; inside the here-document the
# exit status would be ignored.
# use --token-format=full for print-identity-token if using GCE VM.
JWT="$(gcloud auth print-identity-token)"
# NOTE(review): NGC_EULA_TEXT is deliberately left unquoted so the encoded
# payload stays exactly what it was before; confirm what the service expects
# before quoting it, since quoting changes whitespace handling.
EULA_B64="$(echo ${NGC_EULA_TEXT} | base64 -w0)"

cat <<EOF > "$REQ_FILE"
{
"bucket": "${NIM_GCS_BUCKET}",
"text": "${NGC_EULA_TEXT}",
"textb64": "${EULA_B64}",
"jwt": "${JWT}"
}
EOF

# -f makes curl exit non-zero on an HTTP error status; -sS keeps the progress
# meter off but still reports errors on stderr. The response is captured
# separately so curl's exit status is not hidden behind the sed pipeline.
RESPONSE="$(curl -fsS -X POST -H 'accept: application/json' -H 'Content-Type: application/json' -d @"$REQ_FILE" "https://${SERVICE_FQDN}/v1/request/${GCS_FILENAME}")"

# Keep only the https... part of the response that precedes the escaped
# newline sequence.
HTTP_URL="$(printf '%s' "$RESPONSE" | sed 's/.*\(https.*\)\\\\n.*/\1/g')"

# printf instead of `echo -n`, whose behaviour is not portable across sh
# implementations.
printf '%s' "$HTTP_URL"
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ extraVolumes:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: "ngc-gcs-cache"
mountOptions: "max-conns-per-host=0"
mountOptions: "implicit-dirs,max-conns-per-host=0"
extraVolumeMounts:
cache-volume:
mountPath: /upload-dir
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
#
# Fill the NIM cache and copy its contents into /upload-dir.
#
# Environment:
#   NIM_CACHE_PATH  directory the cache is written to (required)
#   NGC_BUNDLE_URL  optional; when set, a tar bundle is fetched from this URL
#                   and unpacked instead of running download-to-cache

set -euo pipefail

export CACHE_PATH="$NIM_CACHE_PATH"

if [ -n "${NGC_BUNDLE_URL:-}" ]; then
  # Create a sub-directory, as tar tries to modify the parent folder permissions
  export CACHE_PATH="$NIM_CACHE_PATH/cache"
  # -p: do not abort (set -e) if the directory is already there, e.g. when the
  # container is restarted on the same volume.
  mkdir -p "$CACHE_PATH"
  MODEL_BUNDLE_FILENAME="model.tar"
  # Fetch and extract from the provided URL, with max concurrency
  aria2c -x 16 -s 16 -j 10 --dir "$CACHE_PATH" --out="$MODEL_BUNDLE_FILENAME" "$NGC_BUNDLE_URL"
  tar xf "$CACHE_PATH/$MODEL_BUNDLE_FILENAME" -C "$CACHE_PATH"
  rm "$CACHE_PATH/$MODEL_BUNDLE_FILENAME"
else
  # Fetch directly from NGC to $NIM_CACHE_PATH
  download-to-cache
fi

# Mirror the cache into /upload-dir: recreate the directory tree first, then
# copy files and symlinks (symlinks are copied as links, not followed).
# Paths are passed NUL-delimited and $CACHE_PATH is quoted so names containing
# spaces, quotes or backslashes are handled verbatim. -mindepth 1 skips the
# cache root itself, whose relative path (%P) is empty.
find "$CACHE_PATH" -mindepth 1 -type d -printf '%P\0' | xargs -0 -P 100 -I {} mkdir -p /upload-dir/{}
find "$CACHE_PATH" -type f,l -printf '%P\0' | xargs -0 -P 100 -I {} cp --no-dereference "$CACHE_PATH"/{} /upload-dir/{}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# ConfigMap carrying the ngc_pull.sh helper script from the chart's files/
# directory so it can be mounted into a pod as a file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ .Release.Name }}-scripts-configmap
  labels:
    {{- include "nim-llm.labels" . | nindent 4 }}
data:
  # The template action below must start at column 0: `indent 4` prefixes
  # every line of the script, including the first, so the whole script nests
  # under the block scalar.
  ngc_pull.sh: |-
{{ .Files.Get "files/ngc_pull.sh" | indent 4 }}

Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,25 @@ spec:
{{- toYaml .Values.containerSecurityContext | nindent 12 }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command:
- /bin/bash
- -c
- "download-to-cache && find $NIM_CACHE_PATH -type d -printf '%P\\n' | xargs -P 100 -I {} mkdir -p /upload-dir/{} && find $NIM_CACHE_PATH -type f,l -printf '%P\\n' | xargs -P 100 -I {} cp --no-dereference $NIM_CACHE_PATH/{} /upload-dir/{}"
command: ["/bin/sh", "-c"]
args: ["/scripts/ngc_pull.sh"]
env:
- name: NIM_CACHE_PATH
value: {{ .Values.model.nimCache | quote }}
{{- if .Values.model.ngcAPISecret }}
- name: NGC_API_KEY
valueFrom:
secretKeyRef:
name: {{ .Values.model.ngcAPISecret }}
key: NGC_API_KEY
{{- end }}
{{- if .Values.model.ngcBundleURLSecret }}
- name: NGC_BUNDLE_URL
valueFrom:
secretKeyRef:
name: {{ .Values.model.ngcBundleURLSecret }}
key: NGC_BUNDLE_URL
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
Expand All @@ -56,6 +63,8 @@ spec:
{{- else }}
mountPath: {{ .Values.model.nimCache }}
{{- end }}
- mountPath: /scripts
name: scripts-volume
{{- if .Values.extraVolumeMounts }}
{{- range $k, $v := .Values.extraVolumeMounts }}
- name: {{ $k }}
Expand Down Expand Up @@ -92,6 +101,10 @@ spec:
{{- else }}
emptyDir: {}
{{- end }}
- name: scripts-volume
configMap:
name: {{ .Release.Name }}-scripts-configmap
defaultMode: 0555
{{- if .Values.extraVolumes }}
{{- range $k, $v := .Values.extraVolumes }}
- name: {{ $k }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ spec:
{{- end }}
{{- if .Values.persistence.mountOptions }}
mountOptions:
- {{ .Values.persistence.mountOptions | quote }}
{{- range .Values.persistence.mountOptions }}
- {{ . | quote }}
{{- end }}
{{- end }}
{{- if .Values.persistence.csi }}
csi:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ model: # most of these values only matter if not using customCommand
nimCache: /model-store
name: my-model # optional name of the model in the OpenAI API -- used in `helm test`
ngcAPISecret: ngc-api
# ngcBundleURLSecret: ngc-bundle-url
openaiPort: 8000
labels: {} # any extra labels desired on deployed pods
jsonLogging: true
Expand Down
65 changes: 55 additions & 10 deletions cloud-service-providers/google-cloud/gke/infra/3-config/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data "google_project" "current" {
locals {
cluster_name = data.terraform_remote_state.gke-cluster.outputs.cluster_name
cluster_location = data.terraform_remote_state.gke-cluster.outputs.cluster_location
use_bundle_url = var.ngc_bundle_gcs_bucket != "" && var.ngc_bundle_filename != ""
}

provider "kubernetes" {
Expand All @@ -48,13 +49,39 @@ resource "null_resource" "get-credentials" {

}

data "local_file" "ngc-eula" {
filename = "${path.module}/NIM_GKE_GCS_SIGNED_URL_EULA"
}

# Runs fetch-ngc-url.sh and stores its output in ngc_signed_url.txt inside the
# module directory. Only created when both the bundle bucket and the bundle
# filename are set (local.use_bundle_url).
resource "null_resource" "get-signed-ngc-bundle-url" {
  count = local.use_bundle_url ? 1 : 0

  # Re-run the provisioner when the script or either of the inputs that
  # determine the requested URL changes; otherwise a stale URL would be kept.
  triggers = {
    shell_hash      = sha256(file("${path.module}/fetch-ngc-url.sh"))
    bundle_bucket   = var.ngc_bundle_gcs_bucket
    bundle_filename = var.ngc_bundle_filename
  }

  provisioner "local-exec" {
    # Resolve the script relative to the module, consistent with the hash
    # above and the output file, instead of the current working directory.
    command = "${path.module}/fetch-ngc-url.sh > ${path.module}/ngc_signed_url.txt"
    environment = {
      NGC_EULA_TEXT  = data.local_file.ngc-eula.content
      NIM_GCS_BUCKET = var.ngc_bundle_gcs_bucket
      GCS_FILENAME   = var.ngc_bundle_filename
      SERVICE_FQDN   = "nim-gke-gcs-signed-url-722708171432.us-central1.run.app"
    }
  }
}

data "local_file" "ngc-bundle-url" {
count = local.use_bundle_url ? 1 : 0
filename = "${path.module}/ngc_signed_url.txt"
depends_on = [null_resource.get-signed-ngc-bundle-url]
}

resource "kubernetes_namespace" "nim" {
metadata {
name = "nim"
}
}

resource "kubernetes_secret" "registry_secret" {
resource "kubernetes_secret" "ngc_registry_secret" {
metadata {
name = "registry-secret"
namespace = "nim"
Expand All @@ -65,7 +92,7 @@ resource "kubernetes_secret" "registry_secret" {
data = {
".dockerconfigjson" = jsonencode({
"auths" = {
"${var.registry_server}" = {
"${var.ngc_registry_server}" = {
"username" = var.ngc_username
"password" = var.ngc_api_key
"auth" = base64encode("${var.ngc_username}:${var.ngc_api_key}")
Expand All @@ -90,7 +117,22 @@ resource "kubernetes_secret" "ngc_api" {
}

depends_on = [kubernetes_namespace.nim]
}

# Secret in the "nim" namespace exposing the contents of ngc_signed_url.txt
# (read through data.local_file.ngc-bundle-url) under the NGC_BUNDLE_URL key.
# Only created when local.use_bundle_url is true, matching the count on the
# data source it reads from.
resource "kubernetes_secret" "ngc_bundle_url" {
count = local.use_bundle_url ? 1 : 0
metadata {
name = "ngc-bundle-url"
namespace = "nim"
}

type = "Opaque" # Generic secret type

data = {
"NGC_BUNDLE_URL" = "${data.local_file.ngc-bundle-url[0].content}"
}

# The namespace must exist before the secret can be created in it.
depends_on = [kubernetes_namespace.nim]
}

resource "kubernetes_service_account" "ngc_gcs_ksa" {
Expand All @@ -101,9 +143,12 @@ resource "kubernetes_service_account" "ngc_gcs_ksa" {
depends_on = [kubernetes_namespace.nim]
}

resource "random_uuid" "gcs_cache_uuid" {
}

resource "google_storage_bucket" "ngc_gcs_cache" {
project = data.google_project.current.name
name = "${data.google_project.current.name}-ngc-gcs-cache"
name = "ngc-gcs-cache-${random_uuid.gcs_cache_uuid.result}"
location = "US"
force_destroy = true

Expand Down Expand Up @@ -143,12 +188,12 @@ resource "helm_release" "ngc_to_gcs_transfer" {

set {
name = "image.repository"
value = var.repository
value = var.ngc_transfer_repository
}

set {
name = "image.tag"
value = var.tag
value = var.ngc_transfer_tag
}

set {
Expand All @@ -166,9 +211,9 @@ resource "helm_release" "ngc_to_gcs_transfer" {
value = var.gpu_limits
}

depends_on = [kubernetes_secret.ngc_api, google_storage_bucket_iam_binding.ngc_gcs_ksa_binding]
depends_on = [kubernetes_secret.ngc_api, kubernetes_secret.ngc_bundle_url, google_storage_bucket_iam_binding.ngc_gcs_ksa_binding]

timeout = 900
timeout = 3600
wait = true
}

Expand All @@ -184,17 +229,17 @@ resource "helm_release" "my_nim" {

# Pass the name of the bucket managed by google_storage_bucket.ngc_gcs_cache.
# That name carries a generated UUID suffix, so it must be referenced rather
# than hard-coded to a value from one particular deployment.
set {
  name  = "csi.volumeAttributes.bucketName"
  value = google_storage_bucket.ngc_gcs_cache.name
}

set {
name = "image.repository"
value = var.repository
value = var.ngc_nim_repository
}

set {
name = "image.tag"
value = var.tag
value = var.ngc_nim_tag
}

set {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.


variable "registry_server" {
variable "ngc_registry_server" {
type = string
default = "nvcr.io"
description = "Registry that hosts the NIM images"
Expand All @@ -33,16 +33,36 @@ variable "ngc_api_key" {
sensitive = true
}

variable "repository" {
variable "ngc_transfer_repository" {
type = string
description = "Docker image of NGC transfer container"
}

variable "ngc_transfer_tag" {
type = string
description = "Docker repository tag of the NGC transfer container"
}

variable "ngc_nim_repository" {
type = string
description = "Docker image of NIM container"
}

variable "tag" {
variable "ngc_nim_tag" {
type = string
description = "Docker repository tag of NIM container"
}

# Optional: the signed-URL bundle flow is enabled only when this and
# ngc_bundle_filename are both non-empty, so the default is the empty string
# rather than a required input.
variable "ngc_bundle_gcs_bucket" {
  type        = string
  default     = ""
  description = "GCS bucket that holds the NGC bundle with NIM profiles. Leave empty to skip the bundle download."
}

# Optional: the signed-URL bundle flow is enabled only when this and
# ngc_bundle_gcs_bucket are both non-empty, so the default is the empty string
# rather than a required input.
variable "ngc_bundle_filename" {
  type        = string
  default     = ""
  description = "Name of the file in the bundle bucket that contains the NIM profiles from NGC. Leave empty to skip the bundle download."
}

variable "model_name" {
type = string
description = "Name of the NIM model"
Expand Down
2 changes: 2 additions & 0 deletions helm/nim-llm/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,13 @@ spec:
env:
- name: NIM_CACHE_PATH
value: {{ .Values.model.nimCache | quote }}
{{- if .Values.model.ngcAPISecret }}
- name: NGC_API_KEY
valueFrom:
secretKeyRef:
name: {{ .Values.model.ngcAPISecret }}
key: NGC_API_KEY
{{- end }}
- name: OUTLINES_CACHE_DIR
value: /tmp/outlines
- name: NIM_SERVER_PORT
Expand Down

0 comments on commit 21b70b5

Please sign in to comment.