From fe72c7312a681961545b43aeb9f72b1671ed33e4 Mon Sep 17 00:00:00 2001
From: AlexHe99
Date: Tue, 24 Dec 2024 18:47:05 +0800
Subject: [PATCH] Update deploying_with_k8s.md with AMD ROCm GPU example

Add an example of using an AMD ROCm GPU.

Signed-off-by: Alex He
---
 docs/source/serving/deploying_with_k8s.md | 73 +++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md
index d27db826cd006..81ffc3e3703ad 100644
--- a/docs/source/serving/deploying_with_k8s.md
+++ b/docs/source/serving/deploying_with_k8s.md
@@ -119,6 +119,79 @@ spec:
          periodSeconds: 5
```

- AMD ROCm GPU

You can refer to the `deployment.yaml` below if you are using an AMD ROCm GPU such as the MI300X.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mistral-7b
  namespace: default
  labels:
    app: mistral-7b
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mistral-7b
  template:
    metadata:
      labels:
        app: mistral-7b
    spec:
      volumes:
      # PVC
      - name: cache-volume
        persistentVolumeClaim:
          claimName: mistral-7b
      # vLLM needs to access the host's shared memory for tensor parallel inference.
      - name: shm
        emptyDir:
          medium: Memory
          sizeLimit: "8Gi"
      hostNetwork: true
      hostIPC: true
      containers:
      - name: mistral-7b
        image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
        securityContext:
          seccompProfile:
            type: Unconfined
          runAsGroup: 44
          capabilities:
            add:
            - SYS_PTRACE
        command: ["/bin/sh", "-c"]
        args: [
          "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
        ]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        ports:
        - containerPort: 8000
        resources:
          limits:
            cpu: "10"
            memory: 20G
            amd.com/gpu: "1"
          requests:
            cpu: "6"
            memory: 6G
            amd.com/gpu: "1"
        volumeMounts:
        - name: cache-volume
          mountPath: /root/.cache/huggingface
        - name: shm
          mountPath: /dev/shm
```

The full example is available at https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve.

2. **Create a Kubernetes Service for vLLM**

Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
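The Service manifest itself falls outside this patch's hunk. For context, a minimal sketch of such a Service is shown below; the selector and target port are taken from the deployment above, while the Service port and name are assumptions, and the actual `service.yaml` in the vLLM docs may differ.

```yaml
apiVersion: v1
kind: Service
metadata:
  name: mistral-7b
  namespace: default
spec:
  type: ClusterIP
  selector:
    app: mistral-7b     # matches the labels on the deployment's pods
  ports:
  - name: http
    protocol: TCP
    port: 80            # port exposed inside the cluster (assumed)
    targetPort: 8000    # port the vLLM container listens on (see --port above)
```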
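To try the example end to end, here is a hedged sketch of the deploy-and-test flow. It assumes the PVC and Hugging Face token secret referenced by the deployment exist (they are set up earlier in the guide), and the file names are illustrative.

```sh
# Create the Hugging Face token secret referenced by the deployment,
# if it was not already created earlier in the guide.
kubectl create secret generic hf-token-secret \
  --from-literal=token=<your-hf-token>

# Apply the manifests (file names are illustrative).
kubectl apply -f deployment.yaml
kubectl apply -f service.yaml

# Wait for the pod to become Ready, then follow the server logs.
kubectl get pods -w
kubectl logs -f deployment/mistral-7b

# In a second terminal: forward the Service locally and send a test request.
kubectl port-forward svc/mistral-7b 8080:80
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "mistralai/Mistral-7B-v0.3", "prompt": "San Francisco is a", "max_tokens": 7}'
```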