From b1b20502b97c8296561fe284acb52bdb459000a1 Mon Sep 17 00:00:00 2001
From: Andrew Smith <andrew@andrew.codes>
Date: Thu, 14 Nov 2024 13:48:19 +0000
Subject: [PATCH] gpu sharing for frigate

---
 .pnp.cjs                                 |  21 ++++
 apps/alexa/deployment/index.jsonnet      |   3 +-
 apps/alexa/project.json                  |   7 --
 apps/frigate/values.yaml                 |   5 +-
 apps/gpu-scheduler/deployment/index.yaml | 131 +++++++++++++++++++++++
 apps/gpu-scheduler/package.json          |  15 +++
 apps/gpu-scheduler/project.json          |  17 +++
 apps/gpu-scheduler/scripts/deploy.ts     |  20 ++++
 yarn.lock                                |  14 +++
 9 files changed, 223 insertions(+), 10 deletions(-)
 create mode 100755 apps/gpu-scheduler/deployment/index.yaml
 create mode 100755 apps/gpu-scheduler/package.json
 create mode 100755 apps/gpu-scheduler/project.json
 create mode 100755 apps/gpu-scheduler/scripts/deploy.ts

diff --git a/.pnp.cjs b/.pnp.cjs
index 1188df090..f9ea88b09 100755
--- a/.pnp.cjs
+++ b/.pnp.cjs
@@ -58,6 +58,10 @@ const RAW_RUNTIME_STATE =
       "name": "@ha/github-action-runners",\
       "reference": "workspace:apps/github-action-runners"\
     },\
+    {\
+      "name": "@ha/gpu-scheduler",\
+      "reference": "workspace:apps/gpu-scheduler"\
+    },\
     {\
       "name": "@ha/guest-db",\
       "reference": "workspace:apps/guest-db"\
@@ -322,6 +326,7 @@ const RAW_RUNTIME_STATE =
     ["@ha/gaming-pc", ["workspace:apps/gaming-pc"]],\
     ["@ha/github-action-runners", ["workspace:apps/github-action-runners"]],\
     ["@ha/github-secrets", ["workspace:packages/github-secrets"]],\
+    ["@ha/gpu-scheduler", ["workspace:apps/gpu-scheduler"]],\
     ["@ha/guest-db", ["workspace:apps/guest-db"]],\
     ["@ha/guest-pin-codes", ["workspace:apps/guest-pin-codes"]],\
     ["@ha/guest-wifi-registrar", ["workspace:apps/guest-wifi-registrar"]],\
@@ -5959,6 +5964,22 @@ const RAW_RUNTIME_STATE =
         "linkType": "SOFT"\
       }]\
     ]],\
+    ["@ha/gpu-scheduler", [\
+      ["workspace:apps/gpu-scheduler", {\
+        "packageLocation": "./apps/gpu-scheduler/",\
+        "packageDependencies": [\
+          ["@ha/gpu-scheduler", "workspace:apps/gpu-scheduler"],\
+          ["@ha/configuration-api", "workspace:packages/configuration-api"],\
+          ["@ha/configuration-workspace", "workspace:packages/configuration-workspace"],\
+          ["@ha/docker", "workspace:packages/docker"],\
+          ["@ha/jsonnet", "workspace:packages/jsonnet"],\
+          ["@ha/kubectl", "workspace:packages/kubectl"],\
+          ["@ha/nx-executors", "workspace:packages/nx-executors"],\
+          ["shelljs", "npm:0.8.5"]\
+        ],\
+        "linkType": "SOFT"\
+      }]\
+    ]],\
     ["@ha/guest-db", [\
       ["workspace:apps/guest-db", {\
         "packageLocation": "./apps/guest-db/",\
diff --git a/apps/alexa/deployment/index.jsonnet b/apps/alexa/deployment/index.jsonnet
index 1754bf232..d70ba3d11 100755
--- a/apps/alexa/deployment/index.jsonnet
+++ b/apps/alexa/deployment/index.jsonnet
@@ -46,7 +46,7 @@ local sttContainer = k.core.v1.container.new(name='whisper', image=std.extVar('w
                      {
                        resources: {
                          limits: {
-                           'nvidia.com/gpu': 1,
+                           'aliyun.com/gpu-mem': 4,
                          },
                        },
                      }
@@ -58,7 +58,6 @@ local sttDeployment = k.apps.v1.deployment.new(name='whisper', containers=[sttCo
                         template+: {
 
                           spec+: {
-                            runtimeClassName: 'nvidia',
                             // tolerations: [
                             // { key: 'nvidia.com/gpu', operator: 'Exists', effect: 'NoSchedule' },
                             // ],
diff --git a/apps/alexa/project.json b/apps/alexa/project.json
index 4ef8ca41c..4493c7ad2 100755
--- a/apps/alexa/project.json
+++ b/apps/alexa/project.json
@@ -10,13 +10,6 @@
         "cwd": "apps/alexa",
         "module": "scripts/deploy.ts"
       }
-    },
-    "image/push": {
-      "executor": "@ha/nx-executors:invoke",
-      "options": {
-        "module": "scripts/image-push.ts",
-        "cwd": "apps/alexa"
-      }
     }
   },
   "tags": [],
diff --git a/apps/frigate/values.yaml b/apps/frigate/values.yaml
index b6140275d..b37aa67d4 100755
--- a/apps/frigate/values.yaml
+++ b/apps/frigate/values.yaml
@@ -18,7 +18,10 @@ coral:
 gpu:
   nvidia:
     enabled: true
-    runtimeClassName: nvidia
+
+resources:
+  limits:
+    aliyun.com/gpu-mem: 4
 env:
   YOLO_MODELS: yolov7x-640
 extraVolumeMounts:
diff --git a/apps/gpu-scheduler/deployment/index.yaml b/apps/gpu-scheduler/deployment/index.yaml
new file mode 100755
index 000000000..d32237fd5
--- /dev/null
+++ b/apps/gpu-scheduler/deployment/index.yaml
@@ -0,0 +1,131 @@
+# rbac.yaml
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: gpushare-schd-extender
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+    verbs:
+      - get
+      - list
+      - watch
+  - apiGroups:
+      - ""
+    resources:
+      - events
+    verbs:
+      - create
+      - patch
+  - apiGroups:
+      - ""
+    resources:
+      - pods
+    verbs:
+      - update
+      - patch
+      - get
+      - list
+      - watch
+  - apiGroups:
+      - ""
+    resources:
+      - bindings
+      - pods/binding
+    verbs:
+      - create
+  - apiGroups:
+      - ""
+    resources:
+      - configmaps
+    verbs:
+      - get
+      - list
+      - watch
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: gpushare-schd-extender
+  namespace: kube-system
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: gpushare-schd-extender
+  # No metadata.namespace here: ClusterRoleBinding is a cluster-scoped resource.
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: gpushare-schd-extender
+subjects:
+  - kind: ServiceAccount
+    name: gpushare-schd-extender
+    namespace: kube-system
+
+# Deployment: single-replica gpushare scheduler extender on the host network (PORT=12345).
+---
+kind: Deployment
+apiVersion: apps/v1
+metadata:
+  name: gpushare-schd-extender
+  namespace: kube-system
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: gpushare
+      component: gpushare-schd-extender
+  template:
+    metadata:
+      labels:
+        app: gpushare
+        component: gpushare-schd-extender
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: "" # NOTE(review): alpha annotation; confirm the cluster still honors it
+    spec:
+      hostNetwork: true
+      tolerations:
+        - effect: NoSchedule
+          operator: Exists
+          key: node-role.kubernetes.io/master
+        - effect: NoSchedule
+          operator: Exists
+          key: node.cloudprovider.kubernetes.io/uninitialized
+      nodeName: "k8s-node-3" # pins the pod to this specific node
+      serviceAccountName: gpushare-schd-extender # canonical field; `serviceAccount` is a deprecated alias
+      containers:
+        - name: gpushare-schd-extender
+          image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-schd-extender:1.11-d170d8a
+          env:
+            - name: LOG_LEVEL
+              value: debug
+            - name: PORT
+              value: "12345"
+
+# service.yaml
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpushare-schd-extender
+  namespace: kube-system
+  labels:
+    app: gpushare
+    component: gpushare-schd-extender
+spec:
+  type: NodePort
+  ports:
+    - port: 12345
+      name: http
+      targetPort: 12345
+      nodePort: 32766
+  selector:
+    # select the gpushare scheduler extender pods
+    app: gpushare
+    component: gpushare-schd-extender
diff --git a/apps/gpu-scheduler/package.json b/apps/gpu-scheduler/package.json
new file mode 100755
index 000000000..87cf0455b
--- /dev/null
+++ b/apps/gpu-scheduler/package.json
@@ -0,0 +1,15 @@
+{
+  "private": true,
+  "name": "@ha/gpu-scheduler",
+  "version": "0.0.1",
+  "license": "MIT",
+  "devDependencies": {
+    "@ha/configuration-api": "workspace:^0.0.1",
+    "@ha/configuration-workspace": "workspace:^0.0.1",
+    "@ha/docker": "workspace:^0.0.1",
+    "@ha/jsonnet": "workspace:^0.0.1",
+    "@ha/kubectl": "workspace:^1.0.0",
+    "@ha/nx-executors": "workspace:^0.1.0",
+    "shelljs": "^0.8.5"
+  }
+}
diff --git a/apps/gpu-scheduler/project.json b/apps/gpu-scheduler/project.json
new file mode 100755
index 000000000..c50e43168
--- /dev/null
+++ b/apps/gpu-scheduler/project.json
@@ -0,0 +1,17 @@
+{
+  "name": "gpu-scheduler",
+  "$schema": "../../node_modules/nx/schemas/project-schema.json",
+  "sourceRoot": "apps/gpu-scheduler",
+  "projectType": "application",
+  "targets": {
+    "deploy": {
+      "executor": "@ha/nx-executors:invoke",
+      "options": {
+        "cwd": "apps/gpu-scheduler",
+        "module": "scripts/deploy.ts"
+      }
+    }
+  },
+  "tags": [],
+  "implicitDependencies": []
+}
diff --git a/apps/gpu-scheduler/scripts/deploy.ts b/apps/gpu-scheduler/scripts/deploy.ts
new file mode 100755
index 000000000..44c7340de
--- /dev/null
+++ b/apps/gpu-scheduler/scripts/deploy.ts
@@ -0,0 +1,20 @@
+import type { ConfigurationApi } from "@ha/configuration-api"
+import type { Configuration } from "@ha/configuration-workspace"
+import path from "path"
+import sh from "shelljs"
+
+// Applies the gpushare scheduler extender manifests (deployment/index.yaml) with kubectl.
+// Install guide: https://github.com/AliyunContainerService/gpushare-scheduler-extender/blob/master/docs/install.md
+const run = async (
+  configurationApi: ConfigurationApi<Configuration>,
+): Promise<void> => {
+  const manifest = path.join(__dirname, "..", "deployment", "index.yaml")
+  // `apply` rather than `create`: re-running the deploy must not fail on existing resources.
+  const result = sh.exec(`kubectl apply -f "${manifest}"`)
+  if (result.code !== 0) {
+    // shelljs does not throw on failure; surface it so the deploy target fails.
+    throw new Error(`kubectl apply failed with exit code ${result.code}`)
+  }
+}
+
+export default run
diff --git a/yarn.lock b/yarn.lock
index c4584524e..ac1df5b0c 100755
--- a/yarn.lock
+++ b/yarn.lock
@@ -3600,6 +3600,20 @@ __metadata:
   languageName: unknown
   linkType: soft
 
+"@ha/gpu-scheduler@workspace:apps/gpu-scheduler":
+  version: 0.0.0-use.local
+  resolution: "@ha/gpu-scheduler@workspace:apps/gpu-scheduler"
+  dependencies:
+    "@ha/configuration-api": "workspace:^0.0.1"
+    "@ha/configuration-workspace": "workspace:^0.0.1"
+    "@ha/docker": "workspace:^0.0.1"
+    "@ha/jsonnet": "workspace:^0.0.1"
+    "@ha/kubectl": "workspace:^1.0.0"
+    "@ha/nx-executors": "workspace:^0.1.0"
+    shelljs: "npm:^0.8.5"
+  languageName: unknown
+  linkType: soft
+
 "@ha/guest-db@workspace:apps/guest-db":
   version: 0.0.0-use.local
   resolution: "@ha/guest-db@workspace:apps/guest-db"