From b1b20502b97c8296561fe284acb52bdb459000a1 Mon Sep 17 00:00:00 2001 From: Andrew Smith Date: Thu, 14 Nov 2024 13:48:19 +0000 Subject: [PATCH] gpu sharing for frigate --- .pnp.cjs | 21 ++++ apps/alexa/deployment/index.jsonnet | 3 +- apps/alexa/project.json | 7 -- apps/frigate/values.yaml | 5 +- apps/gpu-scheduler/deployment/index.yaml | 131 +++++++++++++++++++++++ apps/gpu-scheduler/package.json | 15 +++ apps/gpu-scheduler/project.json | 17 +++ apps/gpu-scheduler/scripts/deploy.ts | 20 ++++ yarn.lock | 14 +++ 9 files changed, 223 insertions(+), 10 deletions(-) create mode 100755 apps/gpu-scheduler/deployment/index.yaml create mode 100755 apps/gpu-scheduler/package.json create mode 100755 apps/gpu-scheduler/project.json create mode 100755 apps/gpu-scheduler/scripts/deploy.ts diff --git a/.pnp.cjs b/.pnp.cjs index 1188df090..f9ea88b09 100755 --- a/.pnp.cjs +++ b/.pnp.cjs @@ -58,6 +58,10 @@ const RAW_RUNTIME_STATE = "name": "@ha/github-action-runners",\ "reference": "workspace:apps/github-action-runners"\ },\ + {\ + "name": "@ha/gpu-scheduler",\ + "reference": "workspace:apps/gpu-scheduler"\ + },\ {\ "name": "@ha/guest-db",\ "reference": "workspace:apps/guest-db"\ @@ -322,6 +326,7 @@ const RAW_RUNTIME_STATE = ["@ha/gaming-pc", ["workspace:apps/gaming-pc"]],\ ["@ha/github-action-runners", ["workspace:apps/github-action-runners"]],\ ["@ha/github-secrets", ["workspace:packages/github-secrets"]],\ + ["@ha/gpu-scheduler", ["workspace:apps/gpu-scheduler"]],\ ["@ha/guest-db", ["workspace:apps/guest-db"]],\ ["@ha/guest-pin-codes", ["workspace:apps/guest-pin-codes"]],\ ["@ha/guest-wifi-registrar", ["workspace:apps/guest-wifi-registrar"]],\ @@ -5959,6 +5964,22 @@ const RAW_RUNTIME_STATE = "linkType": "SOFT"\ }]\ ]],\ + ["@ha/gpu-scheduler", [\ + ["workspace:apps/gpu-scheduler", {\ + "packageLocation": "./apps/gpu-scheduler/",\ + "packageDependencies": [\ + ["@ha/gpu-scheduler", "workspace:apps/gpu-scheduler"],\ + ["@ha/configuration-api", 
"workspace:packages/configuration-api"],\ + ["@ha/configuration-workspace", "workspace:packages/configuration-workspace"],\ + ["@ha/docker", "workspace:packages/docker"],\ + ["@ha/jsonnet", "workspace:packages/jsonnet"],\ + ["@ha/kubectl", "workspace:packages/kubectl"],\ + ["@ha/nx-executors", "workspace:packages/nx-executors"],\ + ["shelljs", "npm:0.8.5"]\ + ],\ + "linkType": "SOFT"\ + }]\ + ]],\ ["@ha/guest-db", [\ ["workspace:apps/guest-db", {\ "packageLocation": "./apps/guest-db/",\ diff --git a/apps/alexa/deployment/index.jsonnet b/apps/alexa/deployment/index.jsonnet index 1754bf232..d70ba3d11 100755 --- a/apps/alexa/deployment/index.jsonnet +++ b/apps/alexa/deployment/index.jsonnet @@ -46,7 +46,7 @@ local sttContainer = k.core.v1.container.new(name='whisper', image=std.extVar('w { resources: { limits: { - 'nvidia.com/gpu': 1, + 'aliyun.com/gpu-mem': 4, }, }, } @@ -58,7 +58,6 @@ local sttDeployment = k.apps.v1.deployment.new(name='whisper', containers=[sttCo template+: { spec+: { - runtimeClassName: 'nvidia', // tolerations: [ // { key: 'nvidia.com/gpu', operator: 'Exists', effect: 'NoSchedule' }, // ], diff --git a/apps/alexa/project.json b/apps/alexa/project.json index 4ef8ca41c..4493c7ad2 100755 --- a/apps/alexa/project.json +++ b/apps/alexa/project.json @@ -10,13 +10,6 @@ "cwd": "apps/alexa", "module": "scripts/deploy.ts" } - }, - "image/push": { - "executor": "@ha/nx-executors:invoke", - "options": { - "module": "scripts/image-push.ts", - "cwd": "apps/alexa" - } } }, "tags": [], diff --git a/apps/frigate/values.yaml b/apps/frigate/values.yaml index b6140275d..b37aa67d4 100755 --- a/apps/frigate/values.yaml +++ b/apps/frigate/values.yaml @@ -18,7 +18,10 @@ coral: gpu: nvidia: enabled: true - runtimeClassName: nvidia + +resources: + limits: + aliyun.com/gpu-mem: 4 env: YOLO_MODELS: yolov7x-640 extraVolumeMounts: diff --git a/apps/gpu-scheduler/deployment/index.yaml b/apps/gpu-scheduler/deployment/index.yaml new file mode 100755 index 000000000..d32237fd5 
--- /dev/null +++ b/apps/gpu-scheduler/deployment/index.yaml @@ -0,0 +1,131 @@ +# rbac.yaml +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: gpushare-schd-extender +rules: + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - apiGroups: + - "" + resources: + - pods + verbs: + - update + - patch + - get + - list + - watch + - apiGroups: + - "" + resources: + - bindings + - pods/binding + verbs: + - create + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gpushare-schd-extender + namespace: kube-system +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: gpushare-schd-extender + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: gpushare-schd-extender +subjects: + - kind: ServiceAccount + name: gpushare-schd-extender + namespace: kube-system + +# deployment yaml +--- +kind: Deployment +apiVersion: apps/v1 +metadata: + name: gpushare-schd-extender + namespace: kube-system +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: gpushare + component: gpushare-schd-extender + template: + metadata: + labels: + app: gpushare + component: gpushare-schd-extender + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + spec: + hostNetwork: true + tolerations: + - effect: NoSchedule + operator: Exists + key: node-role.kubernetes.io/master + - effect: NoSchedule + operator: Exists + key: node.cloudprovider.kubernetes.io/uninitialized + nodeName: "k8s-node-3" + serviceAccount: gpushare-schd-extender + containers: + - name: gpushare-schd-extender + image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-schd-extender:1.11-d170d8a + env: + - name: LOG_LEVEL + value: debug + - name: PORT + value: "12345" + +# service.yaml 
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpushare-schd-extender
+  namespace: kube-system
+  labels:
+    app: gpushare
+    component: gpushare-schd-extender
+spec:
+  type: NodePort
+  ports:
+    - port: 12345
+      name: http
+      targetPort: 12345
+      nodePort: 32766
+  selector:
+    # select the gpushare-schd-extender pods
+    app: gpushare
+    component: gpushare-schd-extender
diff --git a/apps/gpu-scheduler/package.json b/apps/gpu-scheduler/package.json
new file mode 100755
index 000000000..87cf0455b
--- /dev/null
+++ b/apps/gpu-scheduler/package.json
@@ -0,0 +1,15 @@
+{
+  "private": true,
+  "name": "@ha/gpu-scheduler",
+  "version": "0.0.1",
+  "license": "MIT",
+  "devDependencies": {
+    "@ha/configuration-api": "workspace:^0.0.1",
+    "@ha/configuration-workspace": "workspace:^0.0.1",
+    "@ha/docker": "workspace:^0.0.1",
+    "@ha/jsonnet": "workspace:^0.0.1",
+    "@ha/kubectl": "workspace:^1.0.0",
+    "@ha/nx-executors": "workspace:^0.1.0",
+    "shelljs": "^0.8.5"
+  }
+}
diff --git a/apps/gpu-scheduler/project.json b/apps/gpu-scheduler/project.json
new file mode 100755
index 000000000..c50e43168
--- /dev/null
+++ b/apps/gpu-scheduler/project.json
@@ -0,0 +1,17 @@
+{
+  "name": "gpu-scheduler",
+  "$schema": "../../node_modules/nx/schemas/project-schema.json",
+  "sourceRoot": "apps/gpu-scheduler",
+  "projectType": "application",
+  "targets": {
+    "deploy": {
+      "executor": "@ha/nx-executors:invoke",
+      "options": {
+        "cwd": "apps/gpu-scheduler",
+        "module": "scripts/deploy.ts"
+      }
+    }
+  },
+  "tags": [],
+  "implicitDependencies": []
+}
diff --git a/apps/gpu-scheduler/scripts/deploy.ts b/apps/gpu-scheduler/scripts/deploy.ts
new file mode 100755
index 000000000..44c7340de
--- /dev/null
+++ b/apps/gpu-scheduler/scripts/deploy.ts
@@ -0,0 +1,32 @@
+import type { ConfigurationApi } from "@ha/configuration-api"
+import type { Configuration } from "@ha/configuration-workspace"
+import path from "path"
+import sh from "shelljs"
+
+/**
+ * Deploys the gpushare scheduler extender manifest (deployment/index.yaml).
+ *
+ * Uses `kubectl apply` rather than `kubectl create` so that re-running the
+ * deploy target updates existing resources instead of failing because they
+ * already exist.
+ *
+ * @param configurationApi - Not read by this script.
+ * @throws Error if kubectl exits with a non-zero status.
+ * @see https://github.com/AliyunContainerService/gpushare-scheduler-extender/blob/master/docs/install.md
+ */
+const run = async (
+  configurationApi: ConfigurationApi<Configuration>,
+): Promise<void> => {
+  const manifestPath = path.join(__dirname, "..", "deployment", "index.yaml")
+  // Quote the path so a checkout directory containing spaces still works.
+  const result = sh.exec(`kubectl apply -f "${manifestPath}"`)
+
+  // shelljs does not throw on failure by default; surface it to the caller.
+  if (result.code !== 0) {
+    throw new Error(
+      `kubectl apply failed with exit code ${result.code}: ${result.stderr}`,
+    )
+  }
+}
+
+export default run
diff --git a/yarn.lock b/yarn.lock
index c4584524e..ac1df5b0c 100755
--- a/yarn.lock
+++ b/yarn.lock
@@ -3600,6 +3600,20 @@ __metadata:
   languageName: unknown
   linkType: soft
 
+"@ha/gpu-scheduler@workspace:apps/gpu-scheduler":
+  version: 0.0.0-use.local
+  resolution: "@ha/gpu-scheduler@workspace:apps/gpu-scheduler"
+  dependencies:
+    "@ha/configuration-api": "workspace:^0.0.1"
+    "@ha/configuration-workspace": "workspace:^0.0.1"
+    "@ha/docker": "workspace:^0.0.1"
+    "@ha/jsonnet": "workspace:^0.0.1"
+    "@ha/kubectl": "workspace:^1.0.0"
+    "@ha/nx-executors": "workspace:^0.1.0"
+    shelljs: "npm:^0.8.5"
+  languageName: unknown
+  linkType: soft
+
 "@ha/guest-db@workspace:apps/guest-db":
   version: 0.0.0-use.local
   resolution: "@ha/guest-db@workspace:apps/guest-db"