Skip to content

Commit

Permalink
Re-adding nvidia-device-plugin to production cluster (#4436)
Browse files Browse the repository at this point in the history
nvidia-device-plugin
  • Loading branch information
AntFMoJ authored May 31, 2024
1 parent 8766ed8 commit 7fa53f3
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 139 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# resource "kubernetes_manifest" "nvidia_device_plugin" {
# manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml"))
# }
resource "kubernetes_manifest" "nvidia_device_plugin" {
manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml"))
}

resource "kubernetes_manifest" "nvidia_gpu_slicing" {
manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml"))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,136 +1,136 @@
# # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
# # you may not use this file except in compliance with the License.
# # You may obtain a copy of the License at
# #
# # http://www.apache.org/licenses/LICENSE-2.0
# #
# # Unless required by applicable law or agreed to in writing, software
# # distributed under the License is distributed on an "AS IS" BASIS,
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
# ---
# apiVersion: apps/v1
# kind: DaemonSet
# metadata:
# name: nvidia-device-plugin-daemonset
# namespace: kube-system
# spec:
# selector:
# matchLabels:
# name: nvidia-device-plugin-ds
# updateStrategy:
# type: RollingUpdate
# template:
# metadata:
# labels:
# name: nvidia-device-plugin-ds
# spec:
# # @ministryofjustice/analytical-platform: We added this nodeSelector
# nodeSelector:
# gpu-compute: "true"
# tolerations:
# - key: nvidia.com/gpu
# operator: Exists
# effect: NoSchedule
# # @ministryofjustice/analytical-platform: We added this toleration
# - key: gpu-compute
# operator: Exists
# effect: NoSchedule
# # Mark this pod as a critical add-on; when enabled, the critical add-on
# # scheduler reserves resources for critical add-on pods so that they can
# # be rescheduled after a failure.
# # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
# priorityClassName: "system-node-critical"
# shareProcessNamespace: true
# serviceAccountName: nvidia-device-plugin-daemonset-service-account
# initContainers:
# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
# name: nvidia-device-plugin-init
# command: ["config-manager"]
# env:
# - name: ONESHOT
# value: "true"
# - name: NODE_NAME
# valueFrom:
# fieldRef:
# fieldPath: "spec.nodeName"
# - name: NODE_LABEL
# value: "nvidia.com/device-plugin.config"
# - name: CONFIG_FILE_SRCDIR
# value: "/available-configs"
# - name: CONFIG_FILE_DST
# value: "/config/config.yaml"
# - name: FALLBACK_STRATEGIES
# value: named,single
# - name: SEND_SIGNAL
# value: "false"
# volumeMounts:
# - name: available-configs
# mountPath: /available-configs
# - name: config
# mountPath: /config
# containers:
# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
# name: nvidia-device-plugin-sidecar
# command: ["config-manager"]
# env:
# - name: ONESHOT
# value: "false"
# - name: NODE_NAME
# valueFrom:
# fieldRef:
# fieldPath: "spec.nodeName"
# - name: NODE_LABEL
# value: "nvidia.com/device-plugin.config"
# - name: CONFIG_FILE_SRCDIR
# value: "/available-configs"
# - name: CONFIG_FILE_DST
# value: "/config/config.yaml"
# - name: FALLBACK_STRATEGIES
# value: named,single
# - name: SEND_SIGNAL
# value: "true"
# - name: SIGNAL
# value: "1" # SIGHUP
# - name: PROCESS_TO_SIGNAL
# value: "nvidia-device-plugin"
# volumeMounts:
# - name: available-configs
# mountPath: /available-configs
# - name: config
# mountPath: /config
# securityContext:
# allowPrivilegeEscalation: false
# capabilities:
# drop: ["ALL"]
# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
# name: nvidia-device-plugin-ctr
# env:
# - name: FAIL_ON_INIT_ERROR
# value: "false"
# - name: CONFIG_FILE
# value: /config/config.yaml
# securityContext:
# allowPrivilegeEscalation: false
# capabilities:
# drop: ["ALL"]
# volumeMounts:
# - name: device-plugin
# mountPath: /var/lib/kubelet/device-plugins
# - name: available-configs
# mountPath: /available-configs
# - name: config
# mountPath: /config
# volumes:
# - name: device-plugin
# hostPath:
# path: /var/lib/kubelet/device-plugins
# - name: available-configs
# configMap:
# name: nvidia-device-plugin-daemonset
# defaultMode: 444
# - name: config
# emptyDir: {}
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
spec:
selector:
matchLabels:
name: nvidia-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: nvidia-device-plugin-ds
spec:
# @ministryofjustice/analytical-platform: We added this nodeSelector
nodeSelector:
gpu-compute: "true"
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
# @ministryofjustice/analytical-platform: We added this toleration
- key: gpu-compute
operator: Exists
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
shareProcessNamespace: true
serviceAccountName: nvidia-device-plugin-daemonset-service-account
initContainers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
name: nvidia-device-plugin-init
command: ["config-manager"]
env:
- name: ONESHOT
value: "true"
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: "spec.nodeName"
- name: NODE_LABEL
value: "nvidia.com/device-plugin.config"
- name: CONFIG_FILE_SRCDIR
value: "/available-configs"
- name: CONFIG_FILE_DST
value: "/config/config.yaml"
- name: FALLBACK_STRATEGIES
value: named,single
- name: SEND_SIGNAL
value: "false"
volumeMounts:
- name: available-configs
mountPath: /available-configs
- name: config
mountPath: /config
containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
name: nvidia-device-plugin-sidecar
command: ["config-manager"]
env:
- name: ONESHOT
value: "false"
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: "spec.nodeName"
- name: NODE_LABEL
value: "nvidia.com/device-plugin.config"
- name: CONFIG_FILE_SRCDIR
value: "/available-configs"
- name: CONFIG_FILE_DST
value: "/config/config.yaml"
- name: FALLBACK_STRATEGIES
value: named,single
- name: SEND_SIGNAL
value: "true"
- name: SIGNAL
value: "1" # SIGHUP
- name: PROCESS_TO_SIGNAL
value: "nvidia-device-plugin"
volumeMounts:
- name: available-configs
mountPath: /available-configs
- name: config
mountPath: /config
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
- image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
name: nvidia-device-plugin-ctr
env:
- name: FAIL_ON_INIT_ERROR
value: "false"
- name: CONFIG_FILE
value: /config/config.yaml
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: available-configs
mountPath: /available-configs
- name: config
mountPath: /config
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: available-configs
configMap:
name: nvidia-device-plugin-daemonset
defaultMode: 444
- name: config
emptyDir: {}

0 comments on commit 7fa53f3

Please sign in to comment.