From 417d2015cb680d5566ab22c8bbf8447e41ec7895 Mon Sep 17 00:00:00 2001 From: Lukas Metzner Date: Tue, 12 Nov 2024 09:55:42 +0100 Subject: [PATCH] fix: reverted NodeGetInfo response as it breaks Nomad clusters (#776) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We modified the response for `NodeGetInfo` to return an additional Topology Segment. We assumed that this only “adds” new info, but in practice it breaks the spec. When trying to schedule a volume to nodes, the container orchestration systems should verify that the Node fulfills at least one Accessible Topology of the Node, where “fulfills” means that all supplied segments match. This is not implemented in the same way between Kubernetes and Nomad. - **Kubernetes**: requirements are fulfilled if the volume specifies a subset of the Nodes topology - **Nomad**: requirements are fulfilled if the volume specifies all of the Nodes topology We made these changes to work around a bug in the Kubernetes scheduler ([here](https://github.com/kubernetes-csi/external-provisioner/issues/544)) where nodes without the CSI Plugin would still be considered for scheduling, but then creating and attaching the volume fails with no automatic reconciliation of this error. --- chart/templates/core/storageclass.yaml | 7 ------- chart/values.yaml | 2 -- docs/kubernetes/README.md | 14 ++------------ internal/driver/driver.go | 1 - internal/driver/node.go | 1 - 5 files changed, 2 insertions(+), 23 deletions(-) diff --git a/chart/templates/core/storageclass.yaml b/chart/templates/core/storageclass.yaml index 5771edca..1bc246d2 100644 --- a/chart/templates/core/storageclass.yaml +++ b/chart/templates/core/storageclass.yaml @@ -10,13 +10,6 @@ provisioner: csi.hetzner.cloud volumeBindingMode: WaitForFirstConsumer allowVolumeExpansion: true reclaimPolicy: {{ $val.reclaimPolicy | quote }} -{{- if $val.allowedTopologyCloudServer }} -allowedTopologies: -- matchLabelExpressions: - - key: instance.hetzner.cloud/provided-by - values: - - "cloud" -{{- end }} --- {{- end }} {{- end }} \ No newline at end of file diff --git a/chart/values.yaml b/chart/values.yaml index 3508ecb8..481862f7 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -728,5 +728,3 @@ storageClasses: - name: hcloud-volumes defaultStorageClass: true reclaimPolicy: Delete - ## @param storageClass.allowedTopologyCloudServer Prevents pods from being scheduled on nodes, specifically Robot servers, where Hetzner volumes are unavailable - allowedTopologyCloudServer: false diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index b73d0c99..fe85dc71 100644 --- a/docs/kubernetes/README.md +++ b/docs/kubernetes/README.md @@ -209,19 +209,9 @@ $ kubectl apply -f https://raw.githubusercontent.com/hetznercloud/csi-driver/v2. ## Integration with Root Servers -Root servers can be part of the cluster, but the CSI plugin doesn't work there and the current behaviour of the scheduler can cause Pods to be stuck in `Pending`. +Root servers can be part of the cluster, but the CSI plugin doesn't work there. -In the Helm Chart you can set `allowedTopologyCloudServer` to true to prevent pods from being scheduled on nodes, specifically Robot servers, where Hetzner volumes are unavailable. This value can not be changed after the initial creation of a storage class. - -```yaml -storageClasses: - - name: hcloud-volumes - defaultStorageClass: true - reclaimPolicy: Delete - allowedTopologyCloudServer: true # <--- -``` - -To ensure proper topology evaluation, labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually. +Labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually. ### Adding labels manually diff --git a/internal/driver/driver.go b/internal/driver/driver.go index ea8f7e7e..dc2f65f4 100644 --- a/internal/driver/driver.go +++ b/internal/driver/driver.go @@ -9,5 +9,4 @@ const ( DefaultVolumeSize = MinVolumeSize TopologySegmentLocation = PluginName + "/location" - ProvidedByLabel = "instance.hetzner.cloud/provided-by" ) diff --git a/internal/driver/node.go b/internal/driver/node.go index 6716ff02..5b084ea1 100644 --- a/internal/driver/node.go +++ b/internal/driver/node.go @@ -178,7 +178,6 @@ func (s *NodeService) NodeGetInfo(_ context.Context, _ *proto.NodeGetInfoRequest AccessibleTopology: &proto.Topology{ Segments: map[string]string{ TopologySegmentLocation: s.serverLocation, - ProvidedByLabel: "cloud", }, }, }