diff --git a/chart/templates/controller/deployment.yaml b/chart/templates/controller/deployment.yaml index 5d178257..f375d8c3 100644 --- a/chart/templates/controller/deployment.yaml +++ b/chart/templates/controller/deployment.yaml @@ -167,6 +167,10 @@ spec: key: {{ .Values.controller.hcloudToken.existingSecret.key }} {{- end }} {{- end }} + {{- if .Values.global.enableProvidedByTopology}} + - name: ENABLE_PROVIDED_BY_TOPOLOGY + value: "t" + {{ end }} {{- if .Values.controller.extraEnvVars }} {{- include "common.tplvalues.render" (dict "value" .Values.controller.extraEnvVars "context" $) | nindent 12 }} {{- end }} diff --git a/chart/templates/core/storageclass.yaml b/chart/templates/core/storageclass.yaml index 1bc246d2..f5f282c8 100644 --- a/chart/templates/core/storageclass.yaml +++ b/chart/templates/core/storageclass.yaml @@ -1,3 +1,4 @@ +{{ $global := .Values.global }} {{- if .Values.storageClasses }} {{- range $key, $val := .Values.storageClasses }} kind: StorageClass @@ -10,6 +11,13 @@ provisioner: csi.hetzner.cloud volumeBindingMode: WaitForFirstConsumer allowVolumeExpansion: true reclaimPolicy: {{ $val.reclaimPolicy | quote }} +{{- if $global.enableProvidedByTopology }} +allowedTopologies: +- matchLabelExpressions: + - key: instance.hetzner.cloud/provided-by + values: + - "cloud" +{{- end}} --- {{- end }} {{- end }} \ No newline at end of file diff --git a/chart/templates/node/daemonset.yaml b/chart/templates/node/daemonset.yaml index ec1ee4c5..cde83e3b 100644 --- a/chart/templates/node/daemonset.yaml +++ b/chart/templates/node/daemonset.yaml @@ -121,6 +121,10 @@ spec: {{- end }} - name: ENABLE_METRICS value: {{if .Values.metrics.enabled}}"true"{{ else }}"false"{{end}} + {{- if .Values.global.enableProvidedByTopology}} + - name: ENABLE_PROVIDED_BY_TOPOLOGY + value: "t" + {{ end }} {{- if .Values.node.extraEnvVars }} {{- include "common.tplvalues.render" (dict "value" .Values.node.extraEnvVars "context" $) | nindent 12 }} {{- end }} diff --git a/chart/values.yaml b/chart/values.yaml index 481862f7..bb6d548e 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -11,6 +11,14 @@ global: ## - myRegistryKeySecretName ## imagePullSecrets: [] + ## @param node.enableProvidedByTopology Enables workaround for upstream Kubernetes issue where nodes without the CSI plugin are still considered for scheduling. + ## ref: https://github.com/kubernetes-csi/external-provisioner/issues/544 + ## Warning: Once enabled, this feature cannot be easily disabled. + ## It automatically adds required nodeAffinities to each volume and the topology keys to `csinode` objects. + ## If the feature is later disabled, the topology keys are removed from the `csinode` objects, leaving volumes with required affinities that cannot be satisfied. + ## Note: After enabling this feature, the workaround for the Kubernetes upstream issue only works on newly created volumes, as old volumes are not updated with the required node affinity. + ## + enableProvidedByTopology: false ## @section Common parameters ## @@ -648,6 +656,7 @@ node: ## kubeletDir: /var/lib/k0s/kubelet ## For microk8s: ## kubeletDir: /var/snap/microk8s/common/var/lib/kubelet + ## kubeletDir: /var/lib/kubelet ## @section Other Parameters diff --git a/cmd/aio/main.go b/cmd/aio/main.go index b610195a..84abb09d 100644 --- a/cmd/aio/main.go +++ b/cmd/aio/main.go @@ -63,10 +63,13 @@ func main() { volumeResizeService := volumes.NewLinuxResizeService(logger.With("component", "linux-resize-service")) volumeStatsService := volumes.NewLinuxStatsService(logger.With("component", "linux-stats-service")) + enableProvidedByTopology := app.GetEnableProvidedByTopology() + nodeService := driver.NewNodeService( logger.With("component", "driver-node-service"), strconv.FormatInt(serverID, 10), serverLocation, + enableProvidedByTopology, volumeMountService, volumeResizeService, volumeStatsService, @@ -84,6 +87,7 @@ func main() { logger.With("component", "driver-controller-service"), volumeService, serverLocation, + enableProvidedByTopology, ) // common diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 27a5c769..a29e6e26 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -54,6 +54,8 @@ func main() { location = server.Datacenter.Location.Name } + enableProvidedByTopology := app.GetEnableProvidedByTopology() + volumeService := volumes.NewIdempotentService( logger.With("component", "idempotent-volume-service"), api.NewVolumeService( @@ -65,6 +67,7 @@ func main() { logger.With("component", "driver-controller-service"), volumeService, location, + enableProvidedByTopology, ) identityService := driver.NewIdentityService( diff --git a/cmd/node/main.go b/cmd/node/main.go index 772ec47e..c3daed82 100644 --- a/cmd/node/main.go +++ b/cmd/node/main.go @@ -53,10 +53,13 @@ func main() { volumeStatsService := volumes.NewLinuxStatsService(logger.With("component", "linux-stats-service")) identityService := driver.NewIdentityService(logger.With("component", "driver-identity-service")) + enableProvidedByTopology := app.GetEnableProvidedByTopology() + nodeService := driver.NewNodeService( logger.With("component", "driver-node-service"), strconv.FormatInt(serverID, 10), serverLocation, + enableProvidedByTopology, volumeMountService, volumeResizeService, volumeStatsService, diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index fe85dc71..28ad2dda 100644 --- a/docs/kubernetes/README.md +++ b/docs/kubernetes/README.md @@ -209,9 +209,7 @@ $ kubectl apply -f https://raw.githubusercontent.com/hetznercloud/csi-driver/v2. ## Integration with Root Servers -Root servers can be part of the cluster, but the CSI plugin doesn't work there. - -Labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually. +Root servers can be part of the cluster, but the CSI plugin doesn't work there. To ensure proper topology evaluation, labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually. ### Adding labels manually @@ -240,6 +238,24 @@ kubectl label nodes instance.hetzner.cloud/is-root-server=false kubectl label nodes instance.hetzner.cloud/is-root-server=true ``` +### Pods stuck in pending +The current behavior of the scheduler can cause Pods to be stuck in `Pending` when using the integration with Robot servers. + +To address this behavior, you can set `enableProvidedByTopology` to `true` in the Helm Chart configuration. This setting prevents pods from being scheduled on nodes — specifically, Robot servers — where Hetzner volumes are unavailable. Enabling this option adds the `instance.hetzner.cloud/provided-by` label to the [allowed topologies](https://kubernetes.io/docs/concepts/storage/storage-classes/#allowed-topologies) section of the storage classes that are created. Additionally, this label is included in the `topologyKeys` section of `csinode` objects, and a node affinity is set up for each persistent volume. This workaround does not work with the [old label](#deprecated-old-label). + +> [!WARNING] +> Once enabled, this feature cannot be easily disabled. It automatically adds required nodeAffinities to each volume and the topology keys to `csinode` objects. If the feature is later disabled, the topology keys are removed from the `csinode` objects, leaving volumes with required affinities that cannot be satisfied. + +> [!NOTE] +> After enabling this feature, the workaround for the Kubernetes upstream issue only works on newly created volumes, as old volumes are not updated with the required node affinity. + +```yaml +global: + enableProvidedByTopology: true +``` + +Further information on the upstream issue can be found [here](https://github.com/kubernetes-csi/external-provisioner/issues/544). + ## Versioning policy We aim to support the latest three versions of Kubernetes. When a Kubernetes diff --git a/internal/app/app.go b/internal/app/app.go index 7118240e..9051907a 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -49,6 +49,15 @@ func CreateLogger() *slog.Logger { return logger } +// GetEnableProvidedByTopology parses the ENABLE_PROVIDED_BY_TOPOLOGY environment variable and returns false by default. +func GetEnableProvidedByTopology() bool { + var enableProvidedByTopology bool + if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists { + enableProvidedByTopology, _ = strconv.ParseBool(featFlag) + } + return enableProvidedByTopology +} + // CreateListener creates and binds the unix socket in location specified by the CSI_ENDPOINT environment variable. func CreateListener() (net.Listener, error) { endpoint := os.Getenv("CSI_ENDPOINT") diff --git a/internal/driver/controller.go b/internal/driver/controller.go index d7572145..2591a920 100644 --- a/internal/driver/controller.go +++ b/internal/driver/controller.go @@ -18,20 +18,23 @@ import ( type ControllerService struct { proto.UnimplementedControllerServer - logger *slog.Logger - volumeService volumes.Service - location string + logger *slog.Logger + volumeService volumes.Service + location string + enableProvidedByTopology bool } func NewControllerService( logger *slog.Logger, volumeService volumes.Service, location string, + enableProvidedByTopology bool, ) *ControllerService { return &ControllerService{ - logger: logger, - volumeService: volumeService, - location: location, + logger: logger, + volumeService: volumeService, + location: location, + enableProvidedByTopology: enableProvidedByTopology, } } @@ -88,16 +91,22 @@ func (s *ControllerService) CreateVolume(ctx context.Context, req *proto.CreateV "volume-name", volume.Name, ) + topology := &proto.Topology{ + Segments: map[string]string{ + TopologySegmentLocation: volume.Location, + }, + } + + if s.enableProvidedByTopology { + topology.Segments[ProvidedByLabel] = "cloud" + } + resp := &proto.CreateVolumeResponse{ Volume: &proto.Volume{ VolumeId: strconv.FormatInt(volume.ID, 10), CapacityBytes: volume.SizeBytes(), AccessibleTopology: []*proto.Topology{ - { - Segments: map[string]string{ - TopologySegmentLocation: volume.Location, - }, - }, + topology, }, VolumeContext: map[string]string{ "fsFormatOptions": req.Parameters["fsFormatOptions"], diff --git a/internal/driver/controller_test.go b/internal/driver/controller_test.go index 8b781714..d1f7c0b5 100644 --- a/internal/driver/controller_test.go +++ b/internal/driver/controller_test.go @@ -33,6 +33,7 @@ func newControllerServiceTestEnv() *controllerServiceTestEnv { logger, volumeService, "testloc", + false, ), volumeService: volumeService, } @@ -41,6 +42,8 @@ func newControllerServiceTestEnv() *controllerServiceTestEnv { func TestControllerServiceCreateVolume(t *testing.T) { env := newControllerServiceTestEnv() + env.service.enableProvidedByTopology = true + env.volumeService.CreateFunc = func(ctx context.Context, opts volumes.CreateOpts) (*csi.Volume, error) { if opts.Name != "testvol" { t.Errorf("unexpected name passed to volume service: %s", opts.Name) @@ -94,6 +97,11 @@ func TestControllerServiceCreateVolume(t *testing.T) { if loc := top.Segments[TopologySegmentLocation]; loc != "testloc" { t.Errorf("unexpected location segment in topology: %s", loc) } + if env.service.enableProvidedByTopology { + if provider := top.Segments[ProvidedByLabel]; provider != "cloud" { + t.Errorf("unexpected provider segment in topology: %s", provider) + } + } } else { t.Errorf("unexpected number of topologies: %d", len(resp.Volume.AccessibleTopology)) } diff --git a/internal/driver/driver.go b/internal/driver/driver.go index 8fbcd0f6..ce7d8c29 100644 --- a/internal/driver/driver.go +++ b/internal/driver/driver.go @@ -9,4 +9,5 @@ const ( DefaultVolumeSize = MinVolumeSize TopologySegmentLocation = PluginName + "/location" + ProvidedByLabel = "instance.hetzner.cloud/provided-by" ) diff --git a/internal/driver/node.go b/internal/driver/node.go index 5b084ea1..0e03ee44 100644 --- a/internal/driver/node.go +++ b/internal/driver/node.go @@ -15,29 +15,32 @@ import ( type NodeService struct { proto.UnimplementedNodeServer - logger *slog.Logger - serverID string - serverLocation string - volumeMountService volumes.MountService - volumeResizeService volumes.ResizeService - volumeStatsService volumes.StatsService + logger *slog.Logger + serverID string + serverLocation string + enableProvidedByTopology bool + volumeMountService volumes.MountService + volumeResizeService volumes.ResizeService + volumeStatsService volumes.StatsService } func NewNodeService( logger *slog.Logger, serverID string, serverLocation string, + enableProvidedByTopology bool, volumeMountService volumes.MountService, volumeResizeService volumes.ResizeService, volumeStatsService volumes.StatsService, ) *NodeService { return &NodeService{ - logger: logger, - serverID: serverID, - serverLocation: serverLocation, - volumeMountService: volumeMountService, - volumeResizeService: volumeResizeService, - volumeStatsService: volumeStatsService, + logger: logger, + serverID: serverID, + serverLocation: serverLocation, + enableProvidedByTopology: enableProvidedByTopology, + volumeMountService: volumeMountService, + volumeResizeService: volumeResizeService, + volumeStatsService: volumeStatsService, } } @@ -181,6 +184,11 @@ func (s *NodeService) NodeGetInfo(_ context.Context, _ *proto.NodeGetInfoRequest }, }, } + + if s.enableProvidedByTopology { + resp.AccessibleTopology.Segments[ProvidedByLabel] = "cloud" + } + return resp, nil } diff --git a/internal/driver/node_test.go b/internal/driver/node_test.go index 624c2afc..22a19823 100644 --- a/internal/driver/node_test.go +++ b/internal/driver/node_test.go @@ -35,6 +35,7 @@ func newNodeServerTestEnv() nodeServiceTestEnv { testutil.NewNopLogger(), "1", "loc", + false, volumeMountService, volumeResizeService, volumeStatsService, diff --git a/internal/driver/sanity_test.go b/internal/driver/sanity_test.go index 43094309..4b34609c 100644 --- a/internal/driver/sanity_test.go +++ b/internal/driver/sanity_test.go @@ -46,6 +46,7 @@ func TestSanity(t *testing.T) { logger.With("component", "driver-controller-service"), volumeService, "testloc", + false, ) identityService := NewIdentityService( @@ -56,6 +57,7 @@ func TestSanity(t *testing.T) { logger.With("component", "driver-node-service"), "123456", "loc", + false, volumeMountService, volumeResizeService, volumeStatsService,