
PCP-2902: skip killing CSI pod after first drain #182

Closed · wants to merge 9 commits
1 change: 1 addition & 0 deletions api/v1alpha3/zz_generated.conversion.go

Generated file; diff not rendered.

1 change: 1 addition & 0 deletions api/v1alpha4/zz_generated.conversion.go

Generated file; diff not rendered.

5 changes: 4 additions & 1 deletion api/v1beta1/machine_types.go
@@ -19,7 +19,6 @@ package v1beta1
 import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
 	capierrors "sigs.k8s.io/cluster-api/errors"
 )

@@ -121,6 +120,10 @@ type MachineSpec struct {
 	// Defaults to 10 seconds.
 	// +optional
 	NodeDeletionTimeout *metav1.Duration `json:"nodeDeletionTimeout,omitempty"`
+
+	// NodeDrainPodFilters allows specifying filters for pods to be excluded during node drain
+	// +optional
+	NodeDrainPodFilters *metav1.LabelSelector `json:"nodeDrainPodFilters,omitempty"`
 }
 
 // ANCHOR_END: MachineSpec
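For illustration, the new field is a standard metav1.LabelSelector, so a Machine that should keep its CSI driver pods in place during drain could be built as in the sketch below. This is a minimal, hypothetical example: the `app=csi-driver` label is assumed for illustration and must match whatever labels the pods to be skipped actually carry.

```go
package main

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

func main() {
	m := clusterv1.Machine{
		Spec: clusterv1.MachineSpec{
			// Pods matching this selector are skipped (not evicted) during node drain.
			// "app=csi-driver" is a hypothetical label used for illustration only;
			// any matchLabels/matchExpressions supported by metav1.LabelSelector works.
			NodeDrainPodFilters: &metav1.LabelSelector{
				MatchLabels: map[string]string{"app": "csi-driver"},
			},
		},
	}
	_ = m
}
```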
5 changes: 5 additions & 0 deletions api/v1beta1/zz_generated.deepcopy.go

Generated file; diff not rendered.

8 changes: 7 additions & 1 deletion api/v1beta1/zz_generated.openapi.go

Generated file; diff not rendered.

47 changes: 47 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml

Generated file; diff not rendered.

47 changes: 47 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinepools.yaml

Generated file; diff not rendered.

46 changes: 46 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machines.yaml

Generated file; diff not rendered.

47 changes: 47 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinesets.yaml

Generated file; diff not rendered.

21 changes: 18 additions & 3 deletions internal/controllers/machine/machine_controller.go
@@ -352,7 +352,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
 				return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
 			}
 
-			if result, err := r.drainNode(ctx, cluster, m.Status.NodeRef.Name); !result.IsZero() || err != nil {
+			if result, err := r.drainNode(ctx, cluster, m); !result.IsZero() || err != nil {
 				if err != nil {
 					conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingFailedReason, clusterv1.ConditionSeverityWarning, err.Error())
 					r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedDrainNode", "error draining Machine's node %q: %v", m.Status.NodeRef.Name, err)
@@ -572,7 +572,8 @@ func (r *Reconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1
 	return nil
 }
 
-func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster, nodeName string) (ctrl.Result, error) {
+func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (ctrl.Result, error) {
+	nodeName := m.Status.NodeRef.Name
 	log := ctrl.LoggerFrom(ctx, "Node", klog.KRef("", nodeName))
 
 	restConfig, err := remote.RESTConfig(ctx, controllerName, r.Client, util.ObjectKey(cluster))
@@ -620,8 +621,10 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
 		}},
 		// SPECTRO: Even if the node is reachable, we wait 30 minutes for drain completion else move ahead
 		SkipWaitForDeleteTimeoutSeconds: 60 * 30, // 30 minutes
+		AdditionalFilters: []kubedrain.PodFilter{
+			SkipFuncGenerator(m.Spec.NodeDrainPodFilters),
+		},
 	}
-
 	if noderefutil.IsNodeUnreachable(node) {
 		// When the node is unreachable and some pods are not evicted for as long as this timeout, we ignore them.
 		drainer.SkipWaitForDeleteTimeoutSeconds = 60 * 5 // 5 minutes
@@ -643,6 +646,18 @@
 	return ctrl.Result{}, nil
 }
 
+func SkipFuncGenerator(labelSelector *metav1.LabelSelector) func(pod corev1.Pod) kubedrain.PodDeleteStatus {
+	return func(pod corev1.Pod) kubedrain.PodDeleteStatus {
+		if labelSelector == nil || pod.Labels == nil {
+			return kubedrain.MakePodDeleteStatusOkay()
+		}
+		if HasMatchingLabels(*labelSelector, pod.ObjectMeta.Labels) {
+			return kubedrain.MakePodDeleteStatusSkip()
+		}
+		return kubedrain.MakePodDeleteStatusOkay()
+	}
+}
+
 // shouldWaitForNodeVolumes returns true if the node status still has volumes attached.
 // Pod deletion and volume detach happen asynchronously, so a pod could be deleted before its volume is detached from the node.
 // This could cause issues for some storage provisioners; for vsphere-volume, for example, this is problematic.
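HasMatchingLabels is not part of this diff. Below is a minimal sketch of how it could be implemented with the standard apimachinery selector utilities, plus a small demonstration of the skip semantics; the helper's real definition lives elsewhere in the repository and may differ.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
)

// HasMatchingLabels reports whether podLabels satisfy the selector.
// Sketch only: the actual helper in the repository may differ.
func HasMatchingLabels(selector metav1.LabelSelector, podLabels map[string]string) bool {
	s, err := metav1.LabelSelectorAsSelector(&selector)
	if err != nil {
		return false // an invalid selector filters nothing
	}
	if s.Empty() {
		return false // an empty selector would match every pod; treat it as "no filter"
	}
	return s.Matches(labels.Set(podLabels))
}

func main() {
	// Hypothetical label, mirroring the NodeDrainPodFilters example above.
	sel := metav1.LabelSelector{MatchLabels: map[string]string{"app": "csi-driver"}}
	pod := corev1.Pod{ObjectMeta: metav1.ObjectMeta{
		Labels: map[string]string{"app": "csi-driver"},
	}}
	// When SkipFuncGenerator is wired into kubedrain.Helper.AdditionalFilters,
	// a match yields MakePodDeleteStatusSkip(), so the drain leaves this pod alone.
	fmt.Println(HasMatchingLabels(sel, pod.Labels)) // true
}
```

Because kubedrain.MakePodDeleteStatusSkip() marks a pod with Delete: false, the drain helper leaves matching pods (for example, the CSI pods this PR targets) in place instead of evicting them.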
2 changes: 1 addition & 1 deletion spectro/generated/bootstrap-base.yaml
@@ -26,7 +26,7 @@ spec:
         - --bootstrap-token-ttl=${KUBEADM_BOOTSTRAP_TOKEN_TTL:=15m}
         command:
         - /manager
-        image: gcr.io/spectro-dev-public/release/kubeadm-bootstrap-controller-amd64:20220805
+        image: gcr.io/spectro-dev-public/devop2023/release-fips/kubeadm-bootstrap-controller:v1.3.2-spectro-4.0.0-dev
         imagePullPolicy: Always
         name: manager
         terminationGracePeriodSeconds: 10