Skip to content

Commit

Permalink
Change Disruption behavior at end of lifecycle (#488)
Browse files Browse the repository at this point in the history
* Tune log statement down to DEBUG

* Always round activeDeadlineSeconds up

* Don't try to clean chaos pods that have expired

* Add Events on pod and disruption after duration

* Reduce default gc delay to 10m

* Tune log statement down to debug

* Make docs match config change
  • Loading branch information
ptnapoleon authored Feb 1, 2022
1 parent 800e01f commit 1fb906c
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 15 deletions.
2 changes: 1 addition & 1 deletion chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ controller:
enabled: false
tokenFilepath: ""
defaultDuration: 1h # default spec.duration for a disruption with none specified
expiredDisruptionGCDelay: 15m # time after a disruption expires before deleting it
expiredDisruptionGCDelay: 10m # time after a disruption expires before deleting it
webhook: # admission webhook configuration
generateCert: false # if you want Helm to generate certificates (e.g. in case the cert-manager is not installed in the cluster) set this to true
certDir: "" # certificate directory (must contain tls.crt and tls.key files)
Expand Down
23 changes: 10 additions & 13 deletions controllers/disruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,20 +189,17 @@ func (r *DisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{}, fmt.Errorf("error updating disruption injection status: %w", err)
}

isCleaned, err := r.cleanDisruption(instance)
r.Recorder.Event(instance, "Normal", "DurationOver", fmt.Sprintf("The disruption has lived longer than its specified duration, and will be garbage collected after %s.", r.ExpiredDisruptionGCDelay))

chaosPods, err := r.getChaosPods(instance, nil)
if err != nil {
r.log.Errorw("couldn't list instance's chaos pods", "err", err)

return ctrl.Result{}, err
}

if !isCleaned {
requeueAfter := time.Duration(rand.Intn(5)+5) * time.Second //nolint:gosec

r.log.Infow(fmt.Sprintf("disruption has not been fully cleaned yet, re-queuing in %v", requeueAfter))

return ctrl.Result{
Requeue: true,
RequeueAfter: requeueAfter,
}, r.Update(context.Background(), instance)
for _, pod := range chaosPods {
r.Recorder.Event(&pod, "Normal", "DurationOver", fmt.Sprintf("The chaos pod has lived longer than the disruption duration, and will be garbage collected after %s.", r.ExpiredDisruptionGCDelay))
}

requeueDelay := r.ExpiredDisruptionGCDelay
Expand Down Expand Up @@ -411,7 +408,7 @@ func (r *DisruptionReconciler) startInjection(instance *chaosv1beta1.Disruption)
r.recordEventOnTarget(instance, target, corev1.EventTypeWarning, "Disrupted", fmt.Sprintf("Pod %s from disruption %s targeted this resource for injection", chaosPod.Name, instance.Name))
r.handleMetricSinkError(r.MetricsSink.MetricPodsCreated(target, instance.Name, instance.Namespace, true))
case 1:
r.log.Infow("an injection pod is already existing for the selected target", "target", target, "chaosPod", found[0].Name)
r.log.Debugw("an injection pod is already existing for the selected target", "target", target, "chaosPod", found[0].Name)
default:
var chaosPodNames []string
for _, pod := range found {
Expand Down Expand Up @@ -833,7 +830,7 @@ func (r *DisruptionReconciler) getChaosPods(instance *chaosv1beta1.Disruption, l
podNames = append(podNames, pod.Name)
}

r.log.Infow("searching for chaos pods with label selector...", "labels", ls.String(), "foundPods", podNames)
r.log.Debugw("searching for chaos pods with label selector...", "labels", ls.String(), "foundPods", podNames)

return pods.Items, nil
}
Expand All @@ -849,7 +846,7 @@ func (r *DisruptionReconciler) generatePod(instance *chaosv1beta1.Disruption, ta
// the signal sent to a pod becomes SIGKILL, which will interrupt any in-progress cleaning. By doubling this to 1 minute in the pod spec itself,
// we ensure that whether a chaos pod is deleted directly or by deleting a disruption, it will have time to finish cleaning up after itself.
terminationGracePeriod := int64(60)
activeDeadlineSeconds := int64(calculateRemainingDuration(*instance).Seconds())
activeDeadlineSeconds := int64(calculateRemainingDuration(*instance).Seconds()) + 1

if activeDeadlineSeconds < 1 {
return nil
Expand Down
2 changes: 1 addition & 1 deletion docs/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ that the Disruption resource is created, not from when the injection of the actu
If a `duration` is not specified, then a disruption will receive the default duration, which is configured at the controller level by setting
`controller.defaultDuration` in the controller's config map, and this value defaults to 1 hour.

After a disruption's duration expires, the disruption resource will live in k8s for a default of 15 minutes. This can be configured by altering
After a disruption's duration expires, the disruption resource will live in k8s for a default of 10 minutes. This can be configured by altering
`controller.expiredDisruptionGCDelay` in the controller's config map.

## Pulse
Expand Down

0 comments on commit 1fb906c

Please sign in to comment.