From 44b4b753f03a961e2373a5b580bde9196b4db016 Mon Sep 17 00:00:00 2001 From: Philip Thompson Date: Thu, 10 Feb 2022 13:01:38 -0500 Subject: [PATCH] Always clean up a node failure chaos pod (#503) * Always clean up a node failure chaos pod * Mark Node Failure chaos pods as ready before they inject * Make the tests take way longer, should we just remove them? --- controllers/disruption_controller.go | 7 +++++++ injector/node_failure.go | 21 +++++++++++---------- injector/node_failure_test.go | 2 ++ 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/controllers/disruption_controller.go b/controllers/disruption_controller.go index cce2b0e39..597d9460f 100644 --- a/controllers/disruption_controller.go +++ b/controllers/disruption_controller.go @@ -572,6 +572,13 @@ func (r *DisruptionReconciler) handleChaosPodsTermination(instance *chaosv1beta1 } } + // It is always safe to remove a node failure chaos pod. It is usually hard to tell if a node failure chaos pod has + // succeeded or not, so we choose to always remove the finalizer. + if chaosPod.Labels[chaostypes.DisruptionKindLabel] == chaostypes.DisruptionKindNodeFailure { + removeFinalizer = true + ignoreStatus = true + } + // check the chaos pod status to determine if we can safely delete it or not switch chaosPod.Status.Phase { case corev1.PodSucceeded, corev1.PodPending: diff --git a/injector/node_failure.go b/injector/node_failure.go index 7ee91637b..5b5f8121a 100644 --- a/injector/node_failure.go +++ b/injector/node_failure.go @@ -81,18 +81,19 @@ func (i nodeFailureInjector) Inject() error { i.config.Log.Infow("from this point, if no fatal log occurs, the injection succeeded and the system will crash") _ = i.config.Log.Sync() // If we can't flush the logger, why would logging the error help? so we just ignore - // Wait ten seconds for the logs to be flushed and collected, as the shutdown will be immediate - time.Sleep(time.Second * 10) + go func() { // Wait ten seconds for the logs to be flushed and collected, as the shutdown will be immediate + time.Sleep(time.Second * 10) - if i.spec.Shutdown { - err = i.config.FileWriter.Write(i.sysrqTriggerPath, 0200, "o") - } else { - err = i.config.FileWriter.Write(i.sysrqTriggerPath, 0200, "c") - } + if i.spec.Shutdown { + err = i.config.FileWriter.Write(i.sysrqTriggerPath, 0200, "o") + } else { + err = i.config.FileWriter.Write(i.sysrqTriggerPath, 0200, "c") + } - if err != nil { - return fmt.Errorf("error while writing to the sysrq trigger file (%s): %w", i.sysrqTriggerPath, err) - } + if err != nil { + i.config.Log.Errorf("error while writing to the sysrq trigger file (%s): %v", i.sysrqTriggerPath, err) + } + }() return nil } diff --git a/injector/node_failure_test.go b/injector/node_failure_test.go index af42c133c..45706128c 100644 --- a/injector/node_failure_test.go +++ b/injector/node_failure_test.go @@ -7,6 +7,7 @@ package injector_test import ( "os" + "time" . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" @@ -54,6 +55,7 @@ var _ = Describe("Failure", func() { Describe("injection", func() { JustBeforeEach(func() { Expect(inj.Inject()).To(BeNil()) + time.Sleep(time.Second * 11) }) It("should enable the sysrq handler", func() {