diff --git a/internal/overlord/checkstate/manager.go b/internal/overlord/checkstate/manager.go
index 20d95a11f..cb5cbdfd0 100644
--- a/internal/overlord/checkstate/manager.go
+++ b/internal/overlord/checkstate/manager.go
@@ -16,11 +16,13 @@ package checkstate
 
 import (
 	"context"
+	"fmt"
 	"sort"
 	"sync"
 	"time"
 
 	"github.com/canonical/pebble/internal/logger"
+	"github.com/canonical/pebble/internal/overlord/state"
 	"github.com/canonical/pebble/internal/plan"
 )
 
@@ -29,14 +31,15 @@ type CheckManager struct {
 	mutex           sync.Mutex
 	checks          map[string]*checkData
 	failureHandlers []FailureFunc
+	state           *state.State
 }
 
 // FailureFunc is the type of function called when a failure action is triggered.
 type FailureFunc func(name string)
 
 // NewManager creates a new check manager.
-func NewManager() *CheckManager {
-	return &CheckManager{}
+func NewManager(st *state.State) *CheckManager {
+	return &CheckManager{state: st}
 }
 
 // NotifyCheckFailed adds f to the list of functions that are called whenever
@@ -56,7 +59,8 @@ func (m *CheckManager) PlanChanged(p *plan.Plan) {
 
 	// First stop existing checks.
 	for _, check := range m.checks {
-		check.cancel()
+		check.cancelCtx()
+		check.closeChange()
 	}
 
 	// Then configure and start new checks.
@@ -64,11 +68,12 @@ func (m *CheckManager) PlanChanged(p *plan.Plan) {
 	for name, config := range p.Checks {
 		ctx, cancel := context.WithCancel(context.Background())
 		check := &checkData{
-			config:  config,
-			checker: newChecker(config),
-			ctx:     ctx,
-			cancel:  cancel,
-			action:  m.callFailureHandlers,
+			config:    config,
+			checker:   newChecker(config),
+			ctx:       ctx,
+			cancelCtx: cancel,
+			action:    m.callFailureHandlers,
+			state:     m.state,
 		}
 		checks[name] = check
 		go check.loop()
@@ -136,13 +141,11 @@ func (m *CheckManager) Checks() ([]*CheckInfo, error) {
 
 // CheckInfo provides status information about a single check.
 type CheckInfo struct {
-	Name         string
-	Level        plan.CheckLevel
-	Status       CheckStatus
-	Failures     int
-	Threshold    int
-	LastError    string
-	ErrorDetails string
+	Name      string
+	Level     plan.CheckLevel
+	Status    CheckStatus
+	Failures  int
+	Threshold int
 }
 
 type CheckStatus string
@@ -154,16 +157,17 @@ const (
 
 // checkData holds state for an active health check.
 type checkData struct {
-	config  *plan.Check
-	checker checker
-	ctx     context.Context
-	cancel  context.CancelFunc
-	action  FailureFunc
+	config    *plan.Check
+	checker   checker
+	ctx       context.Context
+	cancelCtx context.CancelFunc
+	action    FailureFunc
 
 	mutex     sync.Mutex
 	failures  int
 	actionRan bool
-	lastErr   error
+	state     *state.State
+	change    *state.Change
 }
 
 type checker interface {
@@ -204,9 +208,9 @@ func (c *checkData) runCheck() {
 
 	if err == nil {
 		// Successful check
-		c.lastErr = nil
 		c.failures = 0
 		c.actionRan = false
+		c.closeChangeNoLock()
 		return
 	}
 
@@ -217,7 +221,6 @@ func (c *checkData) runCheck() {
 	}
 
 	// Track failure, run failure action if "failures" threshold was hit.
-	c.lastErr = err
 	c.failures++
 	logger.Noticef("Check %q failure %d (threshold %d): %v",
 		c.config.Name, c.failures, c.config.Threshold, err)
@@ -227,6 +230,60 @@ func (c *checkData) runCheck() {
 		c.action(c.config.Name)
 		c.actionRan = true
 	}
+	if c.failures <= c.config.Threshold {
+		c.recordFailureTask(err)
+	}
+}
+
+func (c *checkData) recordFailureTask(err error) {
+	c.state.Lock()
+	defer c.state.Unlock()
+
+	if c.change == nil {
+		c.change = c.state.NewChange("recover-check", fmt.Sprintf("Recover check %q", c.config.Name))
+	}
+
+	summary := fmt.Sprintf("Check failure %d (threshold %d)", c.failures, c.config.Threshold)
+	task := c.state.NewTask("check-failure", summary)
+	log := err.Error()
+	if d, ok := err.(interface{ Details() string }); ok && d.Details() != "" {
+		log += "\n" + d.Details()
+	}
+	task.Logf("%s", log)
+	task.SetStatus(state.DoingStatus)
+	c.change.AddTask(task)
+
+	// Mark the previous task as Done (after the Doing task has been added so
+	// the entire change doesn't get marked Done).
+	tasks := c.change.Tasks()
+	if len(tasks) > 1 {
+		tasks[len(tasks)-2].SetStatus(state.DoneStatus)
+	}
+}
+
+func (c *checkData) closeChange() {
+	c.mutex.Lock()
+	defer c.mutex.Unlock()
+
+	c.closeChangeNoLock()
+}
+
+func (c *checkData) closeChangeNoLock() {
+	if c.change == nil {
+		return
+	}
+
+	c.state.Lock()
+	defer c.state.Unlock()
+
+	// Mark the last task as Done.
+	tasks := c.change.Tasks()
+	if len(tasks) > 0 {
+		tasks[len(tasks)-1].SetStatus(state.DoneStatus)
+	}
+
+	c.change.SetStatus(state.DoneStatus) // should be done automatically, but doesn't hurt
+	c.change = nil
 }
 
 // info returns user-facing check information for use in Checks (and tests).
@@ -244,11 +301,5 @@ func (c *checkData) info() *CheckInfo {
 	if c.failures >= c.config.Threshold {
 		info.Status = CheckStatusDown
 	}
-	if c.lastErr != nil {
-		info.LastError = c.lastErr.Error()
-		if d, ok := c.lastErr.(interface{ Details() string }); ok {
-			info.ErrorDetails = d.Details()
-		}
-	}
 	return info
 }
diff --git a/internal/overlord/checkstate/manager_test.go b/internal/overlord/checkstate/manager_test.go
index d0ad5c48e..b88bcd5ab 100644
--- a/internal/overlord/checkstate/manager_test.go
+++ b/internal/overlord/checkstate/manager_test.go
@@ -26,6 +27,7 @@ import (
 	. "gopkg.in/check.v1"
 
 	"github.com/canonical/pebble/internal/logger"
+	"github.com/canonical/pebble/internal/overlord/state"
 	"github.com/canonical/pebble/internal/plan"
 	"github.com/canonical/pebble/internal/reaper"
 )
@@ -56,7 +57,8 @@ func (s *ManagerSuite) TearDownSuite(c *C) {
 }
 
 func (s *ManagerSuite) TestChecks(c *C) {
-	mgr := NewManager()
+	st := state.New(nil)
+	mgr := NewManager(st)
 	mgr.PlanChanged(&plan.Plan{
 		Checks: map[string]*plan.Check{
 			"chk1": {
@@ -117,7 +119,8 @@ func stopChecks(c *C, mgr *CheckManager) {
 }
 
 func (s *ManagerSuite) TestTimeout(c *C) {
-	mgr := NewManager()
+	st := state.New(nil)
+	mgr := NewManager(st)
 	mgr.PlanChanged(&plan.Plan{
 		Checks: map[string]*plan.Check{
 			"chk1": {
@@ -134,14 +137,26 @@ func (s *ManagerSuite) TestTimeout(c *C) {
 	check := waitCheck(c, mgr, "chk1", func(check *CheckInfo) bool {
 		return check.Status != CheckStatusUp
 	})
-	c.Assert(check.Failures, Equals, 1)
-	c.Assert(check.Threshold, Equals, 1)
-	c.Assert(check.LastError, Equals, "exec check timed out")
-	c.Assert(check.ErrorDetails, Equals, "FOO")
+	c.Check(check.Failures, Equals, 1)
+	c.Check(check.Threshold, Equals, 1)
+
+	st.Lock()
+	defer st.Unlock()
+	changes := st.Changes()
+	c.Assert(changes, HasLen, 1)
+	c.Check(changes[0].Kind(), Equals, "recover-check")
+	c.Check(changes[0].Summary(), Equals, `Recover check "chk1"`)
+	tasks := changes[0].Tasks()
+	c.Assert(tasks, HasLen, 1)
+	c.Check(tasks[0].Kind(), Equals, "check-failure")
+	c.Check(tasks[0].Summary(), Equals, `Check failure 1 (threshold 1)`)
+	c.Assert(tasks[0].Log(), HasLen, 1)
+	c.Check(tasks[0].Log()[0], Matches, `2\S+ INFO exec check timed out\nFOO`)
 }
 
 func (s *ManagerSuite) TestCheckCanceled(c *C) {
-	mgr := NewManager()
+	st := state.New(nil)
+	mgr := NewManager(st)
 	failureName := ""
 	mgr.NotifyCheckFailed(func(name string) {
 		failureName = name
@@ -196,17 +211,20 @@ func (s *ManagerSuite) TestCheckCanceled(c *C) {
 	// Ensure it didn't trigger failure action
 	c.Check(failureName, Equals, "")
 
-	// Ensure it didn't update check failure details (white box testing)
+	// Ensure it didn't update check failure details (white box testing) or record a change.
 	info := check.info()
 	c.Check(info.Status, Equals, CheckStatusUp)
 	c.Check(info.Failures, Equals, 0)
 	c.Check(info.Threshold, Equals, 1)
-	c.Check(info.LastError, Equals, "")
-	c.Check(info.ErrorDetails, Equals, "")
+	st.Lock()
+	defer st.Unlock()
+	changes := st.Changes()
+	c.Assert(changes, HasLen, 0)
 }
 
 func (s *ManagerSuite) TestFailures(c *C) {
-	mgr := NewManager()
+	st := state.New(nil)
+	mgr := NewManager(st)
 	failureName := ""
 	mgr.NotifyCheckFailed(func(name string) {
 		failureName = name
@@ -231,8 +249,24 @@ func (s *ManagerSuite) TestFailures(c *C) {
 	})
 	c.Assert(check.Threshold, Equals, 3)
 	c.Assert(check.Status, Equals, CheckStatusUp)
-	c.Assert(check.LastError, Matches, "exit status 1")
 	c.Assert(failureName, Equals, "")
+	checkRecoverChange := func(status state.Status, numTasks int) {
+		st.Lock()
+		defer st.Unlock()
+		changes := st.Changes()
+		c.Assert(changes, HasLen, 1)
+		c.Check(changes[0].Kind(), Equals, "recover-check")
+		c.Check(changes[0].Status(), Equals, status)
+		tasks := changes[0].Tasks()
+		c.Assert(tasks, HasLen, numTasks)
+		task := tasks[len(tasks)-1]
+		c.Check(task.Kind(), Equals, "check-failure")
+		c.Check(task.Status(), Equals, status)
+		c.Check(task.Summary(), Equals, fmt.Sprintf(`Check failure %d (threshold 3)`, numTasks))
+		c.Assert(task.Log(), HasLen, 1)
+		c.Check(task.Log()[0], Matches, `2\S+ INFO exit status 1`)
+	}
+	checkRecoverChange(state.DoingStatus, 1)
 
 	// Shouldn't have called failure handler after only 2 failures
 	check = waitCheck(c, mgr, "chk1", func(check *CheckInfo) bool {
@@ -240,8 +274,8 @@ func (s *ManagerSuite) TestFailures(c *C) {
 	})
 	c.Assert(check.Threshold, Equals, 3)
 	c.Assert(check.Status, Equals, CheckStatusUp)
-	c.Assert(check.LastError, Matches, "exit status 1")
 	c.Assert(failureName, Equals, "")
+	checkRecoverChange(state.DoingStatus, 2)
 
 	// Should have called failure handler and be unhealthy after 3 failures (threshold)
 	check = waitCheck(c, mgr, "chk1", func(check *CheckInfo) bool {
@@ -249,8 +283,8 @@ func (s *ManagerSuite) TestFailures(c *C) {
 	})
 	c.Assert(check.Threshold, Equals, 3)
 	c.Assert(check.Status, Equals, CheckStatusDown)
-	c.Assert(check.LastError, Matches, "exit status 1")
 	c.Assert(failureName, Equals, "chk1")
+	checkRecoverChange(state.DoingStatus, 3)
 
 	// Should reset number of failures if command then succeeds
 	failureName = ""
@@ -260,8 +294,8 @@ func (s *ManagerSuite) TestFailures(c *C) {
 	})
 	c.Assert(check.Failures, Equals, 0)
 	c.Assert(check.Threshold, Equals, 3)
-	c.Assert(check.LastError, Equals, "")
 	c.Assert(failureName, Equals, "")
+	checkRecoverChange(state.DoneStatus, 3)
 }
 
 func waitCheck(c *C, mgr *CheckManager, name string, f func(check *CheckInfo) bool) *CheckInfo {
diff --git a/internal/overlord/overlord.go b/internal/overlord/overlord.go
index 5fd826011..955b1b1eb 100644
--- a/internal/overlord/overlord.go
+++ b/internal/overlord/overlord.go
@@ -114,7 +114,7 @@ func New(pebbleDir string, restartHandler restart.Handler, serviceOutput io.Writ
 	o.commandMgr = cmdstate.NewManager(o.runner)
 	o.addManager(o.commandMgr)
 
-	o.checkMgr = checkstate.NewManager()
+	o.checkMgr = checkstate.NewManager(s)
 
 	// Tell check manager about plan updates.
 	o.serviceMgr.NotifyPlanChanged(o.checkMgr.PlanChanged)
diff --git a/internal/overlord/servstate/manager_test.go b/internal/overlord/servstate/manager_test.go
index 3455c1c65..0655c1ead 100644
--- a/internal/overlord/servstate/manager_test.go
+++ b/internal/overlord/servstate/manager_test.go
@@ -948,7 +948,7 @@ services:
 
 func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) {
 	// Create check manager and tell it about plan updates
-	checkMgr := checkstate.NewManager()
+	checkMgr := checkstate.NewManager(s.st)
 	defer checkMgr.PlanChanged(&plan.Plan{})
 	s.manager.NotifyPlanChanged(checkMgr.PlanChanged)
 
@@ -999,7 +999,6 @@ checks:
 		c.Assert(err, IsNil)
 		if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
 			c.Assert(checks[0].Failures, Equals, 1)
-			c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
 			break
 		}
 		time.Sleep(5 * time.Millisecond)
@@ -1030,7 +1029,6 @@ checks:
 	c.Assert(err, IsNil)
 	c.Assert(len(checks), Equals, 1)
 	c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
-	c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
 	svc := s.serviceByName(c, "test2")
 	c.Assert(svc.Current, Equals, servstate.StatusActive)
 	c.Assert(s.manager.BackoffNum("test2"), Equals, 1)
@@ -1038,7 +1036,7 @@ checks:
 }
 
 func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) {
 	// Create check manager and tell it about plan updates
-	checkMgr := checkstate.NewManager()
+	checkMgr := checkstate.NewManager(s.st)
 	defer checkMgr.PlanChanged(&plan.Plan{})
 	s.manager.NotifyPlanChanged(checkMgr.PlanChanged)
@@ -1111,12 +1109,11 @@ checks:
 	c.Assert(err, IsNil)
 	c.Assert(len(checks), Equals, 1)
 	c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
-	c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
 }
 
 func (s *S) TestOnCheckFailureIgnore(c *C) {
 	// Create check manager and tell it about plan updates
-	checkMgr := checkstate.NewManager()
+	checkMgr := checkstate.NewManager(s.st)
 	defer checkMgr.PlanChanged(&plan.Plan{})
 	s.manager.NotifyPlanChanged(checkMgr.PlanChanged)
 
@@ -1166,7 +1163,6 @@ checks:
 		c.Assert(err, IsNil)
 		if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
 			c.Assert(checks[0].Failures, Equals, 1)
-			c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
 			break
 		}
 		time.Sleep(5 * time.Millisecond)
@@ -1181,14 +1177,13 @@ checks:
 	c.Assert(err, IsNil)
 	c.Assert(len(checks), Equals, 1)
 	c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
-	c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
 	svc := s.serviceByName(c, "test2")
 	c.Assert(svc.Current, Equals, servstate.StatusActive)
 }
 
 func (s *S) TestOnCheckFailureShutdown(c *C) {
 	// Create check manager and tell it about plan updates
-	checkMgr := checkstate.NewManager()
+	checkMgr := checkstate.NewManager(s.st)
 	defer checkMgr.PlanChanged(&plan.Plan{})
 	s.manager.NotifyPlanChanged(checkMgr.PlanChanged)
 
@@ -1238,7 +1233,6 @@ checks:
 		c.Assert(err, IsNil)
 		if len(checks) == 1 && checks[0].Status != checkstate.CheckStatusUp {
 			c.Assert(checks[0].Failures, Equals, 1)
-			c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")
 			break
 		}
 		time.Sleep(5 * time.Millisecond)
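
Review note (not part of the patch): a minimal sketch of how the "recover-check" change recorded by the new manager could be read back, using only the state APIs the tests above already exercise (state.New, State.Changes, Change.Kind/Status/Summary/Tasks, Task.Kind/Status/Summary/Log). The dumpRecoverChanges helper and the main wrapper are hypothetical names for illustration, and the snippet assumes it lives inside the pebble module, since internal packages are not importable from outside it.

```go
package main

import (
	"fmt"

	"github.com/canonical/pebble/internal/overlord/state"
)

// dumpRecoverChanges prints every "recover-check" change and its
// "check-failure" tasks from the given state.
func dumpRecoverChanges(st *state.State) {
	st.Lock()
	defer st.Unlock()

	for _, chg := range st.Changes() {
		if chg.Kind() != "recover-check" {
			continue
		}
		fmt.Printf("%s [%v]: %s\n", chg.Kind(), chg.Status(), chg.Summary())
		for _, task := range chg.Tasks() {
			// Per the manager code above, the most recent failure stays in
			// Doing; earlier failures and recovered checks are marked Done.
			fmt.Printf("  %s [%v]: %s\n", task.Kind(), task.Status(), task.Summary())
			for _, line := range task.Log() {
				fmt.Printf("    %s\n", line)
			}
		}
	}
}

func main() {
	// With a manager built via checkstate.NewManager(st) and a failing check
	// in the plan, the recorded change would show up here after the first failure.
	st := state.New(nil)
	dumpRecoverChanges(st)
}
```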