Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Record "recover-check" change and task logs when health check is failing #119

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 80 additions & 29 deletions internal/overlord/checkstate/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ package checkstate

import (
"context"
"fmt"
"sort"
"sync"
"time"

"github.com/canonical/pebble/internal/logger"
"github.com/canonical/pebble/internal/overlord/state"
"github.com/canonical/pebble/internal/plan"
)

Expand All @@ -29,14 +31,15 @@ type CheckManager struct {
mutex sync.Mutex
checks map[string]*checkData
failureHandlers []FailureFunc
state *state.State
}

// FailureFunc is the type of function called when a failure action is triggered.
type FailureFunc func(name string)

// NewManager creates a new check manager.
func NewManager() *CheckManager {
return &CheckManager{}
// st is retained on the manager so checks can record "recover-check"
// changes and "check-failure" tasks when a health check fails.
func NewManager(st *state.State) *CheckManager {
return &CheckManager{state: st}
}

// NotifyCheckFailed adds f to the list of functions that are called whenever
Expand All @@ -56,19 +59,21 @@ func (m *CheckManager) PlanChanged(p *plan.Plan) {

// First stop existing checks.
for _, check := range m.checks {
check.cancel()
check.cancelCtx()
check.closeChange()
}

// Then configure and start new checks.
checks := make(map[string]*checkData, len(p.Checks))
for name, config := range p.Checks {
ctx, cancel := context.WithCancel(context.Background())
check := &checkData{
config: config,
checker: newChecker(config),
ctx: ctx,
cancel: cancel,
action: m.callFailureHandlers,
config: config,
checker: newChecker(config),
ctx: ctx,
cancelCtx: cancel,
action: m.callFailureHandlers,
state: m.state,
}
checks[name] = check
go check.loop()
Expand Down Expand Up @@ -136,13 +141,11 @@ func (m *CheckManager) Checks() ([]*CheckInfo, error) {

// CheckInfo provides status information about a single check.
type CheckInfo struct {
Name string
Level plan.CheckLevel
Status CheckStatus
Failures int
Threshold int
LastError string
ErrorDetails string
Name string
Level plan.CheckLevel
Status CheckStatus
Failures int
Threshold int
}

type CheckStatus string
Expand All @@ -154,16 +157,17 @@ const (

// checkData holds state for an active health check.
type checkData struct {
config *plan.Check
checker checker
ctx context.Context
cancel context.CancelFunc
action FailureFunc
config *plan.Check
checker checker
ctx context.Context
cancelCtx context.CancelFunc
action FailureFunc

mutex sync.Mutex
failures int
actionRan bool
lastErr error
state *state.State
change *state.Change
}

type checker interface {
Expand Down Expand Up @@ -204,9 +208,9 @@ func (c *checkData) runCheck() {

if err == nil {
// Successful check
c.lastErr = nil
c.failures = 0
c.actionRan = false
c.closeChangeNoLock()
return
}

Expand All @@ -217,7 +221,6 @@ func (c *checkData) runCheck() {
}

// Track failure, run failure action if "failures" threshold was hit.
c.lastErr = err
c.failures++
logger.Noticef("Check %q failure %d (threshold %d): %v",
c.config.Name, c.failures, c.config.Threshold, err)
Expand All @@ -227,6 +230,60 @@ func (c *checkData) runCheck() {
c.action(c.config.Name)
c.actionRan = true
}
if c.failures <= c.config.Threshold {
c.recordFailureTask(err)
}
}

// recordFailureTask records a "check-failure" task for err on the check's
// "recover-check" change, creating the change lazily on first failure.
// Caller must NOT hold c.state's lock; it is acquired here.
func (c *checkData) recordFailureTask(err error) {
c.state.Lock()
defer c.state.Unlock()

// Lazily create the change the first time this check fails; it is
// cleared again by closeChangeNoLock when the check recovers.
if c.change == nil {
c.change = c.state.NewChange("recover-check", fmt.Sprintf("Recover check %q", c.config.Name))
}

summary := fmt.Sprintf("Check failure %d (threshold %d)", c.failures, c.config.Threshold)
task := c.state.NewTask("check-failure", summary)
// Include extra detail in the task log if the error provides it
// (e.g. command output from an exec check).
log := err.Error()
if d, ok := err.(interface{ Details() string }); ok && d.Details() != "" {
log += "\n" + d.Details()
}
task.Logf("%s", log)
task.SetStatus(state.DoingStatus)
c.change.AddTask(task)

// Mark the previous task as Done (after the Doing task has been added so
// the entire change doesn't get marked Done).
tasks := c.change.Tasks()
if len(tasks) > 1 {
tasks[len(tasks)-2].SetStatus(state.DoneStatus)
}
}

// closeChange finishes this check's "recover-check" change, if any.
// It takes the check's own mutex and then delegates to closeChangeNoLock;
// used when checks are stopped (e.g. on PlanChanged).
func (c *checkData) closeChange() {
c.mutex.Lock()
defer c.mutex.Unlock()

c.closeChangeNoLock()
}

// closeChangeNoLock marks the check's current "recover-check" change (and
// its last task) Done and forgets the change, so a later failure starts a
// fresh one. Caller must hold c.mutex; c.state's lock is acquired here.
func (c *checkData) closeChangeNoLock() {
// No-op if the check never failed (no change was created).
if c.change == nil {
return
}

c.state.Lock()
defer c.state.Unlock()

// Mark the last task as Done.
tasks := c.change.Tasks()
if len(tasks) > 0 {
tasks[len(tasks)-1].SetStatus(state.DoneStatus)
}

c.change.SetStatus(state.DoneStatus) // should be done automatically, but doesn't hurt
c.change = nil
}

// info returns user-facing check information for use in Checks (and tests).
Expand All @@ -244,11 +301,5 @@ func (c *checkData) info() *CheckInfo {
if c.failures >= c.config.Threshold {
info.Status = CheckStatusDown
}
if c.lastErr != nil {
info.LastError = c.lastErr.Error()
if d, ok := c.lastErr.(interface{ Details() string }); ok {
info.ErrorDetails = d.Details()
}
}
return info
}
64 changes: 49 additions & 15 deletions internal/overlord/checkstate/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
. "gopkg.in/check.v1"

"github.com/canonical/pebble/internal/logger"
"github.com/canonical/pebble/internal/overlord/state"
"github.com/canonical/pebble/internal/plan"
"github.com/canonical/pebble/internal/reaper"
)
Expand Down Expand Up @@ -56,7 +57,8 @@ func (s *ManagerSuite) TearDownSuite(c *C) {
}

func (s *ManagerSuite) TestChecks(c *C) {
mgr := NewManager()
st := state.New(nil)
mgr := NewManager(st)
mgr.PlanChanged(&plan.Plan{
Checks: map[string]*plan.Check{
"chk1": {
Expand Down Expand Up @@ -117,7 +119,8 @@ func stopChecks(c *C, mgr *CheckManager) {
}

func (s *ManagerSuite) TestTimeout(c *C) {
mgr := NewManager()
st := state.New(nil)
mgr := NewManager(st)
mgr.PlanChanged(&plan.Plan{
Checks: map[string]*plan.Check{
"chk1": {
Expand All @@ -134,14 +137,26 @@ func (s *ManagerSuite) TestTimeout(c *C) {
check := waitCheck(c, mgr, "chk1", func(check *CheckInfo) bool {
return check.Status != CheckStatusUp
})
c.Assert(check.Failures, Equals, 1)
c.Assert(check.Threshold, Equals, 1)
c.Assert(check.LastError, Equals, "exec check timed out")
c.Assert(check.ErrorDetails, Equals, "FOO")
c.Check(check.Failures, Equals, 1)
c.Check(check.Threshold, Equals, 1)

st.Lock()
defer st.Unlock()
changes := st.Changes()
c.Assert(changes, HasLen, 1)
c.Check(changes[0].Kind(), Equals, "recover-check")
c.Check(changes[0].Summary(), Equals, `Recover check "chk1"`)
tasks := changes[0].Tasks()
c.Assert(tasks, HasLen, 1)
c.Check(tasks[0].Kind(), Equals, "check-failure")
c.Check(tasks[0].Summary(), Equals, `Check failure 1 (threshold 1)`)
c.Assert(tasks[0].Log(), HasLen, 1)
c.Check(tasks[0].Log()[0], Matches, `2\S+ INFO exec check timed out\nFOO`)
}

func (s *ManagerSuite) TestCheckCanceled(c *C) {
mgr := NewManager()
st := state.New(nil)
mgr := NewManager(st)
failureName := ""
mgr.NotifyCheckFailed(func(name string) {
failureName = name
Expand Down Expand Up @@ -196,17 +211,20 @@ func (s *ManagerSuite) TestCheckCanceled(c *C) {
// Ensure it didn't trigger failure action
c.Check(failureName, Equals, "")

// Ensure it didn't update check failure details (white box testing)
// Ensure it didn't update check failure details (white box testing) or record a change.
info := check.info()
c.Check(info.Status, Equals, CheckStatusUp)
c.Check(info.Failures, Equals, 0)
c.Check(info.Threshold, Equals, 1)
c.Check(info.LastError, Equals, "")
c.Check(info.ErrorDetails, Equals, "")
st.Lock()
defer st.Unlock()
changes := st.Changes()
c.Assert(changes, HasLen, 0)
}

func (s *ManagerSuite) TestFailures(c *C) {
mgr := NewManager()
st := state.New(nil)
mgr := NewManager(st)
failureName := ""
mgr.NotifyCheckFailed(func(name string) {
failureName = name
Expand All @@ -231,26 +249,42 @@ func (s *ManagerSuite) TestFailures(c *C) {
})
c.Assert(check.Threshold, Equals, 3)
c.Assert(check.Status, Equals, CheckStatusUp)
c.Assert(check.LastError, Matches, "exit status 1")
c.Assert(failureName, Equals, "")
checkRecoverChange := func(status state.Status, numTasks int) {
st.Lock()
defer st.Unlock()
changes := st.Changes()
c.Assert(changes, HasLen, 1)
c.Check(changes[0].Kind(), Equals, "recover-check")
c.Check(changes[0].Status(), Equals, status)
tasks := changes[0].Tasks()
c.Assert(tasks, HasLen, numTasks)
task := tasks[len(tasks)-1]
c.Check(task.Kind(), Equals, "check-failure")
c.Check(task.Status(), Equals, status)
c.Check(task.Summary(), Equals, fmt.Sprintf(`Check failure %d (threshold 3)`, numTasks))
c.Assert(task.Log(), HasLen, 1)
c.Check(task.Log()[0], Matches, `2\S+ INFO exit status 1`)
}
checkRecoverChange(state.DoingStatus, 1)

// Shouldn't have called failure handler after only 2 failures
check = waitCheck(c, mgr, "chk1", func(check *CheckInfo) bool {
return check.Failures == 2
})
c.Assert(check.Threshold, Equals, 3)
c.Assert(check.Status, Equals, CheckStatusUp)
c.Assert(check.LastError, Matches, "exit status 1")
c.Assert(failureName, Equals, "")
checkRecoverChange(state.DoingStatus, 2)

// Should have called failure handler and be unhealthy after 3 failures (threshold)
check = waitCheck(c, mgr, "chk1", func(check *CheckInfo) bool {
return check.Failures == 3
})
c.Assert(check.Threshold, Equals, 3)
c.Assert(check.Status, Equals, CheckStatusDown)
c.Assert(check.LastError, Matches, "exit status 1")
c.Assert(failureName, Equals, "chk1")
checkRecoverChange(state.DoingStatus, 3)

// Should reset number of failures if command then succeeds
failureName = ""
Expand All @@ -260,8 +294,8 @@ func (s *ManagerSuite) TestFailures(c *C) {
})
c.Assert(check.Failures, Equals, 0)
c.Assert(check.Threshold, Equals, 3)
c.Assert(check.LastError, Equals, "")
c.Assert(failureName, Equals, "")
checkRecoverChange(state.DoneStatus, 3)
}

func waitCheck(c *C, mgr *CheckManager, name string, f func(check *CheckInfo) bool) *CheckInfo {
Expand Down
2 changes: 1 addition & 1 deletion internal/overlord/overlord.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ func New(pebbleDir string, restartHandler restart.Handler, serviceOutput io.Writ
o.commandMgr = cmdstate.NewManager(o.runner)
o.addManager(o.commandMgr)

o.checkMgr = checkstate.NewManager()
o.checkMgr = checkstate.NewManager(s)

// Tell check manager about plan updates.
o.serviceMgr.NotifyPlanChanged(o.checkMgr.PlanChanged)
Expand Down
Loading