Record a "recover" change when recovering a svc, and 1 task per restart
This will enable debugging/introspection of whether a service is
failing and being auto-restarted. You can introspect it with the
changes API or the "pebble changes" and "pebble tasks" CLI commands.
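
For example, a rough equivalent over the HTTP API (a sketch only: the default
socket path and the snapd-style "select=all" query parameter are assumptions
here, not verified against this commit):

$ curl --unix-socket /var/lib/pebble/default/.pebble.socket \
    'http://localhost/v1/changes?select=all'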

The kind (type) of the changes is "recover" and the kind of the tasks
is "restart".

Example output while a service is being recovered (change 3 is not yet
ready):

$ pebble changes
ID   Status  Spawn                Ready                Summary
1    Done    today at 16:08 NZST  today at 16:08 NZST  Autostart service "test2"
2    Done    today at 16:08 NZST  today at 16:09 NZST  Recover service "test2"
3    Doing   today at 16:09 NZST  -                    Recover service "test2"

After it's recovered, the change is marked Done and its Ready time is set:

$ pebble changes
ID   Status  Spawn                Ready                Summary
1    Done    today at 16:08 NZST  today at 16:08 NZST  Autostart service "test2"
2    Done    today at 16:08 NZST  today at 16:09 NZST  Recover service "test2"
3    Done    today at 16:09 NZST  today at 16:10 NZST  Recover service "test2"

$ pebble tasks 2
Status  Spawn                Ready                Summary
Done    today at 16:08 NZST  today at 16:08 NZST  Restart service "test2"
Done    today at 16:08 NZST  today at 16:08 NZST  Restart service "test2"
Done    today at 16:09 NZST  today at 16:09 NZST  Restart service "test2"

$ pebble tasks 3
Status  Spawn                Ready                Summary
Done    today at 16:09 NZST  today at 16:09 NZST  Restart service "test2"

Fixes canonical#104
benhoyt committed May 2, 2022
1 parent ceeaaa5 commit d91e700
Showing 2 changed files with 138 additions and 14 deletions.
67 changes: 55 additions & 12 deletions internal/overlord/servstate/handlers.go
@@ -81,18 +81,19 @@ const (

// serviceData holds the state and other data for a service under our control.
type serviceData struct {
manager *ServiceManager
state serviceState
config *plan.Service
logs *servicelog.RingBuffer
started chan error
stopped chan error
cmd *exec.Cmd
backoffNum int
backoffTime time.Duration
resetTimer *time.Timer
restarting bool
restarts int
manager *ServiceManager
state serviceState
config *plan.Service
logs *servicelog.RingBuffer
started chan error
stopped chan error
cmd *exec.Cmd
backoffNum int
backoffTime time.Duration
resetTimer *time.Timer
restarting bool
restarts int
recoverChange *state.Change
}

func (m *ServiceManager) doStart(task *state.Task, tomb *tomb.Tomb) error {
@@ -185,6 +186,7 @@ func (m *ServiceManager) serviceForStart(task *state.Task, config *plan.Service)
// Start allowed when service is backing off, was stopped, or has exited.
service.backoffNum = 0
service.backoffTime = 0
service.closeRecoverChange()
service.transition(stateInitial)
return service
default:
@@ -263,6 +265,7 @@ func (m *ServiceManager) serviceForStop(task *state.Task, name string) *serviceD
service.transition(stateStopped)
return nil
default:
service.closeRecoverChange()
return service
}
}
@@ -484,7 +487,11 @@ func addLastLogs(task *state.Task, logBuffer *servicelog.RingBuffer) {
task.Logf("Most recent service output:\n%s", logs)
}
}

func (s *serviceData) doBackoff(action plan.ServiceAction, onType string) {
if s.backoffNum == 0 {
s.openRecoverChange()
}
s.backoffNum++
s.backoffTime = calculateNextBackoff(s.config, s.backoffTime)
logger.Noticef("Service %q %s action is %q, waiting ~%s before restart (backoff %d)",
@@ -494,6 +501,16 @@ func (s *serviceData) doBackoff(action plan.ServiceAction, onType string) {
time.AfterFunc(duration, func() { logError(s.backoffTimeElapsed()) })
}

func (s *serviceData) openRecoverChange() {
s.closeRecoverChange() // shouldn't be necessary, but won't hurt either

st := s.manager.state
st.Lock()
defer st.Unlock()

s.recoverChange = st.NewChange("recover", fmt.Sprintf("Recover service %q", s.config.Name))
}

func calculateNextBackoff(config *plan.Service, current time.Duration) time.Duration {
if current == 0 {
// First backoff time
@@ -603,6 +620,9 @@ func (s *serviceData) backoffTimeElapsed() error {
return err
}
s.transition(stateRunning)
if s.recoverChange != nil {
s.addRecoverTask()
}

default:
// Ignore if timer elapsed in any other state.
@@ -611,6 +631,16 @@
return nil
}

func (s *serviceData) addRecoverTask() {
st := s.manager.state
st.Lock()
defer st.Unlock()

task := st.NewTask("restart", fmt.Sprintf("Restart service %q", s.config.Name))
task.SetStatus(state.DoneStatus)
s.recoverChange.AddTask(task)
}

// terminateTimeElapsed is called after stop sends SIGTERM and the service
// still hasn't exited (and we then send SIGKILL).
func (s *serviceData) terminateTimeElapsed() error {
@@ -672,6 +702,7 @@ func (s *serviceData) backoffResetElapsed() error {
s.config.Name, s.backoffNum, s.backoffTime)
s.backoffNum = 0
s.backoffTime = 0
s.closeRecoverChange() // service is running successfully, set "recover" change to Done

default:
// Ignore if timer elapsed in any other state.
@@ -680,6 +711,18 @@
return nil
}

func (s *serviceData) closeRecoverChange() {
st := s.manager.state
st.Lock()
defer st.Unlock()

if s.recoverChange == nil {
return
}
s.recoverChange.SetStatus(state.DoneStatus)
s.recoverChange = nil
}

// checkFailed handles a health check failure (from the check manager).
func (s *serviceData) checkFailed(action plan.ServiceAction) {
switch s.state {
85 changes: 83 additions & 2 deletions internal/overlord/servstate/manager_test.go
@@ -220,7 +220,7 @@ func (s *S) startServices(c *C, services []string, nEnsure int) *state.Change {
s.st.Lock()
ts, err := servstate.Start(s.st, services)
c.Check(err, IsNil)
chg := s.st.NewChange("test", "Start test")
chg := s.st.NewChange("start", "Start test")
chg.AddAll(ts)
s.st.Unlock()

Expand All @@ -234,7 +234,7 @@ func (s *S) stopServices(c *C, services []string, nEnsure int) *state.Change {
s.st.Lock()
ts, err := servstate.Stop(s.st, services)
c.Check(err, IsNil)
chg := s.st.NewChange("test", "Stop test")
chg := s.st.NewChange("stop", "Stop test")
chg.AddAll(ts)
s.st.Unlock()

@@ -891,6 +891,25 @@ services:
c.Assert(svc.Current, Equals, servstate.StatusActive)
c.Check(s.logBufferString(), Matches, `2.* \[test2\] test2\n`)

// Ensure it has recorded the "recover" change correctly.
func() {
s.st.Lock()
defer s.st.Unlock()
var changes []*state.Change
for _, chg := range s.st.Changes() {
if chg.Kind() == "recover" {
changes = append(changes, chg)
}
}
c.Assert(changes, HasLen, 1)
tasks := changes[0].Tasks()
c.Check(tasks, HasLen, 2)
c.Check(tasks[0].Kind(), Equals, "restart")
c.Check(tasks[0].Status(), Equals, state.DoneStatus)
c.Check(tasks[1].Kind(), Equals, "restart")
c.Check(tasks[1].Status(), Equals, state.DoneStatus)
}()

// Test that backoff reset time is working (set to backoff-limit)
time.Sleep(175 * time.Millisecond)
c.Check(s.manager.BackoffNum("test2"), Equals, 0)
@@ -909,6 +928,23 @@ services:
svc = s.serviceByName(c, "test2")
c.Assert(svc.Current, Equals, servstate.StatusActive)
c.Check(s.logBufferString(), Matches, `2.* \[test2\] test2\n`)

// Ensure it has recorded the "recover" changes correctly.
s.st.Lock()
defer s.st.Unlock()
var changes []*state.Change
for _, chg := range s.st.Changes() {
if chg.Kind() == "recover" {
changes = append(changes, chg)
}
}
c.Assert(changes, HasLen, 2)
c.Check(changes[0].Status(), Equals, state.DoneStatus)
c.Check(changes[1].Status(), Equals, state.DoneStatus)
tasks := changes[1].Tasks()
c.Check(tasks, HasLen, 1)
c.Check(tasks[0].Kind(), Equals, "restart")
c.Check(tasks[0].Status(), Equals, state.DoneStatus)
}

func (s *S) TestStopDuringBackoff(c *C) {
@@ -944,6 +980,19 @@ services:
s.waitUntilService(c, "test2", func(svc *servstate.ServiceInfo) bool {
return svc.Current == servstate.StatusInactive
})

// Ensure it has recorded the "recover" change correctly.
s.st.Lock()
defer s.st.Unlock()
var changes []*state.Change
for _, chg := range s.st.Changes() {
if chg.Kind() == "recover" {
changes = append(changes, chg)
}
}
c.Assert(changes, HasLen, 1)
c.Check(changes[0].Status(), Equals, state.DoneStatus)
c.Check(changes[0].Tasks(), HasLen, 0) // no "restart" tasks because it hadn't been restarted
}

func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) {
@@ -1034,6 +1083,22 @@ checks:
svc := s.serviceByName(c, "test2")
c.Assert(svc.Current, Equals, servstate.StatusActive)
c.Assert(s.manager.BackoffNum("test2"), Equals, 1)

// Ensure it has recorded the "recover" change correctly.
s.st.Lock()
defer s.st.Unlock()
var changes []*state.Change
for _, chg := range s.st.Changes() {
if chg.Kind() == "recover" {
changes = append(changes, chg)
}
}
c.Assert(changes, HasLen, 1)
c.Check(changes[0].Status(), Equals, state.DoneStatus)
tasks := changes[0].Tasks()
c.Assert(tasks, HasLen, 1)
c.Check(tasks[0].Kind(), Equals, "restart")
c.Check(tasks[0].Status(), Equals, state.DoneStatus)
}

func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) {
@@ -1112,6 +1177,22 @@ checks:
c.Assert(len(checks), Equals, 1)
c.Assert(checks[0].Status, Equals, checkstate.CheckStatusDown)
c.Assert(checks[0].LastError, Matches, ".* executable file not found .*")

// Ensure it has recorded the "recover" change correctly.
s.st.Lock()
defer s.st.Unlock()
var changes []*state.Change
for _, chg := range s.st.Changes() {
if chg.Kind() == "recover" {
changes = append(changes, chg)
}
}
c.Assert(changes, HasLen, 1)
c.Check(changes[0].Status(), Equals, state.DoneStatus)
tasks := changes[0].Tasks()
c.Assert(tasks, HasLen, 1)
c.Check(tasks[0].Kind(), Equals, "restart")
c.Check(tasks[0].Status(), Equals, state.DoneStatus)
}

func (s *S) TestOnCheckFailureIgnore(c *C) {

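As a usage sketch, not part of this commit: code with access to the state
(which would have to live inside the pebble tree, since the state package is
internal) could summarize the recorded recovery history along these lines.
The countRestarts helper and its placement in package servstate are
illustrative assumptions.

package servstate // illustrative placement; any package inside the pebble tree would do

import "github.com/canonical/pebble/internal/overlord/state"

// countRestarts tallies the "restart" tasks recorded under each "recover"
// change, keyed by the change summary (which names the service).
func countRestarts(st *state.State) map[string]int {
    st.Lock()
    defer st.Unlock()

    counts := make(map[string]int)
    for _, chg := range st.Changes() {
        if chg.Kind() != "recover" {
            continue
        }
        for _, task := range chg.Tasks() {
            if task.Kind() == "restart" {
                counts[chg.Summary()]++
            }
        }
    }
    return counts
}

For the example session in the commit message above, this would report 4
restarts under the summary Recover service "test2" (three tasks from change 2
plus one from change 3, which share the same summary).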