feat: make shutdown action return meaningful exit code #336

Merged (7 commits, Nov 30, 2023)
34 changes: 22 additions & 12 deletions README.md
@@ -263,7 +263,9 @@ If the configuration of `requires`, `before`, and `after` for a service results
Pebble's service manager automatically restarts services that exit unexpectedly. By default, this is done whether the exit code is zero or non-zero, but you can change this using the `on-success` and `on-failure` fields in a configuration layer. The possible values for these fields are:

* `restart`: restart the service and enter a restart-backoff loop (the default behaviour).
* `shutdown`: shut down and exit the Pebble daemon
* `shutdown`: shut down and exit the Pebble daemon (with exit code 0 if the service exits successfully, exit code 10 otherwise)
* `success-shutdown`: shut down, ensuring exit code 0 (valid only for `on-failure`)
* `failure-shutdown`: shut down, ensuring exit code 10 (valid only for `on-success`)
* `ignore`: ignore the service exiting and do nothing further

In `restart` mode, the first time a service exits, Pebble waits the `backoff-delay`, which defaults to half a second. If the service exits again, Pebble calculates the next backoff delay by multiplying the current delay by `backoff-factor`, which defaults to 2.0 (doubling). The increasing delay is capped at `backoff-limit`, which defaults to 30 seconds.
@@ -691,21 +693,29 @@ services:
working-dir: <directory>

# (Optional) Defines what happens when the service exits with a zero
# exit code. Possible values are: "restart" (default) which restarts
# the service after the backoff delay, "shutdown" which shuts down and
# exits the Pebble server, and "ignore" which does nothing further.
on-success: restart | shutdown | ignore
# exit code. Possible values are:
#
# - restart (default): restart the service after the backoff delay
# - shutdown: shut down and exit the Pebble daemon (with exit code 0)
# - failure-shutdown: shut down and exit Pebble, ensuring exit code 10
# - ignore: do nothing further
on-success: restart | shutdown | failure-shutdown | ignore

# (Optional) Defines what happens when the service exits with a nonzero
# exit code. Possible values are: "restart" (default) which restarts
# the service after the backoff delay, "shutdown" which shuts down and
# exits the Pebble server, and "ignore" which does nothing further.
on-failure: restart | shutdown | ignore
# exit code. Possible values are:
#
# - restart (default): restart the service after the backoff delay
# - shutdown: shut down and exit the Pebble daemon (with exit code 10)
# - success-shutdown: shut down and exit Pebble, ensuring exit code 0
# - ignore: do nothing further
on-failure: restart | shutdown | success-shutdown | ignore

# (Optional) Defines what happens when each of the named health checks
# fail. Possible values are: "restart" (default) which restarts
# the service once, "shutdown" which shuts down and exits the Pebble
# server, and "ignore" which does nothing further.
# fail. Possible values are:
#
# - restart (default): restart the service once
# - shutdown: shut down and exit the Pebble daemon
# - ignore: do nothing further
on-check-failure:
<check name>: restart | shutdown | ignore

7 changes: 6 additions & 1 deletion internals/cli/cmd_run.go
@@ -15,6 +15,7 @@
package cli

import (
"errors"
"fmt"
"os"
"os/signal"
@@ -92,11 +93,15 @@ func (rcmd *cmdRun) run(ready chan<- func()) {
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

if err := runDaemon(rcmd, sigs, ready); err != nil {
if err == daemon.ErrRestartSocket {
switch {
case errors.Is(err, daemon.ErrRestartSocket):
// No "error: " prefix as this isn't an error.
fmt.Fprintf(os.Stdout, "%v\n", err)
// This exit code must be in systemd's SuccessExitStatus.
panic(&exitStatus{42})
case errors.Is(err, daemon.ErrRestartServiceFailure):
// Daemon returns this exit code for service-failure shutdown.
panic(&exitStatus{10})
}
fmt.Fprintf(os.Stderr, "cannot run daemon: %v\n", err)
panic(&exitStatus{1})
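
As an illustration (not part of this diff), a small hypothetical wrapper sketch shows how a supervising process could consume these exit codes. It assumes a `pebble` binary on PATH and the codes used in this change: 0 for a clean shutdown, 10 for a service-failure shutdown, and 42 for the socket-activation restart.

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	// Run the Pebble daemon and wait for it to exit.
	cmd := exec.Command("pebble", "run")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	err := cmd.Run()

	code := 0
	if exitErr, ok := err.(*exec.ExitError); ok {
		code = exitErr.ExitCode()
	} else if err != nil {
		fmt.Fprintf(os.Stderr, "cannot run pebble: %v\n", err)
		os.Exit(1)
	}

	// Interpret the exit codes introduced or kept by this PR.
	switch code {
	case 0:
		fmt.Println("clean shutdown (shutdown on success, or success-shutdown)")
	case 10:
		fmt.Println("service-failure shutdown (shutdown on failure, or failure-shutdown)")
	case 42:
		fmt.Println("daemon stopped to wait for socket activation")
	default:
		fmt.Printf("unexpected exit code %d\n", code)
	}
	os.Exit(code)
}
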
45 changes: 19 additions & 26 deletions internals/daemon/daemon.go
@@ -47,7 +47,8 @@ import (
)

var (
ErrRestartSocket = fmt.Errorf("daemon stop requested to wait for socket activation")
ErrRestartSocket = fmt.Errorf("daemon stop requested to wait for socket activation")
ErrRestartServiceFailure = fmt.Errorf("daemon stop requested due to service failure")

systemdSdNotify = systemd.SdNotify
sysGetuid = sys.Getuid
@@ -96,12 +97,7 @@ type Daemon struct {
router *mux.Router
standbyOpinions *standby.StandbyOpinions

// set to remember we need to restart the system
restartSystem bool

// set to remember that we need to exit the daemon in a way that
// prevents systemd from restarting it
restartSocket bool
restartType restart.RestartType

// degradedErr is set when the daemon is in degraded mode
degradedErr error
@@ -529,24 +525,19 @@ func (d *Daemon) Start() error {

// HandleRestart implements overlord.RestartBehavior.
func (d *Daemon) HandleRestart(t restart.RestartType) {
d.mu.Lock()
d.restartType = t
d.mu.Unlock()

// die when asked to restart (systemd should get us back up!) etc
switch t {
case restart.RestartDaemon:
case restart.RestartDaemon, restart.RestartServiceFailure, restart.RestartSocket:
case restart.RestartSystem:
// try to schedule a fallback slow reboot already here
// in case we get stuck shutting down
if err := rebootHandler(rebootWaitTimeout); err != nil {
logger.Noticef("%s", err)
}

d.mu.Lock()
defer d.mu.Unlock()
// remember we need to restart the system
d.restartSystem = true
case restart.RestartSocket:
d.mu.Lock()
defer d.mu.Unlock()
d.restartSocket = true
default:
logger.Noticef("Internal error: restart handler called with unknown restart type: %v", t)
}
@@ -584,13 +575,12 @@ func (d *Daemon) Stop(sigCh chan<- os.Signal) error {
d.tomb.Kill(nil)

d.mu.Lock()
restartSystem := d.restartSystem
restartSocket := d.restartSocket
restartType := d.restartType
d.mu.Unlock()

d.standbyOpinions.Stop()

if restartSystem {
if restartType == restart.RestartSystem {
// give time to polling clients to notice restart
time.Sleep(rebootNoticeWait)
}
@@ -602,12 +592,12 @@
d.tomb.Kill(d.serve.Shutdown(ctx))
cancel()

if !restartSystem {
if restartType != restart.RestartSystem {
// tell systemd that we are stopping
systemdSdNotify("STOPPING=1")
}

if restartSocket {
if restartType == restart.RestartSocket {
// At this point we processed all open requests (and
// stopped accepting new requests) - before going into
// socket activated mode we need to check if any of
@@ -617,7 +607,7 @@
// If this is the case we do a "normal" pebble restart
// to process the new changes.
if !d.standbyOpinions.CanStandby() {
d.restartSocket = false
d.restartType = restart.RestartUnset
}
}
d.overlord.Stop()
@@ -628,19 +618,22 @@
// because we already scheduled a slow shutdown and
// exiting here will just restart pebble (via systemd)
// which will lead to confusing results.
if restartSystem {
if restartType == restart.RestartSystem {
logger.Noticef("WARNING: cannot stop daemon: %v", err)
} else {
return err
}
}

if restartSystem {
if restartType == restart.RestartSystem {
Contributor:

I believe the previous behaviour was that both a RestartDaemon and a RestartSystem resulted in issuing a doReboot here (both set the restartSystem flag). I believe this meant that previously any service with a shutdown action would have caused the shutdown -r command to run (and would block here with time.Sleep(waitTimeout) for that to issue a system reboot). Just checking this is intended, because without the reboot action the RestartDaemon restart type would need help from systemd to restart the daemon (and therefore depends on it).

After the change this is now again aligned with how snapd uses RestartDaemon, but of course they rely on systemd for everything.

(For KernOS this means that as PID1 we will actually now exit the process, and as a result cause a kernel panic. This is not necessarily a problem, since our kernels are configured to reboot on any PID1 exit).

Contributor (author):

> I believe the previous behaviour was that both a RestartDaemon and RestartSystem resulted in issuing a doReboot here (both set the restartSystem flag)

I believe this is incorrect, as Go's case statement doesn't fall through by default (unlike C's). That said, I've shuffled the code a little to be more like snapd's, and as a result it's a bit clearer that they're different cases.

Contributor:

Oh no Fred. Well this was the only worry I had, while not understanding why the code does not match my experience. Merge away!

return d.doReboot(sigCh, rebootWaitTimeout)
}

if d.restartSocket {
switch d.restartType {
case restart.RestartSocket:
return ErrRestartSocket
case restart.RestartServiceFailure:
return ErrRestartServiceFailure
}

return nil
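
The pattern above can also be seen in a minimal, self-contained sketch (illustrative only, with assumed names): HandleRestart records a single requested restart type under the mutex, and Stop later reads it to decide which error, and hence which exit code, to surface.

package main

import (
	"fmt"
	"sync"
)

type RestartType int

const (
	RestartUnset RestartType = iota
	RestartDaemon
	RestartSocket
	RestartServiceFailure
)

type daemon struct {
	mu          sync.Mutex
	restartType RestartType
}

// HandleRestart records the requested restart type; it may be called from
// another goroutine, so the field is protected by the mutex.
func (d *daemon) HandleRestart(t RestartType) {
	d.mu.Lock()
	d.restartType = t
	d.mu.Unlock()
}

// Stop reads the recorded type and maps it to the error the caller uses to
// pick an exit code (as cmd_run.go does above).
func (d *daemon) Stop() error {
	d.mu.Lock()
	t := d.restartType
	d.mu.Unlock()

	switch t {
	case RestartSocket:
		return fmt.Errorf("daemon stop requested to wait for socket activation")
	case RestartServiceFailure:
		return fmt.Errorf("daemon stop requested due to service failure")
	}
	return nil
}

func main() {
	d := &daemon{}
	d.HandleRestart(RestartServiceFailure)
	fmt.Println(d.Stop()) // daemon stop requested due to service failure
}
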
58 changes: 53 additions & 5 deletions internals/daemon/daemon_test.go
@@ -779,7 +779,7 @@ func (s *daemonSuite) TestRestartSystemWiring(c *C) {

defer func() {
d.mu.Lock()
d.restartSystem = false
d.restartType = restart.RestartUnset
d.mu.Unlock()
}()

@@ -790,10 +790,10 @@
}

d.mu.Lock()
rs := d.restartSystem
restartType := d.restartType
d.mu.Unlock()

c.Check(rs, Equals, true)
c.Check(restartType, Equals, restart.RestartSystem)

c.Check(delays, HasLen, 1)
c.Check(delays[0], DeepEquals, rebootWaitTimeout)
@@ -1052,7 +1052,7 @@ func (s *daemonSuite) TestRestartIntoSocketModeNoNewChanges(c *C) {
}
err := d.Stop(nil)
c.Check(err, Equals, ErrRestartSocket)
c.Check(d.restartSocket, Equals, true)
c.Check(d.restartType, Equals, restart.RestartSocket)
}

func (s *daemonSuite) TestRestartIntoSocketModePendingChanges(c *C) {
@@ -1095,7 +1095,55 @@ func (s *daemonSuite) TestRestartIntoSocketModePendingChanges(c *C) {
// when the daemon got a pending change it just restarts
err := d.Stop(nil)
c.Check(err, IsNil)
c.Check(d.restartSocket, Equals, false)
c.Check(d.restartType, Equals, restart.RestartUnset)
}

func (s *daemonSuite) TestRestartServiceFailure(c *C) {
writeTestLayer(s.pebbleDir, `
services:
test1:
override: replace
command: /bin/sh -c 'sleep 1.5; exit 1'
on-failure: shutdown
`)
d := s.newDaemon(c)
err := d.Init()
c.Assert(err, IsNil)
c.Assert(d.Start(), IsNil)

// Start the test service.
payload := bytes.NewBufferString(`{"action": "start", "services": ["test1"]}`)
req, err := http.NewRequest("POST", "/v1/services", payload)
c.Assert(err, IsNil)
rsp := v1PostServices(apiCmd("/v1/services"), req, nil).(*resp)
rec := httptest.NewRecorder()
rsp.ServeHTTP(rec, req)
c.Check(rec.Result().StatusCode, Equals, 202)

// We have to wait for it be in running state.
for i := 0; ; i++ {
if i >= 25 {
c.Fatalf("timed out waiting for service to start")
}
d.state.Lock()
change := d.state.Change(rsp.Change)
d.state.Unlock()
if change != nil && change.IsReady() {
break
}
time.Sleep(50 * time.Millisecond)
}

// Wait for daemon to be shut down by the failed service.
select {
case <-d.Dying():
case <-time.After(2 * time.Second):
c.Fatalf("timed out waiting for daemon to be shut down")
}

// Ensure it returned a service-failure error.
err = d.Stop(nil)
c.Assert(err, Equals, ErrRestartServiceFailure)
}

func (s *daemonSuite) TestConnTrackerCanShutdown(c *C) {
1 change: 1 addition & 0 deletions internals/overlord/restart/restart.go
@@ -36,6 +36,7 @@ const (
RestartSystemHaltNow
// RestartSystemPoweroffNow will shutdown --poweroff the system asap
RestartSystemPoweroffNow
RestartServiceFailure
Contributor:

Just sharing my thinking here - maybe something we can boldly try to clean up a bit in the future?

The RestartType name has evolved over time and it seems to no longer work very well. The fact that the type starts with a Restart prefix also does not help make this easy to follow. It seems the meaning of many of these options is overloaded, depending on whether you take the wider system effect into account (where systemd exists and will restart Pebble on exit, or not).

My view is that this list sometimes tries to include three things:

Action, Scope, and now also Reason (while having a prefix called Restart).

However, I cannot see an easy way to clean this up without major surgery (I read your spec considerations). In an ideal world, I would have referred to this group of actions and the package as shutdown (like the command), with perhaps a simpler list of options:

type ShutdownType int
const (
    ShutdownHaltSystem ShutdownType = iota
    ShutdownRebootSystem
    ShutdownPoweroffSystem
    ShutdownExitDaemon                      // supports exit code
)

type Handler interface {
	HandleShutdown(t ShutdownType, immediate bool, exitcode int)
	RebootIsFine(st *state.State) error
	RebootIsMissing(st *state.State) error
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't disagree this is quite messy! However, I'm going to leave it for future work.

I've made a couple of tweaks to make the code a bit more like snapd's (which is even messier as it has more cases :-), for example renaming the field to requestedRestart.

)

// Handler can handle restart requests and whether expected reboots happen.
19 changes: 18 additions & 1 deletion internals/overlord/servstate/handlers.go
@@ -525,10 +525,27 @@ func (s *serviceData) exited(exitCode int) error {
s.transition(stateExited)

case plan.ActionShutdown:
logger.Noticef("Service %q %s action is %q, triggering server exit", s.config.Name, onType, action)
shutdownStr := "success"
restartType := restart.RestartDaemon
if exitCode != 0 {
shutdownStr = "failure"
restartType = restart.RestartServiceFailure
}
logger.Noticef("Service %q %s action is %q, triggering %s shutdown",
s.config.Name, onType, action, shutdownStr)
s.manager.restarter.HandleRestart(restartType)
s.transition(stateExited)

case plan.ActionSuccessShutdown:
logger.Noticef("Service %q %s action is %q, triggering success shutdown", s.config.Name, onType, action)
s.manager.restarter.HandleRestart(restart.RestartDaemon)
s.transition(stateExited)

case plan.ActionFailureShutdown:
logger.Noticef("Service %q %s action is %q, triggering failure shutdown", s.config.Name, onType, action)
s.manager.restarter.HandleRestart(restart.RestartServiceFailure)
s.transition(stateExited)

case plan.ActionRestart:
s.doBackoff(action, onType)

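
To summarise the handling above, here is a condensed hypothetical helper (not part of the PR; it assumes plan.ServiceAction is the action type and uses the repo's import paths) showing which restart type each shutdown-style action requests, and hence which daemon exit code results.

package servstate

import (
	"github.com/canonical/pebble/internals/overlord/restart"
	"github.com/canonical/pebble/internals/plan"
)

// restartTypeFor condenses the shutdown-related branches of exited():
// which restart type each action requests from the daemon.
func restartTypeFor(action plan.ServiceAction, exitCode int) restart.RestartType {
	switch action {
	case plan.ActionShutdown:
		// Plain "shutdown" keys off the service's own exit code.
		if exitCode != 0 {
			return restart.RestartServiceFailure // daemon exits with code 10
		}
		return restart.RestartDaemon // daemon exits with code 0
	case plan.ActionSuccessShutdown:
		return restart.RestartDaemon // always exit code 0
	case plan.ActionFailureShutdown:
		return restart.RestartServiceFailure // always exit code 10
	default:
		return restart.RestartUnset // no shutdown requested
	}
}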