Timeout when scaling down services should be configurable

offen · Jan 28, 2024 · 8535967 · 8535967
1 parent e6ca4ac
commit 8535967
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 4 deletions.
diff --git a/cmd/backup/config.go b/cmd/backup/config.go
@@ -38,6 +38,7 @@ type Config struct {
 	BackupPruningLeeway           time.Duration   `split_words:"true" default:"1m"`
 	BackupPruningPrefix           string          `split_words:"true"`
 	BackupStopContainerLabel      string          `split_words:"true" default:"true"`
+	BackupStopServiceTimeout      time.Duration   `split_words:"true" default:"5m"`
 	BackupFromSnapshot            bool            `split_words:"true"`
 	BackupExcludeRegexp           RegexpDecoder   `split_words:"true"`
 	BackupSkipBackendsFromPrune   []string        `split_words:"true"`

diff --git a/cmd/backup/docker.go b/cmd/backup/docker.go
@@ -41,17 +41,18 @@ func scaleService(cli *client.Client, serviceID string, replicas uint64) ([]stri
 	return response.Warnings, nil
 }
 
-func awaitContainerCountForService(cli *client.Client, serviceID string, count int) error {
+func awaitContainerCountForService(cli *client.Client, serviceID string, count int, timeoutAfter time.Duration) error {
 	poll := time.NewTicker(time.Second)
-	timeout := time.NewTimer(5 * time.Minute)
+	timeout := time.NewTimer(timeoutAfter)
 	defer timeout.Stop()
 	defer poll.Stop()
 
 	for {
 		select {
 		case <-timeout.C:
 			return fmt.Errorf(
-				"awaitContainerCount: timed out after waiting 5 minutes for service %s to reach desired container count of %d",
+				"awaitContainerCount: timed out after waiting %s for service %s to reach desired container count of %d",
+				timeoutAfter,
 				serviceID,
 				count,
 			)
@@ -196,7 +197,7 @@ func (s *script) stopContainersAndServices() (func() error, error) {
 				}
 				// progress.ServiceProgress returns too early, so we need to manually check
 				// whether all containers belonging to the service have actually been removed
-				if err := awaitContainerCountForService(s.cli, svc.serviceID, 0); err != nil {
+				if err := awaitContainerCountForService(s.cli, svc.serviceID, 0, s.c.BackupStopServiceTimeout); err != nil {
 					scaleDownErrors.append(err)
 				}
 			}(svc)

diff --git a/docs/reference/index.md b/docs/reference/index.md
@@ -326,6 +326,14 @@ You can populate below template according to your requirements and use it as you
 
 # BACKUP_STOP_CONTAINER_LABEL="service1"
 
+# When scaling down Docker Swarm services, give up after the specified
+# amount of time if the service has not converged by then.
+# To adjust this timeout, set `BACKUP_STOP_SERVICE_TIMEOUT` to a duration
+# value in the format accepted by https://pkg.go.dev/time#ParseDuration
+# (e.g. "90s" or "10m"). Defaults to 5 minutes.
+
+# BACKUP_STOP_SERVICE_TIMEOUT="5m"
+
 ########### EXECUTING COMMANDS IN CONTAINERS PRE/POST BACKUP
 
 # It is possible to define commands to be run in any container before and after