Skip to content

Commit

Permalink
Improve Swarm support (#333)
Browse files Browse the repository at this point in the history
* Query for labeled services as well

* Try scaling down services

* Scale services back up

* Use progress tool from Docker CLI

* In test, label both services

* Clean up error and log messages

* Document scale-up/down approach in docs

* Downgrade Docker CLI to match client

* Document services stats

* Do not rely on PreviousSpec for storing desired replica count

* Log warnings from Docker when updating services

* Check whether container and service labels collide

* Document script behavior on label collision

* Add additional check if all containers have been removed

* Scale services concurrently

* Move docker interaction code into own file

* Factor out code for service updating

* Time out after five minutes of not reaching desired container count

* Inline handling of in-swarm container level restart

* Timer is more suitable for timeout race

* Timeout when scaling down services should be configurable

* Choose better filename

* Reflect changes in naming

* Rename and deprecate BACKUP_STOP_CONTAINER_LABEL

* Improve logging

* Further simplify logging
  • Loading branch information
m90 authored Jan 31, 2024
1 parent 2065fb2 commit c3daeac
Show file tree
Hide file tree
Showing 18 changed files with 642 additions and 147 deletions.
4 changes: 3 additions & 1 deletion cmd/backup/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ type Config struct {
BackupRetentionDays int32 `split_words:"true" default:"-1"`
BackupPruningLeeway time.Duration `split_words:"true" default:"1m"`
BackupPruningPrefix string `split_words:"true"`
BackupStopContainerLabel string `split_words:"true" default:"true"`
BackupStopContainerLabel string `split_words:"true"`
BackupStopDuringBackupLabel string `split_words:"true" default:"true"`
BackupStopServiceTimeout time.Duration `split_words:"true" default:"5m"`
BackupFromSnapshot bool `split_words:"true"`
BackupExcludeRegexp RegexpDecoder `split_words:"true"`
BackupSkipBackendsFromPrune []string `split_words:"true"`
Expand Down
4 changes: 2 additions & 2 deletions cmd/backup/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ func main() {
}()

s.must(s.withLabeledCommands(lifecyclePhaseArchive, func() error {
restartContainers, err := s.stopContainers()
restartContainersAndServices, err := s.stopContainersAndServices()
// The mechanism for restarting containers is not using hooks as it
// should happen as soon as possible (i.e. before uploading backups or
// similar).
defer func() {
s.must(restartContainers())
s.must(restartContainersAndServices())
}()
if err != nil {
return err
Expand Down
128 changes: 1 addition & 127 deletions cmd/backup/script.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ package main

import (
"bytes"
"context"
"errors"
"fmt"
"io"
"io/fs"
Expand All @@ -30,10 +28,6 @@ import (
openpgp "github.com/ProtonMail/go-crypto/openpgp/v2"
"github.com/containrrr/shoutrrr"
"github.com/containrrr/shoutrrr/pkg/router"
"github.com/docker/docker/api/types"
ctr "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/api/types/swarm"
"github.com/docker/docker/client"
"github.com/leekchan/timeutil"
"github.com/offen/envconfig"
Expand Down Expand Up @@ -318,126 +312,6 @@ func newScript() (*script, error) {
return s, nil
}

// stopContainers stops all Docker containers that are marked to be stopped
// during the backup and returns a function that can be called to restart
// everything that has been stopped.
//
// Containers are selected by the label
// `docker-volume-backup.stop-during-backup=<BackupStopContainerLabel>`.
// When no Docker client is configured, or no container matches the label,
// nothing is stopped and the returned function is a noop.
//
// The returned function is never nil, so callers may invoke it even when a
// non-nil error is returned; it then restarts only the containers that were
// actually stopped. The returned error aggregates every failure encountered
// while stopping containers.
func (s *script) stopContainers() (func() error, error) {
	if s.cli == nil {
		return noop, nil
	}

	// The unfiltered list is only used for logging and stats.
	allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{})
	if err != nil {
		return noop, fmt.Errorf("stopContainers: error querying for containers: %w", err)
	}

	containerLabel := fmt.Sprintf(
		"docker-volume-backup.stop-during-backup=%s",
		s.c.BackupStopContainerLabel,
	)
	containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{
		Filters: filters.NewArgs(filters.KeyValuePair{
			Key:   "label",
			Value: containerLabel,
		}),
	})
	if err != nil {
		return noop, fmt.Errorf("stopContainers: error querying for containers to stop: %w", err)
	}

	if len(containersToStop) == 0 {
		return noop, nil
	}

	s.logger.Info(
		fmt.Sprintf(
			"Stopping %d container(s) labeled `%s` out of %d running container(s).",
			len(containersToStop),
			containerLabel,
			len(allContainers),
		),
	)

	// Keep going on failure so that as many containers as possible are
	// stopped; only successfully stopped ones are restarted later on.
	var stoppedContainers []types.Container
	var stopErrors []error
	for _, container := range containersToStop {
		if err := s.cli.ContainerStop(context.Background(), container.ID, ctr.StopOptions{}); err != nil {
			stopErrors = append(stopErrors, err)
			continue
		}
		stoppedContainers = append(stoppedContainers, container)
	}

	var stopError error
	if len(stopErrors) != 0 {
		stopError = fmt.Errorf(
			"stopContainers: %d error(s) stopping containers: %w",
			len(stopErrors),
			errors.Join(stopErrors...),
		)
	}

	s.stats.Containers = ContainersStats{
		All:        uint(len(allContainers)),
		ToStop:     uint(len(containersToStop)),
		Stopped:    uint(len(stoppedContainers)),
		StopErrors: uint(len(stopErrors)),
	}

	return func() error {
		// Containers that belong to a Swarm service are not started
		// directly. Instead, their service is force-updated once, no matter
		// how many of its containers have been stopped.
		servicesRequiringUpdate := map[string]struct{}{}

		var restartErrors []error
		for _, container := range stoppedContainers {
			if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok {
				servicesRequiringUpdate[swarmServiceName] = struct{}{}
				continue
			}
			if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil {
				restartErrors = append(restartErrors, err)
			}
		}

		if len(servicesRequiringUpdate) != 0 {
			services, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
			if err != nil {
				// Without the service list no service can be resolved, so
				// report the query failure itself instead of a misleading
				// "couldn't find service" error per service.
				restartErrors = append(
					restartErrors,
					fmt.Errorf("error querying for services: %w", err),
				)
			} else {
				for serviceName := range servicesRequiringUpdate {
					var serviceMatch swarm.Service
					for _, service := range services {
						if service.Spec.Name == serviceName {
							serviceMatch = service
							break
						}
					}
					if serviceMatch.ID == "" {
						// Record the failure and carry on so the remaining
						// services still get restarted.
						restartErrors = append(
							restartErrors,
							fmt.Errorf("couldn't find service with name %s", serviceName),
						)
						continue
					}
					// Bumping ForceUpdate makes Swarm redeploy the service's
					// tasks even though the spec is otherwise unchanged.
					serviceMatch.Spec.TaskTemplate.ForceUpdate += 1
					if _, err := s.cli.ServiceUpdate(
						context.Background(), serviceMatch.ID,
						serviceMatch.Version, serviceMatch.Spec, types.ServiceUpdateOptions{},
					); err != nil {
						restartErrors = append(restartErrors, err)
					}
				}
			}
		}

		if len(restartErrors) != 0 {
			return fmt.Errorf(
				"stopContainers: %d error(s) restarting containers and services: %w",
				len(restartErrors),
				errors.Join(restartErrors...),
			)
		}
		s.logger.Info(
			fmt.Sprintf(
				"Restarted %d container(s) and the matching service(s).",
				len(stoppedContainers),
			),
		)
		return nil
	}, stopError
}

// createArchive creates a tar archive of the configured backup location and
// saves it to disk.
func (s *script) createArchive() error {
Expand All @@ -448,7 +322,7 @@ func (s *script) createArchive() error {
"Using BACKUP_FROM_SNAPSHOT has been deprecated and will be removed in the next major version.",
)
s.logger.Warn(
"Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the README for an upgrade guide.",
"Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the documentation for an upgrade guide.",
)
backupSources = filepath.Join("/tmp", s.c.BackupSources)
// Copy before compressing to guard against a situation where the backup folder's contents are still growing.
Expand Down
10 changes: 10 additions & 0 deletions cmd/backup/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@ type ContainersStats struct {
StopErrors uint
}

// ServicesStats holds counters describing the Swarm services that have
// been operated upon.
//
// NOTE(review): the per-field descriptions below are inferred from the field
// names; confirm them against the code that populates this struct.
type ServicesStats struct {
	// All: presumably the number of services known to the Swarm.
	// ToScaleDown: presumably the number of services selected for scaling down.
	All, ToScaleDown uint
	// ScaledDown: presumably the number of services actually scaled down.
	// ScaleDownErrors: presumably the number of failed scale-down attempts.
	ScaledDown, ScaleDownErrors uint
}

// BackupFileStats contains stats about the created backup file
type BackupFileStats struct {
Name string
Expand All @@ -40,6 +49,7 @@ type Stats struct {
LockedTime time.Duration
LogOutput *bytes.Buffer
Containers ContainersStats
Services ServicesStats
BackupFile BackupFileStats
Storages map[string]StorageStats
}
Loading

0 comments on commit c3daeac

Please sign in to comment.