Skip to content

Commit

Permalink
feat: maintain idle pool of runners (#1038)
Browse files Browse the repository at this point in the history
Adds `--idle-runners` arg to define how large the idle pool should be.

Fixes #1036
Fixes #1030

### Previous notes
Currently a draft because this PR makes #1036 more likely to be hit.
Before this change, killing all deployments would mean there are 0
runners, so no runner id collisions occur when you bring up more
deployments.
After this change, killing all deployments means that there will still
be runners, which will cause collisions if the idle runner ids aren't the
lowest possible [`R00000000000000000000002000`,
`R00000000000000000000004000` ... ]

I've been testing with a hacky fix, replacing line
`backend/controller/scaling/localscaling/local_scaling.go:96` with:
```
binary.BigEndian.PutUint32(ulid[10:], rand.Uint32())
```
  • Loading branch information
matt2e authored Mar 7, 2024
1 parent 847d1c8 commit 2b8f713
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 12 deletions.
5 changes: 3 additions & 2 deletions backend/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ type Config struct {
RunnerTimeout time.Duration `help:"Runner heartbeat timeout." default:"10s"`
DeploymentReservationTimeout time.Duration `help:"Deployment reservation timeout." default:"120s"`
ArtefactChunkSize int `help:"Size of each chunk streamed to the client." default:"1048576"`
IdleRunners int `help:"Number of idle runners to keep around (not supported in production)." default:"1"`
}

func (c *Config) SetDefaults() {
Expand Down Expand Up @@ -787,7 +788,7 @@ func (s *Service) reconcileDeployments(ctx context.Context) (time.Duration, erro
deploymentLogger.Debugf("Need %d more runners for %s", require, reconcile.Deployment)
wg.Go(func(ctx context.Context) error {
if err := s.deploy(ctx, deployment); err != nil {
deploymentLogger.Debugf("Failed to increase deployment replicas: %s", err)
deploymentLogger.Errorf(err, "Failed to increase deployment replicas")
} else {
deploymentLogger.Debugf("Reconciled %s to %d/%d replicas", reconcile.Deployment, reconcile.AssignedReplicas+1, reconcile.RequiredReplicas)
if reconcile.AssignedReplicas+1 == reconcile.RequiredReplicas {
Expand Down Expand Up @@ -825,7 +826,7 @@ func (s *Service) reconcileRunners(ctx context.Context) (time.Duration, error) {
return 0, fmt.Errorf("%s: %w", "failed to get deployments needing reconciliation", err)
}

totalRunners := 0
totalRunners := s.config.IdleRunners
for _, deployment := range activeDeployments {
totalRunners += deployment.MinReplicas
}
Expand Down
11 changes: 1 addition & 10 deletions backend/controller/scaling/localscaling/local_scaling.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package localscaling

import (
"context"
"encoding/binary"
"errors"
"fmt"
"net/url"
Expand Down Expand Up @@ -80,6 +79,7 @@ func (l *LocalScaling) SetReplicas(ctx context.Context, replicas int, idleRunner
Bind: l.portAllocator.Next(),
ControllerEndpoint: controllerEndpoint,
TemplateDir: templateDir(ctx),
Key: model.NewRunnerKey(),
}

name := fmt.Sprintf("runner%d", i)
Expand All @@ -90,15 +90,6 @@ func (l *LocalScaling) SetReplicas(ctx context.Context, replicas int, idleRunner
return err
}

// Create a readable ULID for the runner.
var ulid [16]byte
binary.BigEndian.PutUint32(ulid[10:], uint32(len(l.runners)+1))
ulidStr := fmt.Sprintf("%025X", ulid)
err := config.Key.Scan(ulidStr)
if err != nil {
return err
}

runnerCtx := log.ContextWithLogger(ctx, logger.Scope(name))

runnerCtx, cancel := context.WithCancel(runnerCtx)
Expand Down
2 changes: 2 additions & 0 deletions cmd/ftl/cmd_serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ type serveCmd struct {
Background bool `help:"Run in the background." default:"false"`
Stop bool `help:"Stop the running FTL instance. Can be used to --background to restart the server" default:"false"`
StartupTimeout time.Duration `help:"Timeout for the server to start up." default:"20s"`
IdleRunners int `help:"Number of idle runners to keep around (not supported in production)." default:"1"`
}

const ftlContainerName = "ftl-db-1"
Expand Down Expand Up @@ -100,6 +101,7 @@ func (s *serveCmd) Run(ctx context.Context) error {
DSN: dsn,
AllowOrigins: s.AllowOrigins,
NoConsole: s.NoConsole,
IdleRunners: s.IdleRunners,
}
if err := kong.ApplyDefaults(&config); err != nil {
return err
Expand Down

0 comments on commit 2b8f713

Please sign in to comment.