Skip to content

Commit

Permalink
fix: don't retry indefinitely with local scaling (#2974)
Browse files Browse the repository at this point in the history
If the runner keeps crashing, don't keep restarting it
  • Loading branch information
stuartwdouglas authored Oct 3, 2024
1 parent 91d1822 commit c726b29
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion backend/controller/scaling/localscaling/local_scaling.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (

// Compile-time assertion that *localScaling implements scaling.RunnerScaling.
var _ scaling.RunnerScaling = &localScaling{}

// maxExits is the limit on a deployment's exits counter: reconcileRunners
// only starts a runner while the counter is below this value, and
// startRunner logs an error once the counter reaches it.
const maxExits = 10

type localScaling struct {
lock sync.Mutex
cacheDir string
Expand Down Expand Up @@ -71,6 +73,7 @@ type deploymentInfo struct {
replicas int32
key string
language string
exits int
}
type runnerInfo struct {
cancelFunc context.CancelFunc
Expand Down Expand Up @@ -133,7 +136,7 @@ func (l *localScaling) handleSchemaChange(ctx context.Context, msg *ftlv1.PullSc
func (l *localScaling) reconcileRunners(ctx context.Context, deploymentRunners *deploymentInfo) error {
// Must be called under lock
logger := log.FromContext(ctx)
if deploymentRunners.replicas > 0 && !deploymentRunners.runner.Ok() {
if deploymentRunners.replicas > 0 && !deploymentRunners.runner.Ok() && deploymentRunners.exits < maxExits {
if err := l.startRunner(ctx, deploymentRunners.key, deploymentRunners); err != nil {
logger.Errorf(err, "Failed to start runner")
return err
Expand Down Expand Up @@ -205,6 +208,10 @@ func (l *localScaling) startRunner(ctx context.Context, deploymentKey string, in
}
l.lock.Lock()
defer l.lock.Unlock()
info.exits++
if info.exits >= maxExits {
logger.Errorf(fmt.Errorf("too many restarts"), "Runner failed too many times, not restarting")
}
info.runner = optional.None[runnerInfo]()
if l.debugPorts[info.module] == debug {
delete(l.debugPorts, info.module)
Expand Down

0 comments on commit c726b29

Please sign in to comment.