Skip to content

Commit

Permalink
fix: rolling deployments race (#2790)
Browse files Browse the repository at this point in the history
If we delete the deployments/runners straight away, they may still be in
some controllers' route tables. By adding a small delay, we make sure that
all the controllers will have updated their tables.

This is a pretty nasty hack, but will likely be temporary.

fixes: #2789
  • Loading branch information
stuartwdouglas authored Sep 23, 2024
1 parent 82c5898 commit 2580e66
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
16 changes: 11 additions & 5 deletions backend/controller/scaling/k8sscaling/deployment_provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,17 @@ func (r *DeploymentProvisioner) handleSchemaChange(ctx context.Context, msg *ftl
case ftlv1.DeploymentChangeType_DEPLOYMENT_REMOVED:
delete(r.KnownDeployments, msg.DeploymentKey)
if deploymentExists {
logger.Infof("deleting deployment %s", msg.ModuleName)
err := deploymentClient.Delete(ctx, msg.DeploymentKey, v1.DeleteOptions{})
if err != nil {
return fmt.Errorf("failed to delete deployment %s: %w", msg.ModuleName, err)
}
go func() {

// Nasty hack, we want all the controllers to have updated their route tables before we kill the runner
// so we add a slight delay here
time.Sleep(time.Second * 10)
logger.Infof("deleting deployment %s", msg.ModuleName)
err := deploymentClient.Delete(ctx, msg.DeploymentKey, v1.DeleteOptions{})
if err != nil {
logger.Errorf(err, "failed to delete deployment %s", msg.ModuleName)
}
}()
}
}
return nil
Expand Down
7 changes: 6 additions & 1 deletion backend/controller/scaling/localscaling/local_scaling.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,12 @@ func (l *localScaling) reconcileRunners(ctx context.Context, deploymentRunners *
return err
}
} else if deploymentRunners.replicas == 0 && deploymentRunners.runner.Ok() {
deploymentRunners.runner.MustGet().cancelFunc()
go func() {
// Nasty hack, we want all the controllers to have updated their route tables before we kill the runner
// so we add a slight delay here
time.Sleep(time.Second * 10)
deploymentRunners.runner.MustGet().cancelFunc()
}()
deploymentRunners.runner = optional.None[runnerInfo]()
}
return nil
Expand Down

0 comments on commit 2580e66

Please sign in to comment.