Skip to content

Commit

Permalink
set rollout start date and don't start updating if rollout just changed
Browse files Browse the repository at this point in the history
This commit does two changes:
- the controller now sets the rollout start time when resetting the
  rollout
- the controller will not start a group if the rollout changed during
  the maintenance window (checks if the rollout start time is in the
  window)
  • Loading branch information
hugoShaka committed Dec 19, 2024
1 parent 860a9d4 commit e285c9e
Show file tree
Hide file tree
Showing 11 changed files with 232 additions and 57 deletions.
1 change: 1 addition & 0 deletions api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions api/proto/teleport/autoupdate/v1/autoupdate.proto
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ message AutoUpdateAgentRolloutStatus {
// For example, a group updates every day between 13:00 and 14:00. If the target version changes to 13:30, the group
// will not start updating to the new version directly. The controller sees that the group theoretical start time is
// before the rollout start time and the maintenance window belongs to the previous rollout.
// When the timestamp is nil, the controller will ignore the start time and check and allow groups to activate.
google.protobuf.Timestamp start_time = 3;
}

Expand Down
1 change: 1 addition & 0 deletions lib/autoupdate/rollout/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ func NewController(client Client, log *slog.Logger, clock clockwork.Clock, perio
reconciler: reconciler{
clt: client,
log: log,
clock: clock,
rolloutStrategies: []rolloutStrategy{
// TODO(hugoShaka): add the strategies here as we implement them
},
Expand Down
12 changes: 7 additions & 5 deletions lib/autoupdate/rollout/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func (r *reconciler) tryReconcile(ctx context.Context) error {
return trace.Wrap(err, "computing rollout status")
}

// there was an existing rollout, we must figure if something changed
// We compute if something changed.
specChanged := !proto.Equal(existingRollout.GetSpec(), newSpec)
statusChanged := !proto.Equal(existingRollout.GetStatus(), newStatus)
rolloutChanged := specChanged || statusChanged
Expand Down Expand Up @@ -273,6 +273,8 @@ func (r *reconciler) computeStatus(
// We create a new status if the rollout should be reset or the previous status was nil
if shouldResetRollout || existingRollout.GetStatus() == nil {
status = new(autoupdate.AutoUpdateAgentRolloutStatus)
// We set the start time if this is a new rollout
status.StartTime = timestamppb.New(r.clock.Now())
} else {
status = utils.CloneProtoMsg(existingRollout.GetStatus())
}
Expand Down Expand Up @@ -302,16 +304,16 @@ func (r *reconciler) computeStatus(
return nil, trace.Wrap(err, "creating groups status")
}
}
status.Groups = groups

err = r.progressRollout(ctx, newSpec.GetStrategy(), groups)
err = r.progressRollout(ctx, newSpec.GetStrategy(), status)
// Failing to progress the update is not a hard failure.
// We want to update the status even if something went wrong to surface the failed reconciliation and potential errors to the user.
if err != nil {
r.log.ErrorContext(ctx, "Errors encountered during rollout progress. Some groups might not get updated properly.",
"error", err)
}

status.Groups = groups
status.State = computeRolloutState(groups)
return status, nil
}
Expand All @@ -320,10 +322,10 @@ func (r *reconciler) computeStatus(
// groups are updated in place.
// If an error is returned, the groups should still be upserted, depending on the strategy,
// failing to update a group might not be fatal (other groups can still progress independently).
func (r *reconciler) progressRollout(ctx context.Context, strategyName string, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error {
func (r *reconciler) progressRollout(ctx context.Context, strategyName string, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
for _, strategy := range r.rolloutStrategies {
if strategy.name() == strategyName {
return strategy.progressRollout(ctx, groups)
return strategy.progressRollout(ctx, status)
}
}
return trace.NotImplemented("rollout strategy %q not implemented", strategyName)
Expand Down
57 changes: 36 additions & 21 deletions lib/autoupdate/rollout/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ func TestTryReconcile(t *testing.T) {
t.Parallel()
log := utils.NewSlogLoggerForTests()
ctx := context.Background()
clock := clockwork.NewFakeClock()

// Test setup: creating fixtures
configOK, err := update.NewAutoUpdateConfig(&autoupdate.AutoUpdateConfigSpec{
Tools: &autoupdate.AutoUpdateConfigSpecTools{
Expand Down Expand Up @@ -186,7 +188,7 @@ func TestTryReconcile(t *testing.T) {
Strategy: update.AgentsStrategyHaltOnError,
})
require.NoError(t, err)
upToDateRollout.Status = &autoupdate.AutoUpdateAgentRolloutStatus{}
upToDateRollout.Status = &autoupdate.AutoUpdateAgentRolloutStatus{StartTime: timestamppb.New(clock.Now())}

outOfDateRollout, err := update.NewAutoUpdateAgentRollout(&autoupdate.AutoUpdateAgentRolloutSpec{
StartVersion: "1.2.2",
Expand Down Expand Up @@ -315,8 +317,9 @@ func TestTryReconcile(t *testing.T) {
// Test execution: Running the reconciliation

reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

require.NoError(t, reconciler.tryReconcile(ctx))
Expand All @@ -330,6 +333,7 @@ func TestTryReconcile(t *testing.T) {
func TestReconciler_Reconcile(t *testing.T) {
log := utils.NewSlogLoggerForTests()
ctx := context.Background()
clock := clockwork.NewFakeClock()
// Test setup: creating fixtures
config, err := update.NewAutoUpdateConfig(&autoupdate.AutoUpdateConfigSpec{
Tools: &autoupdate.AutoUpdateConfigSpecTools{
Expand Down Expand Up @@ -361,7 +365,7 @@ func TestReconciler_Reconcile(t *testing.T) {
Strategy: update.AgentsStrategyHaltOnError,
})
require.NoError(t, err)
upToDateRollout.Status = &autoupdate.AutoUpdateAgentRolloutStatus{}
upToDateRollout.Status = &autoupdate.AutoUpdateAgentRolloutStatus{StartTime: timestamppb.New(clock.Now())}

outOfDateRollout, err := update.NewAutoUpdateAgentRollout(&autoupdate.AutoUpdateAgentRolloutSpec{
StartVersion: "1.2.2",
Expand Down Expand Up @@ -407,8 +411,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand All @@ -431,8 +436,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand Down Expand Up @@ -471,8 +477,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand Down Expand Up @@ -509,8 +516,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand All @@ -533,8 +541,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand Down Expand Up @@ -563,8 +572,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand Down Expand Up @@ -704,7 +714,7 @@ func (f *fakeRolloutStrategy) name() string {
return f.strategyName
}

func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error {
func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
f.calls++
return nil
}
Expand Down Expand Up @@ -742,8 +752,9 @@ func Test_reconciler_computeStatus(t *testing.T) {
newGroups, err := r.makeGroupsStatus(ctx, schedules, clock.Now())
require.NoError(t, err)
newStatus := &autoupdate.AutoUpdateAgentRolloutStatus{
Groups: newGroups,
State: autoupdate.AutoUpdateAgentRolloutState_AUTO_UPDATE_AGENT_ROLLOUT_STATE_UNSTARTED,
Groups: newGroups,
State: autoupdate.AutoUpdateAgentRolloutState_AUTO_UPDATE_AGENT_ROLLOUT_STATE_UNSTARTED,
StartTime: timestamppb.New(clock.Now()),
}

tests := []struct {
Expand Down Expand Up @@ -835,15 +846,19 @@ func Test_reconciler_computeStatus(t *testing.T) {
Strategy: fakeRolloutStrategyName,
},
// groups should be unset
expectedStatus: &autoupdate.AutoUpdateAgentRolloutStatus{},
expectedStatus: &autoupdate.AutoUpdateAgentRolloutStatus{
StartTime: timestamppb.New(clock.Now()),
},
expectedStrategyCalls: 0,
},
{
name: "new groups are populated if previous ones were empty",
existingRollout: &autoupdate.AutoUpdateAgentRollout{
Spec: oldSpec,
// old groups were empty
Status: &autoupdate.AutoUpdateAgentRolloutStatus{},
Status: &autoupdate.AutoUpdateAgentRolloutStatus{
StartTime: timestamppb.New(clock.Now()),
},
},
// no spec change
newSpec: oldSpec,
Expand Down
13 changes: 12 additions & 1 deletion lib/autoupdate/rollout/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@ const (
// Common update reasons
updateReasonCreated = "created"
updateReasonReconcilerError = "reconciler_error"
updateReasonRolloutChanged = "rollout_changed_during_window"
)

// rolloutStrategy is responsible for rolling out the update across groups.
// This interface allows us to inject dummy strategies for simpler testing.
type rolloutStrategy interface {
name() string
progressRollout(context.Context, []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error
progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutStatus) error
}

func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) {
Expand All @@ -53,6 +54,16 @@ func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time
return int(group.ConfigStartHour) == now.Hour(), nil
}

// rolloutChangedInWindow checks if the rollout got created after the theoretical group start time
func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time) (bool, error) {
// If the rollout is older than 24h, we know it did not change during the window
if now.Sub(rolloutStart) > 24*time.Hour {
return false, nil
}
// Else we check if the rollout happened in the group window.
return inWindow(group, rolloutStart)
}

func canUpdateToday(allowedDays []string, now time.Time) (bool, error) {
for _, allowedDay := range allowedDays {
if allowedDay == types.Wildcard {
Expand Down
39 changes: 27 additions & 12 deletions lib/autoupdate/rollout/strategy_haltonerror.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func newHaltOnErrorStrategy(log *slog.Logger, clock clockwork.Clock) (rolloutStr
}, nil
}

func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error {
func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
now := h.clock.Now()
// We process every group in order, all the previous groups must be in the DONE state
// for the next group to become active. Even if some early groups are not DONE,
Expand All @@ -72,12 +72,12 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*aut
// to transition "staging" to DONE.
previousGroupsAreDone := true

for i, group := range groups {
for i, group := range status.Groups {
switch group.State {
case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED:
var previousGroup *autoupdate.AutoUpdateAgentRolloutStatusGroup
if i != 0 {
previousGroup = groups[i-1]
previousGroup = status.Groups[i-1]
}
canStart, err := canStartHaltOnError(group, previousGroup, now)
if err != nil {
Expand All @@ -86,16 +86,31 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*aut
setGroupState(group, group.State, updateReasonReconcilerError, now)
return err
}

// Check if the rollout got created after the theoretical group start time
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime())
if err != nil {
setGroupState(group, group.State, updateReasonReconcilerError, now)
return err
}

switch {
case previousGroupsAreDone && canStart:
// We can start
setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now)
case previousGroupsAreDone:
// All previous groups are OK, but time-related criteria are not OK
case !previousGroupsAreDone:
// All previous groups are not DONE
setGroupState(group, group.State, updateReasonPreviousGroupsNotDone, now)
case !canStart:
// All previous groups are DONE, but time-related criteria are not met
// This can be because we are outside an update window, or because the
// specified wait_hours doesn't let us update yet.
setGroupState(group, group.State, updateReasonCannotStart, now)
case rolloutChangedDuringWindow:
// All previous groups are DONE and time-related criteria are met.
// However, the rollout changed during the maintenance window.
setGroupState(group, group.State, updateReasonRolloutChanged, now)
default:
// At least one previous group is not DONE
setGroupState(group, group.State, updateReasonPreviousGroupsNotDone, now)
// All previous groups are DONE and time-related criteria are met.
// We can start.
setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now)
}
previousGroupsAreDone = false
case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK:
Expand Down Expand Up @@ -132,10 +147,10 @@ func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRollout

previousStart := previousGroup.StartTime.AsTime()
if previousStart.IsZero() || previousStart.Unix() == 0 {
return false, trace.BadParameter("the previous group doesn't have a start time, cannot check the 'wait_hours' criteria")
return false, trace.BadParameter("the previous group doesn't have a start time, cannot check the 'wait_hours' criterion")
}

// Check if the wait_hours criteria is OK, if we are at least after 'wait_hours' hours since the previous start.
// Check if the wait_hours criterion is OK, if we are at least after 'wait_hours' hours since the previous start.
if now.Before(previousGroup.StartTime.AsTime().Add(time.Duration(group.ConfigWaitHours) * time.Hour)) {
return false, nil
}
Expand Down
Loading

0 comments on commit e285c9e

Please sign in to comment.