From 47ce2f33f2b0870cdba67b045db43ba8b014fdcc Mon Sep 17 00:00:00 2001 From: hugoShaka Date: Wed, 18 Dec 2024 13:10:11 -0500 Subject: [PATCH] refuse to start the update if the rollout start date is in window --- lib/autoupdate/rollout/reconciler.go | 2 +- lib/autoupdate/rollout/strategy.go | 11 ++++ .../rollout/strategy_haltonerror.go | 35 +++++++--- .../rollout/strategy_haltonerror_test.go | 38 +++++++++-- lib/autoupdate/rollout/strategy_test.go | 65 +++++++++++++++++-- lib/autoupdate/rollout/strategy_timebased.go | 18 ++--- 6 files changed, 142 insertions(+), 27 deletions(-) diff --git a/lib/autoupdate/rollout/reconciler.go b/lib/autoupdate/rollout/reconciler.go index b9899ac97ac4e..a6264ba3cbf27 100644 --- a/lib/autoupdate/rollout/reconciler.go +++ b/lib/autoupdate/rollout/reconciler.go @@ -153,7 +153,7 @@ func (r *reconciler) tryReconcile(ctx context.Context) error { return trace.Wrap(err, "computing rollout status") } - // we must figure if something changed + // We compute if something changed. specChanged := !proto.Equal(existingRollout.GetSpec(), newSpec) statusChanged := !proto.Equal(existingRollout.GetStatus(), newStatus) rolloutChanged := specChanged || statusChanged diff --git a/lib/autoupdate/rollout/strategy.go b/lib/autoupdate/rollout/strategy.go index aec5703a8f6ac..b3e214f6cebd7 100644 --- a/lib/autoupdate/rollout/strategy.go +++ b/lib/autoupdate/rollout/strategy.go @@ -33,6 +33,7 @@ const ( // Common update reasons updateReasonCreated = "created" updateReasonReconcilerError = "reconciler_error" + updateReasonRolloutChanged = "rollout_changed_during_window" ) // rolloutStrategy is responsible for rolling out the update across groups. @@ -53,6 +54,16 @@ func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time return int(group.ConfigStartHour) == now.Hour(), nil } +// rolloutChangedInWindow checks if the rollout got created after the theoretical group start time +func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time) (bool, error) { + // If the rollout is older than 24h, we know it did not change during the window + if now.Sub(rolloutStart) > 24*time.Hour { + return false, nil + } + // Else we check if the rollout happened in the group window. + return inWindow(group, rolloutStart) +} + func canUpdateToday(allowedDays []string, now time.Time) (bool, error) { for _, allowedDay := range allowedDays { if allowedDay == types.Wildcard { diff --git a/lib/autoupdate/rollout/strategy_haltonerror.go b/lib/autoupdate/rollout/strategy_haltonerror.go index c93438aaa1941..6f35b599e1149 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror.go +++ b/lib/autoupdate/rollout/strategy_haltonerror.go @@ -60,7 +60,7 @@ func newHaltOnErrorStrategy(log *slog.Logger, clock clockwork.Clock) (rolloutStr }, nil } -func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error { +func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus) error { now := h.clock.Now() // We process every group in order, all the previous groups must be in the DONE state // for the next group to become active. Even if some early groups are not DONE, @@ -72,12 +72,12 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*aut // to transition "staging" to DONE. previousGroupsAreDone := true - for i, group := range groups { + for i, group := range status.Groups { switch group.State { case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED: var previousGroup *autoupdate.AutoUpdateAgentRolloutStatusGroup if i != 0 { - previousGroup = groups[i-1] + previousGroup = status.Groups[i-1] } canStart, err := canStartHaltOnError(group, previousGroup, now) if err != nil { @@ -86,16 +86,31 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*aut setGroupState(group, group.State, updateReasonReconcilerError, now) return err } + + // Check if the rollout got created after the theoretical group start time + rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime()) + if err != nil { + setGroupState(group, group.State, updateReasonReconcilerError, now) + return err + } + switch { - case previousGroupsAreDone && canStart: - // We can start - setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now) - case previousGroupsAreDone: - // All previous groups are OK, but time-related criteria are not OK + case !previousGroupsAreDone: + // All previous groups are not DONE + setGroupState(group, group.State, updateReasonPreviousGroupsNotDone, now) + case !canStart: + // All previous groups are DONE, but time-related criteria are not met + // This can be because we are outside an update window, or because the + // specified wait_hours doesn't let us update yet. setGroupState(group, group.State, updateReasonCannotStart, now) + case rolloutChangedDuringWindow: + // All previous groups are DONE and time-related criterias are met. + // However, the rollout changed during the maintenance window. + setGroupState(group, group.State, updateReasonRolloutChanged, now) default: - // At least one previous group is not DONE - setGroupState(group, group.State, updateReasonPreviousGroupsNotDone, now) + // All previous groups are DONE and time-related criterias are met. + // We can start. + setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now) } previousGroupsAreDone = false case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK: diff --git a/lib/autoupdate/rollout/strategy_haltonerror_test.go b/lib/autoupdate/rollout/strategy_haltonerror_test.go index 71a653c760361..84d4de069efe3 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror_test.go +++ b/lib/autoupdate/rollout/strategy_haltonerror_test.go @@ -148,9 +148,10 @@ func Test_progressGroupsHaltOnError(t *testing.T) { group3Name := "group3" tests := []struct { - name string - initialState []*autoupdate.AutoUpdateAgentRolloutStatusGroup - expectedState []*autoupdate.AutoUpdateAgentRolloutStatusGroup + name string + initialState []*autoupdate.AutoUpdateAgentRolloutStatusGroup + rolloutStartTime *timestamppb.Timestamp + expectedState []*autoupdate.AutoUpdateAgentRolloutStatusGroup }{ { name: "single group unstarted -> unstarted", @@ -175,6 +176,30 @@ func Test_progressGroupsHaltOnError(t *testing.T) { }, }, }, + { + name: "single group unstarted -> unstarted because rollout changed in window", + initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(yesterday), + LastUpdateReason: updateReasonCreated, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + rolloutStartTime: timestamppb.New(clock.Now()), + expectedState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ + { + Name: group1Name, + State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED, + LastUpdateTime: timestamppb.New(clock.Now()), + LastUpdateReason: updateReasonRolloutChanged, + ConfigDays: canStartToday, + ConfigStartHour: matchingStartHour, + }, + }, + }, { name: "single group unstarted -> active", initialState: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{ @@ -470,7 +495,12 @@ func Test_progressGroupsHaltOnError(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - err := strategy.progressRollout(ctx, tt.initialState) + status := &autoupdate.AutoUpdateAgentRolloutStatus{ + Groups: tt.initialState, + State: 0, + StartTime: tt.rolloutStartTime, + } + err := strategy.progressRollout(ctx, status) require.NoError(t, err) // We use require.Equal instead of Elements match because group order matters. // It's not super important for time-based, but is crucial for halt-on-error. diff --git a/lib/autoupdate/rollout/strategy_test.go b/lib/autoupdate/rollout/strategy_test.go index fbb7ec768d644..1348716ba6c1d 100644 --- a/lib/autoupdate/rollout/strategy_test.go +++ b/lib/autoupdate/rollout/strategy_test.go @@ -151,13 +151,70 @@ func Test_inWindow(t *testing.T) { } } +func Test_rolloutChangedInWindow(t *testing.T) { + // Test setup: creating fixtures. + group := &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + Name: "test-group", + ConfigDays: everyWeekdayButSunday, + ConfigStartHour: 12, + } + tests := []struct { + name string + now time.Time + rolloutStart time.Time + want bool + }{ + { + name: "zero rollout start time", + now: testSaturday, + rolloutStart: time.Time{}, + want: false, + }, + { + name: "epoch rollout start time", + now: testSaturday, + // tspb counts since epoch, wile go's zero is 0000-00-00 00:00:00 UTC + rolloutStart: (×tamppb.Timestamp{}).AsTime(), + want: false, + }, + { + name: "rollout changed a week ago", + now: testSaturday, + rolloutStart: testSaturday.Add(-7 * 24 * time.Hour), + want: false, + }, + { + name: "rollout changed the same day, before the window", + now: testSaturday, + rolloutStart: testSaturday.Add(-2 * time.Hour), + want: false, + }, + { + name: "rollout changed the same day, during the window", + now: testSaturday, + rolloutStart: testSaturday.Add(-2 * time.Minute), + want: true, + }, + { + name: "rollout just changed but we are not in a window", + now: testSunday, + rolloutStart: testSunday.Add(-2 * time.Minute), + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Test execution. + result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart) + require.NoError(t, err) + require.Equal(t, tt.want, result) + }) + } +} + func Test_setGroupState(t *testing.T) { groupName := "test-group" - // TODO(hugoShaka) remove those two variables once the strategies are merged and the constants are defined. - updateReasonCanStart := "can_start" - updateReasonCannotStart := "cannot_start" - clock := clockwork.NewFakeClock() // oldUpdateTime is 5 minutes in the past oldUpdateTime := clock.Now() diff --git a/lib/autoupdate/rollout/strategy_timebased.go b/lib/autoupdate/rollout/strategy_timebased.go index d5abd1334bc5f..06cad82ab8348 100644 --- a/lib/autoupdate/rollout/strategy_timebased.go +++ b/lib/autoupdate/rollout/strategy_timebased.go @@ -20,19 +20,16 @@ package rollout import ( "context" + "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" + update "github.com/gravitational/teleport/api/types/autoupdate" "github.com/gravitational/trace" "github.com/jonboulle/clockwork" "log/slog" - "time" - - "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" - update "github.com/gravitational/teleport/api/types/autoupdate" ) const ( - updateReasonInWindow = "in_window" - updateReasonOutsideWindow = "outside_window" - updateReasonRolloutChanged = "rollout_changed_during_window" + updateReasonInWindow = "in_window" + updateReasonOutsideWindow = "outside_window" ) type timeBasedStrategy struct { @@ -79,7 +76,12 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd } // Check if the rollout got created after the theoretical group start time - rolloutChangedDuringWindow := status.StartTime.AsTime().After(now.Truncate(time.Hour)) + rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime()) + if err != nil { + setGroupState(group, group.State, updateReasonReconcilerError, now) + errs = append(errs, err) + continue + } switch { case !shouldBeActive: