Skip to content

Commit

Permalink
[Agent Management] Modify Retry-After behavior (#5350)
Browse files Browse the repository at this point in the history
* Modify retry-after handling to only sleep if retry-after > polling_interval, otherwise it simply falls back to cache and will skip this poll
  • Loading branch information
spartan0x117 authored Oct 5, 2023
1 parent c3cf57e commit 642aa95
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 6 deletions.
22 changes: 19 additions & 3 deletions pkg/config/agentmanagement.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ type remoteConfigProvider interface {
GetCachedRemoteConfig() ([]byte, error)
CacheRemoteConfig(remoteConfigBytes []byte) error
FetchRemoteConfig() ([]byte, error)
GetPollingInterval() time.Duration
}

type remoteConfigHTTPProvider struct {
Expand Down Expand Up @@ -192,6 +193,10 @@ func (r remoteConfigHTTPProvider) FetchRemoteConfig() ([]byte, error) {
return bb, nil
}

func (r remoteConfigHTTPProvider) GetPollingInterval() time.Duration {
return r.InitialConfig.PollingInterval
}

type labelMap map[string]string

type RemoteConfiguration struct {
Expand Down Expand Up @@ -233,9 +238,20 @@ func getRemoteConfig(expandEnvVars bool, configProvider remoteConfigProvider, lo
if err != nil {
var retryAfterErr retryAfterError
if errors.As(err, &retryAfterErr) && retry {
level.Error(log).Log("msg", "received retry-after from API, sleeping and falling back to cache", "retry-after", retryAfterErr.retryAfter)
time.Sleep(retryAfterErr.retryAfter)
return getRemoteConfig(expandEnvVars, configProvider, log, fs, false)
// In the case that the server is telling us to retry after a time greater than our polling interval,
// the agent should sleep for the duration of the retry-after header.
//
// If the duration of the retry-after is lower than the polling interval, the agent will simply
// fall back to the cache and continue polling at the polling interval, effectively skipping
// this poll.
if retryAfterErr.retryAfter > configProvider.GetPollingInterval() {
level.Info(log).Log("msg", "received retry-after from API, sleeping and falling back to cache", "retry-after", retryAfterErr.retryAfter)
time.Sleep(retryAfterErr.retryAfter)
} else {
level.Info(log).Log("msg", "received retry-after from API, falling back to cache", "retry-after", retryAfterErr.retryAfter)
}
// Return the cached config, as this is the last known good config and a config must be returned here.
return getCachedRemoteConfig(expandEnvVars, configProvider, fs, log)
}
level.Error(log).Log("msg", "could not fetch from API, falling back to cache", "err", err)
return getCachedRemoteConfig(expandEnvVars, configProvider, fs, log)
Expand Down
9 changes: 6 additions & 3 deletions pkg/config/agentmanagement_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ func (t *testRemoteConfigProvider) CacheRemoteConfig(r []byte) error {
return nil
}

func (t *testRemoteConfigProvider) GetPollingInterval() time.Duration {
return t.InitialConfig.PollingInterval
}

var validAgentManagementConfig = AgentManagementConfig{
Enabled: true,
Host: "localhost:1234",
Expand Down Expand Up @@ -566,9 +570,8 @@ func TestGetCachedConfig_RetryAfter(t *testing.T) {
assert.NoError(t, err)
assert.False(t, testProvider.didCacheRemoteConfig)

// check that FetchRemoteConfig was called twice on the TestProvider:
// 1 call for the initial attempt, a second for the retry
assert.Equal(t, 2, testProvider.fetchRemoteConfigCallCount)
// check that FetchRemoteConfig was called only once on the TestProvider
assert.Equal(t, 1, testProvider.fetchRemoteConfigCallCount)

// the cached config should have been retrieved once, on the second
// attempt to fetch the remote config
Expand Down

0 comments on commit 642aa95

Please sign in to comment.