diff --git a/pkg/config/agentmanagement.go b/pkg/config/agentmanagement.go index ec331cf48737..1f50ccc639d4 100644 --- a/pkg/config/agentmanagement.go +++ b/pkg/config/agentmanagement.go @@ -42,6 +42,7 @@ type remoteConfigProvider interface { GetCachedRemoteConfig() ([]byte, error) CacheRemoteConfig(remoteConfigBytes []byte) error FetchRemoteConfig() ([]byte, error) + GetPollingInterval() time.Duration } type remoteConfigHTTPProvider struct { @@ -192,6 +193,10 @@ func (r remoteConfigHTTPProvider) FetchRemoteConfig() ([]byte, error) { return bb, nil } +func (r remoteConfigHTTPProvider) GetPollingInterval() time.Duration { + return r.InitialConfig.PollingInterval +} + type labelMap map[string]string type RemoteConfiguration struct { @@ -233,9 +238,20 @@ func getRemoteConfig(expandEnvVars bool, configProvider remoteConfigProvider, lo if err != nil { var retryAfterErr retryAfterError if errors.As(err, &retryAfterErr) && retry { - level.Error(log).Log("msg", "received retry-after from API, sleeping and falling back to cache", "retry-after", retryAfterErr.retryAfter) - time.Sleep(retryAfterErr.retryAfter) - return getRemoteConfig(expandEnvVars, configProvider, log, fs, false) + // In the case that the server is telling us to retry after a time greater than our polling interval, + // the agent should sleep for the duration of the retry-after header. + // + // If the duration of the retry-after is lower than the polling interval, the agent will simply + // fall back to the cache and continue polling at the polling interval, effectively skipping + // this poll. + if retryAfterErr.retryAfter > configProvider.GetPollingInterval() { + level.Info(log).Log("msg", "received retry-after from API, sleeping and falling back to cache", "retry-after", retryAfterErr.retryAfter) + time.Sleep(retryAfterErr.retryAfter) + } else { + level.Info(log).Log("msg", "received retry-after from API, falling back to cache", "retry-after", retryAfterErr.retryAfter) + } + // Return the cached config, as this is the last known good config and a config must be returned here. + return getCachedRemoteConfig(expandEnvVars, configProvider, fs, log) } level.Error(log).Log("msg", "could not fetch from API, falling back to cache", "err", err) return getCachedRemoteConfig(expandEnvVars, configProvider, fs, log) diff --git a/pkg/config/agentmanagement_test.go b/pkg/config/agentmanagement_test.go index 1152ff7cac75..9ca5b453f75f 100644 --- a/pkg/config/agentmanagement_test.go +++ b/pkg/config/agentmanagement_test.go @@ -48,6 +48,10 @@ func (t *testRemoteConfigProvider) CacheRemoteConfig(r []byte) error { return nil } +func (t *testRemoteConfigProvider) GetPollingInterval() time.Duration { + return t.InitialConfig.PollingInterval +} + var validAgentManagementConfig = AgentManagementConfig{ Enabled: true, Host: "localhost:1234", @@ -566,9 +570,8 @@ func TestGetCachedConfig_RetryAfter(t *testing.T) { assert.NoError(t, err) assert.False(t, testProvider.didCacheRemoteConfig) - // check that FetchRemoteConfig was called twice on the TestProvider: - // 1 call for the initial attempt, a second for the retry - assert.Equal(t, 2, testProvider.fetchRemoteConfigCallCount) + // check that FetchRemoteConfig was called only once on the TestProvider + assert.Equal(t, 1, testProvider.fetchRemoteConfigCallCount) // the cached config should have been retrieved once, on the second // attempt to fetch the remote config