diff --git a/CHANGELOG.md b/CHANGELOG.md index 34ddaec07fe9..16d90d3a32de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,6 +87,9 @@ Main (unreleased) - Flow: improve river config validation step in `prometheus.scrape` by comparing `scrape_timeout` with `scrape_interval`. (@wildum) +- Flow: add `randomization_factor` and `multiplier` to retry settings in + `otelcol` components. (@rfratto) + - Add support for `windows_certificate_filter` under http tls config block. (@mattdurham) - Add `openstack` config converter to convert OpenStack yaml config (static mode) to river config (flow mode). (@wildum) diff --git a/component/otelcol/config_retry.go b/component/otelcol/config_retry.go index 227246b01f49..8b1d7b35991a 100644 --- a/component/otelcol/config_retry.go +++ b/component/otelcol/config_retry.go @@ -1,27 +1,37 @@ package otelcol import ( + "fmt" "time" - "github.com/cenkalti/backoff/v4" + "github.com/grafana/river" otelexporterhelper "go.opentelemetry.io/collector/exporter/exporterhelper" ) // RetryArguments holds shared settings for components which can retry // requests. type RetryArguments struct { - Enabled bool `river:"enabled,attr,optional"` - InitialInterval time.Duration `river:"initial_interval,attr,optional"` - MaxInterval time.Duration `river:"max_interval,attr,optional"` - MaxElapsedTime time.Duration `river:"max_elapsed_time,attr,optional"` + Enabled bool `river:"enabled,attr,optional"` + InitialInterval time.Duration `river:"initial_interval,attr,optional"` + RandomizationFactor float64 `river:"randomization_factor,attr,optional"` + Multiplier float64 `river:"multiplier,attr,optional"` + MaxInterval time.Duration `river:"max_interval,attr,optional"` + MaxElapsedTime time.Duration `river:"max_elapsed_time,attr,optional"` } +var ( + _ river.Defaulter = (*RetryArguments)(nil) + _ river.Validator = (*RetryArguments)(nil) +) + // DefaultRetryArguments holds default settings for RetryArguments. 
var DefaultRetryArguments = RetryArguments{ - Enabled: true, - InitialInterval: 5 * time.Second, - MaxInterval: 30 * time.Second, - MaxElapsedTime: 5 * time.Minute, + Enabled: true, + InitialInterval: 5 * time.Second, + RandomizationFactor: 0.5, + Multiplier: 1.5, + MaxInterval: 30 * time.Second, + MaxElapsedTime: 5 * time.Minute, } // SetToDefault implements river.Defaulter. @@ -29,6 +39,19 @@ func (args *RetryArguments) SetToDefault() { *args = DefaultRetryArguments } +// Validate implements river.Validator, rejecting a Multiplier <= 1 or a negative RandomizationFactor. +func (args *RetryArguments) Validate() error { + if args.Multiplier <= 1 { + return fmt.Errorf("multiplier must be greater than 1.0") + } + + if args.RandomizationFactor < 0 { + return fmt.Errorf("randomization_factor must be greater or equal to 0") + } + + return nil +} + // Convert converts args into the upstream type. func (args *RetryArguments) Convert() *otelexporterhelper.RetrySettings { if args == nil { @@ -38,8 +61,8 @@ func (args *RetryArguments) Convert() *otelexporterhelper.RetrySettings { return &otelexporterhelper.RetrySettings{ Enabled: args.Enabled, InitialInterval: args.InitialInterval, - RandomizationFactor: backoff.DefaultRandomizationFactor, - Multiplier: backoff.DefaultMultiplier, + RandomizationFactor: args.RandomizationFactor, + Multiplier: args.Multiplier, MaxInterval: args.MaxInterval, MaxElapsedTime: args.MaxElapsedTime, } diff --git a/docs/sources/shared/flow/reference/components/otelcol-retry-block.md b/docs/sources/shared/flow/reference/components/otelcol-retry-block.md index b15911d1f4eb..3fb04e99ad1d 100644 --- a/docs/sources/shared/flow/reference/components/otelcol-retry-block.md +++ b/docs/sources/shared/flow/reference/components/otelcol-retry-block.md @@ -14,14 +14,23 @@ Name | Type | Description | Default | Required ---- | ---- | ----------- | ------- | -------- `enabled` | `boolean` | Enables retrying failed requests. 
| `true` | no `initial_interval` | `duration` | Initial time to wait before retrying a failed request. | `"5s"` | no +`randomization_factor` | `number` | Factor to randomize wait time before retrying. | `0.5` | no +`multiplier` | `number` | Factor to grow wait time before retrying. | `1.5` | no `max_interval` | `duration` | Maximum time to wait between retries. | `"30s"` | no `max_elapsed_time` | `duration` | Maximum amount of time to wait before discarding a failed batch. | `"5m"` | no When `enabled` is `true`, failed batches are retried after a given interval. The `initial_interval` argument specifies how long to wait before the first retry attempt. If requests continue to fail, the time to wait before retrying -increases exponentially. The `max_interval` argument specifies the upper bound -of how long to wait between retries. +increases by the factor specified by the `multiplier` argument, which must be +greater than `1.0`. The `max_interval` argument specifies the upper bound of +how long to wait between retries. + +The `randomization_factor` argument is useful for adding jitter between retrying agents. +If `randomization_factor` is greater than `0`, the wait time before retries is +chosen at random from the range +`[ I - randomization_factor * I, I + randomization_factor * I]`, +where `I` is the current interval. If a batch has not sent successfully, it is discarded after the time specified by `max_elapsed_time` elapses. If `max_elapsed_time` is set to `"0s"`, failed