diff --git a/pkg/storage/chunk/client/aws/s3_storage_client.go b/pkg/storage/chunk/client/aws/s3_storage_client.go
index 12fea874e311f..2b5458af6af52 100644
--- a/pkg/storage/chunk/client/aws/s3_storage_client.go
+++ b/pkg/storage/chunk/client/aws/s3_storage_client.go
@@ -21,12 +21,15 @@ import (
 	"github.com/aws/aws-sdk-go/service/s3"
 	"github.com/aws/aws-sdk-go/service/s3/s3iface"
 	awscommon "github.com/grafana/dskit/aws"
+	"github.com/grafana/dskit/backoff"
 	"github.com/grafana/dskit/flagext"
 	"github.com/grafana/dskit/instrument"
 	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
+	amnet "k8s.io/apimachinery/pkg/util/net"
+
 	bucket_s3 "github.com/grafana/loki/v3/pkg/storage/bucket/s3"
 	"github.com/grafana/loki/v3/pkg/storage/chunk/client"
 	"github.com/grafana/loki/v3/pkg/storage/chunk/client/hedging"
@@ -532,5 +535,61 @@ func (a *S3ObjectClient) IsObjectNotFoundErr(err error) bool {
 	return false
 }
 
-// TODO(dannyk): implement for client
-func (a *S3ObjectClient) IsRetryableErr(error) bool { return false }
+func isTimeoutError(err error) bool {
+	var netErr net.Error
+	return errors.As(err, &netErr) && netErr.Timeout()
+}
+
+func isContextErr(err error) bool {
+	return errors.Is(err, context.DeadlineExceeded) ||
+		errors.Is(err, context.Canceled)
+}
+
+// IsStorageTimeoutErr returns true if error means that object cannot be retrieved right now due to server-side timeouts.
+func (a *S3ObjectClient) IsStorageTimeoutErr(err error) bool {
+	// TODO(dannyk): move these out to be generic
+	// context errors are all client-side
+	if isContextErr(err) {
+		return false
+	}
+
+	// connection misconfiguration, or writing on a closed connection
+	// do NOT retry; this is not a server-side issue
+	if errors.Is(err, net.ErrClosed) || amnet.IsConnectionRefused(err) {
+		return false
+	}
+
+	// this is a server-side timeout
+	if isTimeoutError(err) {
+		return true
+	}
+
+	// connection closed (closed before established) or reset (closed after established)
+	// this is a server-side issue
+	if errors.Is(err, io.EOF) || amnet.IsConnectionReset(err) {
+		return true
+	}
+
+	if rerr, ok := err.(awserr.RequestFailure); ok {
+		// https://docs.aws.amazon.com/sdkref/latest/guide/feature-retry-behavior.html
+		return rerr.StatusCode() == http.StatusRequestTimeout ||
+			rerr.StatusCode() == http.StatusGatewayTimeout
+	}
+
+	return false
+}
+
+// IsStorageThrottledErr returns true if error means that object cannot be retrieved right now due to throttling.
+func (a *S3ObjectClient) IsStorageThrottledErr(err error) bool {
+	if rerr, ok := err.(awserr.RequestFailure); ok {
+		// https://docs.aws.amazon.com/sdkref/latest/guide/feature-retry-behavior.html
+		return rerr.StatusCode() == http.StatusTooManyRequests ||
+			(rerr.StatusCode()/100 == 5) // all 5xx errors are retryable
+	}
+
+	return false
+}
+
+func (a *S3ObjectClient) IsRetryableErr(err error) bool {
+	return a.IsStorageTimeoutErr(err) || a.IsStorageThrottledErr(err)
+}
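The dskit/backoff import added above suggests the new classification is meant to drive a retry loop elsewhere in the client. A minimal sketch of how a caller could consume IsRetryableErr follows; the wrapper name, the backoff settings, and the assumption that GetObject keeps its (io.ReadCloser, int64, error) signature are illustrative, not part of this patch:

// Illustrative sketch only: retry GetObject while the client classifies the
// failure as retryable (server-side timeout or throttling). Assumes it sits
// next to the client code, with "time" and the backoff import above available.
func getObjectWithRetries(ctx context.Context, c *S3ObjectClient, key string) (io.ReadCloser, int64, error) {
	retries := backoff.New(ctx, backoff.Config{
		MinBackoff: 100 * time.Millisecond, // assumed values, tune per workload
		MaxBackoff: 3 * time.Second,
		MaxRetries: 5,
	})
	var lastErr error
	for retries.Ongoing() {
		r, size, err := c.GetObject(ctx, key)
		if err == nil {
			return r, size, nil
		}
		if !c.IsRetryableErr(err) {
			return nil, 0, err
		}
		lastErr = err
		retries.Wait()
	}
	if lastErr == nil {
		lastErr = retries.Err()
	}
	return nil, 0, lastErr
}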
diff --git a/pkg/storage/chunk/client/aws/s3_storage_client_test.go b/pkg/storage/chunk/client/aws/s3_storage_client_test.go
index 3a2c1e8dc33c3..ba2939ff46884 100644
--- a/pkg/storage/chunk/client/aws/s3_storage_client_test.go
+++ b/pkg/storage/chunk/client/aws/s3_storage_client_test.go
@@ -6,9 +6,11 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"net"
 	"net/http"
 	"net/http/httptest"
 	"strings"
+	"syscall"
 	"testing"
 	"time"
 
@@ -73,6 +75,108 @@ func TestIsObjectNotFoundErr(t *testing.T) {
 	}
 }
 
+func TestIsRetryableErr(t *testing.T) {
+	tests := []struct {
+		err      error
+		expected bool
+		name     string
+	}{
+		{
+			name: "IsStorageThrottledErr - Too Many Requests",
+			err: awserr.NewRequestFailure(
+				awserr.New("TooManyRequests", "TooManyRequests", nil), 429, "reqId",
+			),
+			expected: true,
+		},
+		{
+			name: "IsStorageThrottledErr - 500",
+			err: awserr.NewRequestFailure(
+				awserr.New("500", "500", nil), 500, "reqId",
+			),
+			expected: true,
+		},
+		{
+			name: "IsStorageThrottledErr - 5xx",
+			err: awserr.NewRequestFailure(
+				awserr.New("501", "501", nil), 501, "reqId",
+			),
+			expected: true,
+		},
+		{
+			name: "IsStorageTimeoutErr - Request Timeout",
+			err: awserr.NewRequestFailure(
+				awserr.New("Request Timeout", "Request Timeout", nil), 408, "reqId",
+			),
+			expected: true,
+		},
+		{
+			name: "IsStorageTimeoutErr - Gateway Timeout",
+			err: awserr.NewRequestFailure(
+				awserr.New("Gateway Timeout", "Gateway Timeout", nil), 504, "reqId",
+			),
+			expected: true,
+		},
+		{
+			name:     "IsStorageTimeoutErr - EOF",
+			err:      io.EOF,
+			expected: true,
+		},
+		{
+			name:     "IsStorageTimeoutErr - Connection Reset",
+			err:      syscall.ECONNRESET,
+			expected: true,
+		},
+		{
+			name: "IsStorageTimeoutErr - Timeout Error",
+			err: awserr.NewRequestFailure(
+				awserr.New("RequestCanceled", "request canceled due to timeout", nil), 408, "request-id",
+			),
+			expected: true,
+		},
+		{
+			name:     "IsStorageTimeoutErr - Closed",
+			err:      net.ErrClosed,
+			expected: false,
+		},
+		{
+			name:     "IsStorageTimeoutErr - Connection Refused",
+			err:      syscall.ECONNREFUSED,
+			expected: false,
+		},
+		{
+			name:     "IsStorageTimeoutErr - Context Deadline Exceeded",
+			err:      context.DeadlineExceeded,
+			expected: false,
+		},
+		{
+			name:     "IsStorageTimeoutErr - Context Canceled",
+			err:      context.Canceled,
+			expected: false,
+		},
+		{
+			name:     "Not a retryable error",
+			err:      syscall.EINVAL,
+			expected: false,
+		},
+		{
+			name: "Not found 404",
+			err: awserr.NewRequestFailure(
+				awserr.New("404", "404", nil), 404, "reqId",
+			),
+			expected: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			client, err := NewS3ObjectClient(S3Config{BucketNames: "mybucket"}, hedging.Config{})
+			require.NoError(t, err)
+
+			require.Equal(t, tt.expected, client.IsRetryableErr(tt.err))
+		})
+	}
+}
+
 func TestRequestMiddleware(t *testing.T) {
 	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		fmt.Fprintln(w, r.Header.Get("echo-me"))
diff --git a/production/helm/loki/src/alerts.yaml.tpl b/production/helm/loki/src/alerts.yaml.tpl
index 144e263f7061f..0aa37b708b523 100644
--- a/production/helm/loki/src/alerts.yaml.tpl
+++ b/production/helm/loki/src/alerts.yaml.tpl
@@ -52,7 +52,7 @@ groups:
           message: |
             {{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
         expr: |
-          sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+          sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
         for: "5m"
         labels:
           severity: "warning"
diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml
index 7c0825d8580d6..09b9b6f543412 100644
--- a/production/loki-mixin-compiled-ssd/alerts.yaml
+++ b/production/loki-mixin-compiled-ssd/alerts.yaml
@@ -4,12 +4,12 @@ groups:
   - alert: LokiRequestErrors
     annotations:
       description: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+        {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
       summary: Loki request error rate is high.
     expr: |
-      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
         /
-      sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+      sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
         > 10
     for: 15m
     labels:
@@ -17,16 +17,16 @@ groups:
   - alert: LokiRequestPanics
     annotations:
       description: |
-        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+        {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
       summary: Loki requests are causing code panics.
     expr: |
-      sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+      sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
     labels:
       severity: critical
   - alert: LokiRequestLatency
     annotations:
       description: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+        {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
       summary: Loki request error latency is high.
     expr: |
       cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
@@ -39,7 +39,7 @@ groups:
         {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
       summary: Loki deployment is running more than one compactor.
     expr: |
-      sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+      sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
     for: 5m
     labels:
       severity: warning
diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json b/production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json
index 6e669361a057c..72b6eaf785b42 100644
--- a/production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json
+++ b/production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json
@@ -325,7 +325,7 @@
                "sort": "none"
             }
          },
-         "span": 1,
+         "span": 3,
          "targets": [
             {
               "expr": "sum by(pod) (loki_write_memory_streams{cluster=~\"$cluster\", job=~\"($namespace)/(loki.*|enterprise-logs)-write\"})",
@@ -414,7 +414,7 @@
                "sort": "none"
             }
          },
-         "span": 1,
+         "span": 3,
          "targets": [
            {
               "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\"}[$__rate_interval]))",
@@ -515,7 +515,7 @@
                "sort": "none"
             }
          },
-         "span": 1,
+         "span": 3,
          "targets": [
            {
               "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\"})",
@@ -577,7 +577,7 @@
                "sort": "none"
             }
          },
-         "span": 1,
+         "span": 3,
          "targets": [
            {
               "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(loki.*|enterprise-logs)-write\"})",
@@ -591,7 +591,19 @@
               "sort": 2
            },
            "type": "timeseries"
-         },
+         }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": true,
+      "title": "Write path",
+      "titleSize": "h6"
+   },
+   {
+      "collapse": false,
+      "height": "250px",
+      "panels": [
          {
             "datasource": "$datasource",
             "fieldConfig": {
@@ -627,7 +639,7 @@
                "sort": "none"
             }
          },
-         "span": 1,
+         "span": 4,
          "targets": [
            {
               "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
@@ -674,7 +686,7 @@
                "sort": "none"
             }
          },
-         "span": 1,
+         "span": 4,
          "targets": [
            {
               "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
@@ -721,7 +733,7 @@
                "sort": "none"
             }
          },
-         "span": 1,
+         "span": 4,
          "targets": [
            {
               "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*(loki.*|enterprise-logs)-write.*\"} / kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*(loki.*|enterprise-logs)-write.*\"})",
@@ -738,7 +750,7 @@
       "repeatIteration": null,
       "repeatRowId": null,
       "showTitle": true,
-      "title": "Write path",
+      "title": "",
       "titleSize": "h6"
    },
    {
@@ -819,7 +831,7 @@
                "sort": "none"
             }
          },
-         "span": 2,
+         "span": 4,
          "targets": [
            {
               "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\"}[$__rate_interval]))",
@@ -920,7 +932,7 @@
                "sort": "none"
             }
          },
-         "span": 2,
+         "span": 4,
          "targets": [
            {
               "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\"})",
@@ -982,7 +994,7 @@
                "sort": "none"
             }
          },
-         "span": 2,
+         "span": 4,
          "targets": [
            {
               "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/(loki.*|enterprise-logs)-backend\"})",
@@ -996,7 +1008,19 @@
               "sort": 2
            },
            "type": "timeseries"
-         },
+         }
+      ],
+      "repeat": null,
+      "repeatIteration": null,
+      "repeatRowId": null,
+      "showTitle": true,
+      "title": "Backend path",
+      "titleSize": "h6"
+   },
+   {
+      "collapse": false,
+      "height": "250px",
+      "panels": [
          {
             "datasource": "$datasource",
             "fieldConfig": {
@@ -1032,7 +1056,7 @@
                "sort": "none"
             }
          },
-         "span": 2,
+         "span": 4,
          "targets": [
            {
               "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
@@ -1079,7 +1103,7 @@
                "sort": "none"
             }
          },
-         "span": 2,
+         "span": 4,
          "targets": [
            {
               "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
@@ -1126,7 +1150,7 @@
                "sort": "none"
             }
          },
-         "span": 2,
+         "span": 4,
          "targets": [
            {
               "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*(loki.*|enterprise-logs)-backend.*\"} / kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\".*(loki.*|enterprise-logs)-backend.*\"})",
@@ -1143,7 +1167,7 @@
       "repeatIteration": null,
       "repeatRowId": null,
       "showTitle": true,
-      "title": "Backend path",
+      "title": "",
       "titleSize": "h6"
    }
 ],
diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml
index 7c0825d8580d6..09b9b6f543412 100644
--- a/production/loki-mixin-compiled/alerts.yaml
+++ b/production/loki-mixin-compiled/alerts.yaml
@@ -4,12 +4,12 @@ groups:
   - alert: LokiRequestErrors
     annotations:
       description: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+        {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
       summary: Loki request error rate is high.
     expr: |
-      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
         /
-      sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+      sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
         > 10
     for: 15m
     labels:
@@ -17,16 +17,16 @@ groups:
   - alert: LokiRequestPanics
     annotations:
      description: |
-        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+        {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
       summary: Loki requests are causing code panics.
     expr: |
-      sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+      sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
     labels:
       severity: critical
   - alert: LokiRequestLatency
     annotations:
       description: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+        {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
       summary: Loki request error latency is high.
     expr: |
       cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
@@ -39,7 +39,7 @@ groups:
        {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
       summary: Loki deployment is running more than one compactor.
     expr: |
-      sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+      sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
     for: 5m
     labels:
       severity: warning
diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet
index 5bff18e72c6e5..9261dbccecf99 100644
--- a/production/loki-mixin/alerts.libsonnet
+++ b/production/loki-mixin/alerts.libsonnet
@@ -6,36 +6,36 @@
         rules: [
           {
             alert: 'LokiRequestErrors',
-            expr: |||
-              100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+            expr: std.strReplace(|||
+              100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
                 /
-              sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+              sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
                 > 10
-            |||,
+            |||, 'cluster', $._config.per_cluster_label),
             'for': '15m',
             labels: {
               severity: 'critical',
             },
             annotations: {
               summary: 'Loki request error rate is high.',
-              description: |||
-                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
-              |||,
+              description: std.strReplace(|||
+                {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||, 'cluster', $._config.per_cluster_label),
             },
           },
           {
             alert: 'LokiRequestPanics',
             expr: |||
-              sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
-            |||,
+              sum(increase(loki_panic_total[10m])) by (%s, namespace, job) > 0
+            ||| % $._config.per_cluster_label,
             labels: {
               severity: 'critical',
             },
             annotations: {
               summary: 'Loki requests are causing code panics.',
-              description: |||
-                {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
-              |||,
+              description: std.strReplace(|||
+                {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+              |||, 'cluster', $._config.per_cluster_label),
             },
           },
           {
@@ -49,15 +49,15 @@
             },
             annotations: {
               summary: 'Loki request error latency is high.',
-              description: |||
-                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
-              |||,
+              description: std.strReplace(|||
+                {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||, 'cluster', $._config.per_cluster_label),
             },
           },
           {
             alert: 'LokiTooManyCompactorsRunning',
             expr: |||
-              sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1
+              sum(loki_boltdb_shipper_compactor_running) by (%s, namespace) > 1
             ||| % $._config.per_cluster_label,
             'for': '5m',
             labels: {
diff --git a/production/loki-mixin/dashboards/loki-resources-overview.libsonnet b/production/loki-mixin/dashboards/loki-resources-overview.libsonnet
index 76c77a1453446..a93df5d42e41d 100644
--- a/production/loki-mixin/dashboards/loki-resources-overview.libsonnet
+++ b/production/loki-mixin/dashboards/loki-resources-overview.libsonnet
@@ -41,7 +41,7 @@
         ) +
         {
           tooltip: { sort: 2 },  // Sort descending.
-        },
+        }
       )
       .addPanel(
         $.CPUUsagePanel('CPU', write_pod_matcher),
@@ -52,6 +52,9 @@
       .addPanel(
         $.goHeapInUsePanel('Memory (go heap inuse)', write_job_matcher),
       )
+    )
+    .addRow(
+      $.row('')
       .addPanel(
         $.newQueryPanel('Disk Writes', 'Bps') +
         $.queryPanel(
@@ -83,6 +86,9 @@
       .addPanel(
         $.goHeapInUsePanel('Memory (go heap inuse)', backend_job_matcher),
       )
+    )
+    .addRow(
+      $.row('')
       .addPanel(
         $.newQueryPanel('Disk Writes', 'Bps') +
         $.queryPanel(
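In the libsonnet source, both the alert expressions and the annotation text are now templated through $._config.per_cluster_label (via std.strReplace and %s), so a consumer of the mixin can rename the cluster label consistently in one place. A minimal sketch of such an override follows; the import path and the replacement label value are illustrative assumptions, not part of this patch:

// Hypothetical mixin consumer: every per-cluster 'cluster' reference in the
// generated alerts is rewritten to 'k8s_cluster' by the config override.
(import 'mixin.libsonnet') + {
  _config+:: {
    per_cluster_label: 'k8s_cluster',
  },
}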