diff --git a/.drone/drone.jsonnet b/.drone/drone.jsonnet index d9bfee681f2a3..6dcea160a78c4 100644 --- a/.drone/drone.jsonnet +++ b/.drone/drone.jsonnet @@ -640,6 +640,7 @@ local build_image_tag = '0.33.0'; 'GIT_TARGET_BRANCH="$DRONE_TARGET_BRANCH"', ]) { depends_on: ['loki'], when: onPRs }, make('validate-example-configs', container=false) { depends_on: ['loki'] }, + make('validate-dev-cluster-config', container=false) { depends_on: ['loki'] }, make('check-example-config-doc', container=false) { depends_on: ['clone'] }, { name: 'build-docs-website', diff --git a/.drone/drone.yml b/.drone/drone.yml index a4b2b44299cee..d45f7898a0851 100644 --- a/.drone/drone.yml +++ b/.drone/drone.yml @@ -303,6 +303,13 @@ steps: environment: {} image: grafana/loki-build-image:0.33.0 name: validate-example-configs +- commands: + - make BUILD_IN_CONTAINER=false validate-dev-cluster-config + depends_on: + - loki + environment: {} + image: grafana/loki-build-image:0.33.0 + name: validate-dev-cluster-config - commands: - make BUILD_IN_CONTAINER=false check-example-config-doc depends_on: @@ -2106,6 +2113,6 @@ kind: secret name: gpg_private_key --- kind: signature -hmac: 30f2fb121d8271e00dc2ae8fe83a32e0e22fd2bd268609d0c3f295033fcd4fb6 +hmac: fe7669a21410ae5f2d1ad6b6205fdc582af874f65f7bd6a679731a88174e3a1c ... diff --git a/CHANGELOG.md b/CHANGELOG.md index f3c35651550ce..84010533ed5e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,8 @@ ##### Enhancements -* [11571](https://github.com/grafana/loki/pull/11571) **MichelHollands**: Add a metrics.go log line for requests from querier to ingester +* [11633](https://github.com/grafana/loki/pull/11633) **cyriltovena**: Add profiling integrations to tracing instrumentation. +* [11571](https://github.com/grafana/loki/pull/11571) **MichelHollands**: Add a metrics.go log line for requests from querier to ingester * [11477](https://github.com/grafana/loki/pull/11477) **MichelHollands**: support GET for /ingester/shutdown * [11363](https://github.com/grafana/loki/pull/11363) **kavirajk**: bugfix(memcached): Make memcached batch fetch truely context aware. * [11319](https://github.com/grafana/loki/pull/11319) **someStrangerFromTheAbyss**: Helm: Add extraContainers to the write pods. @@ -46,6 +47,8 @@ * [11539](https://github.com/grafana/loki/pull/11539) **kaviraj,ashwanthgoli** Support caching /series and /labels query results * [11545](https://github.com/grafana/loki/pull/11545) **dannykopping** Force correct memcached timeout when fetching chunks. * [11589](https://github.com/grafana/loki/pull/11589) **ashwanthgoli** Results Cache: Adds `query_length_served` cache stat to measure the length of the query served from cache. +* [11535](https://github.com/grafana/loki/pull/11535) **dannykopping** Query Frontend: Allow customisable splitting of queries which overlap the `query_ingester_within` window to reduce query pressure on ingesters. +* [11654](https://github.com/grafana/loki/pull/11654) **dannykopping** Cache: atomically check background cache size limit correctly. ##### Fixes * [11074](https://github.com/grafana/loki/pull/11074) **hainenber** Fix panic in lambda-promtail due to mishandling of empty DROP_LABELS env var. @@ -54,6 +57,8 @@ * [11551](https://github.com/grafana/loki/pull/11551) **dannykopping** Do not reflect label names in request metrics' "route" label. * [11601](https://github.com/grafana/loki/pull/11601) **dannykopping** Ruler: Fixed a panic that can be caused by concurrent read-write access of tenant configs when there are a large amount of rules. 
* [11606](https://github.com/grafana/loki/pull/11606) **dannykopping** Fixed regression adding newlines to HTTP error response bodies which may break client integrations. +* [11657](https://github.com/grafana/loki/pull/11657) **ashwanthgoli** Log results cache: compose empty response based on the request being served to avoid returning incorrect limit or direction. +* [11587](https://github.com/grafana/loki/pull/11587) **trevorwhitney** Fix semantics of label parsing logic of metrics and logs queries. Both only parse the first label if multiple extractions into the same label are requested. ##### Changes @@ -64,6 +69,7 @@ * [10959](https://github.com/grafana/loki/pull/10959) **slim-bean** introduce a backoff wait on subquery retries. * [11121](https://github.com/grafana/loki/pull/11121) **periklis** Ensure all lifecycler cfgs ref a valid IPv6 addr and port combination * [10650](https://github.com/grafana/loki/pull/10650) **matthewpi** Ensure the frontend uses a valid IPv6 addr and port combination +* [11665](https://github.com/grafana/loki/pull/11665) **salvacorts** Deprecate and flip `-legacy-read-mode` flag to `false` by default. #### Promtail @@ -93,6 +99,7 @@ #### Mixins * [11087](https://github.com/grafana/loki/pull/11087) **JoaoBraveCoding**: Adds structured metadata panels for ingested data +* [11637](https://github.com/grafana/loki/pull/11637) **JoaoBraveCoding**: Add route to write Distributor Latency dashboard #### Fixes diff --git a/Makefile b/Makefile index 50938cac56d91..d311ed1c4f3c6 100644 --- a/Makefile +++ b/Makefile @@ -801,6 +801,9 @@ EXAMPLES_SKIP_VALIDATION_FLAG := "doc-example:skip-validation=true" validate-example-configs: loki for f in $$(grep -rL $(EXAMPLES_SKIP_VALIDATION_FLAG) $(EXAMPLES_YAML_PATH)/*.yaml); do echo "Validating provided example config: $$f" && ./cmd/loki/loki -config.file=$$f -verify-config || exit 1; done +validate-dev-cluster-config: loki + ./cmd/loki/loki -config.file=./tools/dev/loki-boltdb-storage-s3/config/loki.yaml -verify-config + # Dynamically generate ./docs/sources/configure/examples.md using the example configs that we provide. # This target should be run if any of our example configs change. generate-example-config-doc: diff --git a/cmd/loki/main.go b/cmd/loki/main.go index 845104eee8de5..937a5c16fab80 100644 --- a/cmd/loki/main.go +++ b/cmd/loki/main.go @@ -10,7 +10,9 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/log" + "github.com/grafana/dskit/spanprofiler" "github.com/grafana/dskit/tracing" + "github.com/opentracing/opentracing-go" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/version" @@ -84,7 +86,9 @@ func main() { if err != nil { level.Error(util_log.Logger).Log("msg", "error in initializing tracing. tracing will not be enabled", "err", err) } - + if config.Tracing.ProfilingEnabled { + opentracing.SetGlobalTracer(spanprofiler.NewTracer(opentracing.GlobalTracer())) + } defer func() { if trace != nil { if err := trace.Close(); err != nil { diff --git a/docs/sources/configure/_index.md b/docs/sources/configure/_index.md index e2185c19474f0..51ecb12af62f1 100644 --- a/docs/sources/configure/_index.md +++ b/docs/sources/configure/_index.md @@ -2884,6 +2884,12 @@ The `limits_config` block configures global and per-tenant limits in Loki. 
# CLI flag: -querier.split-metadata-queries-by-interval [split_metadata_queries_by_interval: | default = 1d] +# Interval to use for time-based splitting when a request is within the +# `query_ingesters_within` window; defaults to `split-queries-by-interval` by +# setting to 0. +# CLI flag: -querier.split-ingester-queries-by-interval +[split_ingester_queries_by_interval: | default = 0s] + # Limit queries that can be sharded. Queries within the time range of now and # now minus this sharding lookback are not sharded. The default value of 0s # disables the lookback, causing sharding of all queries at all times. @@ -3088,6 +3094,10 @@ shard_streams: # CLI flag: -bloom-compactor.enable-compaction [bloom_compactor_enable_compaction: | default = false] +# The batch size of the chunks the bloom-compactor downloads at once. +# CLI flag: -bloom-compactor.chunks-batch-size +[bloom_compactor_chunks_batch_size: | default = 100] + # Length of the n-grams created when computing blooms from log lines. # CLI flag: -bloom-compactor.ngram-length [bloom_ngram_length: | default = 4] diff --git a/docs/sources/query/query_examples.md b/docs/sources/query/query_examples.md index 7eb4145acf745..1298a36f4a915 100644 --- a/docs/sources/query/query_examples.md +++ b/docs/sources/query/query_examples.md @@ -50,7 +50,7 @@ These LogQL query examples have explanations of what the queries accomplish. != "grafana_com" |= "session opened" != "sudo: " - |regexp "(^(?P\\S+ {1,2}){11})" + | regexp "(^(?P\\S+ {1,2}){11})" | line_format "USER = {{.user}}" ``` diff --git a/docs/sources/send-data/fluentbit/_index.md b/docs/sources/send-data/fluentbit/_index.md index c9088fdc8f886..a8d052c7262a9 100644 --- a/docs/sources/send-data/fluentbit/_index.md +++ b/docs/sources/send-data/fluentbit/_index.md @@ -8,7 +8,9 @@ weight: 500 --- # Fluent Bit client -[Fluent Bit](https://fluentbit.io/) is a fast and lightweight logs and metrics processor and forwarder that can be configured with the [Grafana Loki output plugin](https://docs.fluentbit.io/manual/pipeline/outputs/loki) to ship logs to Loki. You can define which log files you want to collect using the [`Tail`](https://docs.fluentbit.io/manual/pipeline/inputs/tail) or [`Stdin`](https://docs.fluentbit.io/manual/pipeline/inputs/standard-input) data pipeline input. Additionally, Fluent Bit supports multiple `Filter` and `Parser` plugins (`Kubernetes`, `JSON`, etc.) to structure and alter log lines. +[Fluent Bit](https://fluentbit.io/) is a fast and lightweight logs and metrics processor and forwarder that can be configured with the Grafana Fluent Bit Plugin described here or with the [Fluent-bit Loki output plugin](https://docs.fluentbit.io/manual/pipeline/outputs/loki) to ship logs to Loki. +This plugin has more configuration options compared to the built-in Fluent Bit Loki plugin. +You can define which log files you want to collect using the [`Tail`](https://docs.fluentbit.io/manual/pipeline/inputs/tail) or [`Stdin`](https://docs.fluentbit.io/manual/pipeline/inputs/standard-input) data pipeline input. Additionally, Fluent Bit supports multiple `Filter` and `Parser` plugins (`Kubernetes`, `JSON`, etc.) to structure and alter log lines. ## Usage @@ -63,23 +65,59 @@ To ship logs from Docker containers to Grafana Cloud using Fluent Bit, you can u You can run Fluent Bit as a [Daemonset](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to collect all your Kubernetes workload logs. 
-To do so you can use our [Fluent Bit helm chart](https://github.com/grafana/helm-charts/tree/main/charts/fluent-bit): +To do so you can use the [Fluent Bit helm chart](https://github.com/fluent/helm-charts) with the following `values.yaml`, changing the value of `FLUENT_LOKI_URL`: + +```yaml +image: + # Here we use the Docker image which has the plugin installed + repository: grafana/fluent-bit-plugin-loki + tag: main-e2ed1c0 + +args: + - "-e" + - "/fluent-bit/bin/out_grafana_loki.so" + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + +env: + # Note that for security reasons you should fetch the credentials through a Kubernetes Secret (https://kubernetes.io/docs/concepts/configuration/secret/). You may use `envFrom` for this. + - name: FLUENT_LOKI_URL + value: https://user:pass@your-loki.endpoint/loki/api/v1/push + +config: + inputs: | + [INPUT] + Name tail + Tag kube.* + Path /var/log/containers/*.log + # Be aware that local clusters like docker-desktop or kind use the docker log format and not the cri (https://docs.fluentbit.io/manual/installation/kubernetes#container-runtime-interface-cri-parser) + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + + outputs: | + [Output] + Name grafana-loki + Match kube.* + Url ${FLUENT_LOKI_URL} + Labels {job="fluent-bit"} + LabelKeys level,app # this sets the values for actual Loki streams and the other labels are converted to structured_metadata https://grafana.com/docs/loki/latest/get-started/labels/structured-metadata/ + BatchWait 1 + BatchSize 1001024 + LineFormat json + LogLevel info + AutoKubernetesLabels true +``` ```bash -helm repo add grafana https://grafana.github.io/helm-charts +helm repo add fluent https://fluent.github.io/helm-charts helm repo update -helm upgrade --install fluent-bit grafana/fluent-bit \ - --set loki.serviceName=loki.svc.cluster.local +helm install fluent-bit fluent/fluent-bit -f values.yaml ``` By default it will collect all container logs and extract labels from the Kubernetes API (`container_name`, `namespace`, etc.). -Alternatively you can install the Loki and Fluent Bit all together using: - -```bash -helm upgrade --install loki-stack grafana/loki-stack \ - --set fluent-bit.enabled=true,promtail.enabled=false -``` +If you also want to host your Loki instance inside the cluster, install the [official Loki helm chart](https://grafana.com/docs/loki/latest/setup/install/helm/). ### AWS Elastic Container Service (ECS) diff --git a/docs/sources/setup/upgrade/_index.md b/docs/sources/setup/upgrade/_index.md index 663201820e1e6..84fac5835b31f 100644 --- a/docs/sources/setup/upgrade/_index.md +++ b/docs/sources/setup/upgrade/_index.md @@ -165,6 +165,7 @@ This new metric will provide a more clear signal that there is an issue with ing | `querier.tsdb-max-query-parallelism` | 128 | 512 | - | | `query-scheduler.max-outstanding-requests-per-tenant` | 32000 | 100 | - | | `validation.max-label-names-per-series` | 15 | 30 | - | +| `legacy-read-mode` | false | true | Deprecated. It will be removed in the next minor release.
| {{% /responsive-table %}} #### Write dedupe cache is deprecated diff --git a/integration/client/client.go index dcf2c036dc9e9..2e5a86aa6b3de 100644 --- a/integration/client/client.go +++ b/integration/client/client.go @@ -479,12 +479,21 @@ type Header struct { Name, Value string } -// RunRangeQuery runs a query and returns an error if anything went wrong +// RunRangeQuery runs a 7d query and returns an error if anything went wrong +// This function is kept for backwards compatibility with existing tests. +// Prefer (*Client).RunRangeQueryWithStartEnd() instead. func (c *Client) RunRangeQuery(ctx context.Context, query string, extraHeaders ...Header) (*Response, error) { + end := c.Now.Add(time.Second) + start := c.Now.Add(-7 * 24 * time.Hour) + return c.RunRangeQueryWithStartEnd(ctx, query, start, end, extraHeaders...) +} + +// RunRangeQueryWithStartEnd runs a query between the given start and end times and returns an error if anything went wrong +func (c *Client) RunRangeQueryWithStartEnd(ctx context.Context, query string, start, end time.Time, extraHeaders ...Header) (*Response, error) { ctx, cancelFunc := context.WithTimeout(ctx, requestTimeout) defer cancelFunc() - buf, statusCode, err := c.run(ctx, c.rangeQueryURL(query), extraHeaders...) + buf, statusCode, err := c.run(ctx, c.rangeQueryURL(query, start, end), extraHeaders...) if err != nil { return nil, err } @@ -555,11 +564,11 @@ func (c *Client) parseResponse(buf []byte, statusCode int) (*Response, error) { return &lokiResp, nil } -func (c *Client) rangeQueryURL(query string) string { +func (c *Client) rangeQueryURL(query string, start, end time.Time) string { v := url.Values{} v.Set("query", query) - v.Set("start", formatTS(c.Now.Add(-7*24*time.Hour))) - v.Set("end", formatTS(c.Now.Add(time.Second))) + v.Set("start", formatTS(start)) + v.Set("end", formatTS(end)) u, err := url.Parse(c.baseURL) if err != nil { diff --git a/integration/cluster/cluster.go index 8ddeac00f1782..831da46f2cb99 100644 --- a/integration/cluster/cluster.go +++ b/integration/cluster/cluster.go @@ -43,7 +43,6 @@ server: grpc_server_max_recv_msg_size: 110485813 grpc_server_max_send_msg_size: 110485813 - common: path_prefix: {{.dataPath}} storage: @@ -70,14 +69,25 @@ storage_config: store-1: directory: {{.sharedDataPath}}/fs-store-1 boltdb_shipper: - active_index_directory: {{.dataPath}}/index + active_index_directory: {{.dataPath}}/boltdb-index cache_location: {{.dataPath}}/boltdb-cache tsdb_shipper: active_index_directory: {{.dataPath}}/tsdb-index cache_location: {{.dataPath}}/tsdb-cache + bloom_shipper: + working_directory: {{.dataPath}}/bloom-shipper + blocks_downloading_queue: + workers_count: 1 + +bloom_gateway: + enabled: false + +bloom_compactor: + enabled: false + working_directory: {{.dataPath}}/bloom-compactor compactor: - working_directory: {{.dataPath}}/retention + working_directory: {{.dataPath}}/compactor retention_enabled: true delete_request_store: store-1 @@ -154,14 +164,14 @@ func New(logLevel level.Value, opts ...func(*Cluster)) *Cluster { } resetMetricRegistry() - sharedPath, err := os.MkdirTemp("", "loki-shared-data") + sharedPath, err := os.MkdirTemp("", "loki-shared-data-") if err != nil { panic(err.Error()) } overridesFile := filepath.Join(sharedPath, "loki-overrides.yaml") - err = os.WriteFile(filepath.Join(sharedPath, "loki-overrides.yaml"), []byte(`overrides:`), 0777) + err = os.WriteFile(overridesFile, []byte(`overrides:`), 0777) if err != nil { panic(fmt.Errorf("error creating overrides file: %w", err)) } @@ -318,12
+328,12 @@ func port(addr string) string { func (c *Component) writeConfig() error { var err error - configFile, err := os.CreateTemp("", "loki-config") + configFile, err := os.CreateTemp("", fmt.Sprintf("loki-%s-config-*.yaml", c.name)) if err != nil { return fmt.Errorf("error creating config file: %w", err) } - c.dataPath, err = os.MkdirTemp("", "loki-data") + c.dataPath, err = os.MkdirTemp("", fmt.Sprintf("loki-%s-data-", c.name)) if err != nil { return fmt.Errorf("error creating data path: %w", err) } @@ -408,6 +418,8 @@ func (c *Component) run() error { c.configFile, "-limits.per-user-override-config", c.overridesFile, + "-limits.per-user-override-period", + "1s", ), flagset); err != nil { return err } diff --git a/integration/loki_micro_services_test.go b/integration/loki_micro_services_test.go index a4d03ed10a673..1f7dc836b5ff6 100644 --- a/integration/loki_micro_services_test.go +++ b/integration/loki_micro_services_test.go @@ -3,11 +3,14 @@ package integration import ( "context" "encoding/json" + "fmt" + "math/rand" "strings" "sync" "testing" "time" + "github.com/go-kit/log/level" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "github.com/prometheus/prometheus/model/labels" @@ -1056,6 +1059,174 @@ func TestCategorizedLabels(t *testing.T) { } } +func TestBloomFiltersEndToEnd(t *testing.T) { + commonFlags := []string{ + "-bloom-compactor.compaction-interval=2s", + "-bloom-compactor.enable-compaction=true", + "-bloom-compactor.enabled=true", + "-bloom-gateway.enable-filtering=true", + "-bloom-gateway.enabled=true", + "-compactor.compaction-interval=1s", + "-frontend.default-validity=0s", + "-ingester.flush-on-shutdown=true", + "-ingester.wal-enabled=false", + "-query-scheduler.use-scheduler-ring=false", + "-store.index-cache-read.embedded-cache.enabled=true", + } + + tenantID := randStringRunes() + + clu := cluster.New( + level.DebugValue(), + cluster.SchemaWithTSDB, + func(c *cluster.Cluster) { c.SetSchemaVer("v13") }, + ) + + defer func() { + assert.NoError(t, clu.Cleanup()) + }() + + var ( + tDistributor = clu.AddComponent( + "distributor", + append( + commonFlags, + "-target=distributor", + )..., + ) + tIndexGateway = clu.AddComponent( + "index-gateway", + append( + commonFlags, + "-target=index-gateway", + )..., + ) + _ = clu.AddComponent( + "bloom-gateway", + append( + commonFlags, + "-target=bloom-gateway", + )..., + ) + ) + require.NoError(t, clu.Run()) + + var ( + tIngester = clu.AddComponent( + "ingester", + append( + commonFlags, + "-target=ingester", + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + tQueryScheduler = clu.AddComponent( + "query-scheduler", + append( + commonFlags, + "-target=query-scheduler", + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + tCompactor = clu.AddComponent( + "compactor", + append( + commonFlags, + "-target=compactor", + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + _ = clu.AddComponent( + "bloom-compactor", + append( + commonFlags, + "-target=bloom-compactor", + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + ) + require.NoError(t, clu.Run()) + + // finally, run the query-frontend and querier. 
+ var ( + tQueryFrontend = clu.AddComponent( + "query-frontend", + append( + commonFlags, + "-target=query-frontend", + "-frontend.scheduler-address="+tQueryScheduler.GRPCURL(), + "-common.compactor-address="+tCompactor.HTTPURL(), + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + _ = clu.AddComponent( + "querier", + append( + commonFlags, + "-target=querier", + "-querier.scheduler-address="+tQueryScheduler.GRPCURL(), + "-common.compactor-address="+tCompactor.HTTPURL(), + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + ) + require.NoError(t, clu.Run()) + + now := time.Now() + + cliDistributor := client.New(tenantID, "", tDistributor.HTTPURL()) + cliDistributor.Now = now + + cliIngester := client.New(tenantID, "", tIngester.HTTPURL()) + cliIngester.Now = now + + cliQueryFrontend := client.New(tenantID, "", tQueryFrontend.HTTPURL()) + cliQueryFrontend.Now = now + + cliIndexGateway := client.New(tenantID, "", tIndexGateway.HTTPURL()) + cliIndexGateway.Now = now + + lineTpl := `caller=loki_micro_services_test.go msg="push log line" id="%s"` + // ingest logs from 10 different pods + // each line contains a random, unique string + // that string is used to verify filtering using bloom gateway + uniqueStrings := make([]string, 600) + for i := 0; i < len(uniqueStrings); i++ { + id := randStringRunes() + id = fmt.Sprintf("%s-%d", id, i) + uniqueStrings[i] = id + pod := fmt.Sprintf("pod-%d", i%10) + line := fmt.Sprintf(lineTpl, id) + err := cliDistributor.PushLogLine(line, now.Add(-1*time.Hour).Add(time.Duration(i-len(uniqueStrings))*time.Second), nil, map[string]string{"pod": pod}) + require.NoError(t, err) + } + + // restart ingester to flush chunks and that there are zero chunks in memory + require.NoError(t, cliIngester.Flush()) + require.NoError(t, tIngester.Restart()) + + // wait for compactor to compact index and for bloom compactor to build bloom filters + time.Sleep(10 * time.Second) + + // use bloom gateway to perform needle in the haystack queries + randIdx := rand.Intn(len(uniqueStrings)) + q := fmt.Sprintf(`{job="varlog"} |= "%s"`, uniqueStrings[randIdx]) + end := now.Add(-1 * time.Second) + start := end.Add(-24 * time.Hour) + resp, err := cliQueryFrontend.RunRangeQueryWithStartEnd(context.Background(), q, start, end) + require.NoError(t, err) + + // verify response + require.Len(t, resp.Data.Stream, 1) + expectedLine := fmt.Sprintf(lineTpl, uniqueStrings[randIdx]) + require.Equal(t, expectedLine, resp.Data.Stream[0].Values[0][1]) + + // TODO(chaudum): + // verify that bloom blocks have actually been used for querying + // atm, we can only verify by logs, so we should add appropriate metrics for + // uploaded/downloaded blocks and metas +} + func getValueFromMF(mf *dto.MetricFamily, lbs []*dto.LabelPair) float64 { for _, m := range mf.Metric { if !assert.ObjectsAreEqualValues(lbs, m.GetLabel()) { diff --git a/operator/CHANGELOG.md b/operator/CHANGELOG.md index 9ea61a0dba4e5..ad9b319b625c9 100644 --- a/operator/CHANGELOG.md +++ b/operator/CHANGELOG.md @@ -1,5 +1,7 @@ ## Main +- [11671](https://github.com/grafana/loki/pull/11671) **JoaoBraveCoding**: Update mixins to fix structured metadata dashboards +- [11624](https://github.com/grafana/loki/pull/11624) **xperimental**: React to changes in ConfigMap used for storage CA - [11481](https://github.com/grafana/loki/pull/11481) **JoaoBraveCoding**: Adds AWS STS support - [11533](https://github.com/grafana/loki/pull/11533) **periklis**: Add serviceaccount 
per LokiStack resource - [11158](https://github.com/grafana/loki/pull/11158) **btaani**: operator: Add warning for old schema configuration diff --git a/operator/bundle/community-openshift/manifests/loki-operator.clusterserviceversion.yaml b/operator/bundle/community-openshift/manifests/loki-operator.clusterserviceversion.yaml index 36151790a2099..2915af504fd39 100644 --- a/operator/bundle/community-openshift/manifests/loki-operator.clusterserviceversion.yaml +++ b/operator/bundle/community-openshift/manifests/loki-operator.clusterserviceversion.yaml @@ -150,7 +150,7 @@ metadata: categories: OpenShift Optional, Logging & Tracing certified: "false" containerImage: docker.io/grafana/loki-operator:0.5.0 - createdAt: "2023-12-12T09:22:19Z" + createdAt: "2024-01-10T18:25:00Z" description: The Community Loki Operator provides Kubernetes native deployment and management of Loki and related logging components. features.operators.openshift.io/disconnected: "true" @@ -1591,6 +1591,12 @@ spec: - alertmanagers verbs: - patch + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers/api + verbs: + - create - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/bundle/community/manifests/loki-operator.clusterserviceversion.yaml b/operator/bundle/community/manifests/loki-operator.clusterserviceversion.yaml index 322bc606611f3..b78b8f6d30b98 100644 --- a/operator/bundle/community/manifests/loki-operator.clusterserviceversion.yaml +++ b/operator/bundle/community/manifests/loki-operator.clusterserviceversion.yaml @@ -150,7 +150,7 @@ metadata: categories: OpenShift Optional, Logging & Tracing certified: "false" containerImage: docker.io/grafana/loki-operator:0.5.0 - createdAt: "2023-12-12T09:22:17Z" + createdAt: "2024-01-10T18:24:59Z" description: The Community Loki Operator provides Kubernetes native deployment and management of Loki and related logging components. operators.operatorframework.io/builder: operator-sdk-unknown @@ -1571,6 +1571,12 @@ spec: - alertmanagers verbs: - patch + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers/api + verbs: + - create - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/bundle/openshift/manifests/loki-operator.clusterserviceversion.yaml b/operator/bundle/openshift/manifests/loki-operator.clusterserviceversion.yaml index f4a951400e946..b0fca996ce78f 100644 --- a/operator/bundle/openshift/manifests/loki-operator.clusterserviceversion.yaml +++ b/operator/bundle/openshift/manifests/loki-operator.clusterserviceversion.yaml @@ -150,7 +150,7 @@ metadata: categories: OpenShift Optional, Logging & Tracing certified: "false" containerImage: quay.io/openshift-logging/loki-operator:0.1.0 - createdAt: "2023-12-12T09:22:21Z" + createdAt: "2024-01-10T18:25:02Z" description: | The Loki Operator for OCP provides a means for configuring and managing a Loki stack for cluster logging. 
## Prerequisites and Requirements @@ -1576,6 +1576,12 @@ spec: - alertmanagers verbs: - patch + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers/api + verbs: + - create - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/config/rbac/role.yaml b/operator/config/rbac/role.yaml index d7b881ef8e33d..09dc60b8c33b9 100644 --- a/operator/config/rbac/role.yaml +++ b/operator/config/rbac/role.yaml @@ -175,6 +175,12 @@ rules: - alertmanagers verbs: - patch +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers/api + verbs: + - create - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/controllers/loki/lokistack_controller.go b/operator/controllers/loki/lokistack_controller.go index 49b5bdab069e0..629ee85d5edd7 100644 --- a/operator/controllers/loki/lokistack_controller.go +++ b/operator/controllers/loki/lokistack_controller.go @@ -94,12 +94,7 @@ var ( }) createUpdateOrDeletePred = builder.WithPredicates(predicate.Funcs{ UpdateFunc: func(e event.UpdateEvent) bool { - if e.ObjectOld.GetGeneration() == 0 && len(e.ObjectOld.GetAnnotations()) == 0 { - return e.ObjectOld.GetResourceVersion() != e.ObjectNew.GetResourceVersion() - } - - return e.ObjectOld.GetGeneration() != e.ObjectNew.GetGeneration() || - cmp.Diff(e.ObjectOld.GetAnnotations(), e.ObjectNew.GetAnnotations()) != "" + return e.ObjectOld.GetResourceVersion() != e.ObjectNew.GetResourceVersion() }, CreateFunc: func(e event.CreateEvent) bool { return true }, DeleteFunc: func(e event.DeleteEvent) bool { return true }, @@ -123,6 +118,7 @@ type LokiStackReconciler struct { // +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings;clusterroles;roles;rolebindings,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=get;list;watch;create;update;delete // +kubebuilder:rbac:groups=monitoring.coreos.com,resources=alertmanagers,verbs=patch +// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=alertmanagers/api,verbs=create // +kubebuilder:rbac:urls=/api/v2/alerts,verbs=create // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update // +kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses,verbs=get;list;watch;create;update @@ -206,7 +202,8 @@ func (r *LokiStackReconciler) buildController(bld k8s.Builder) error { Owns(&rbacv1.Role{}, updateOrDeleteOnlyPred). Owns(&rbacv1.RoleBinding{}, updateOrDeleteOnlyPred). Watches(&corev1.Service{}, r.enqueueForAlertManagerServices(), createUpdateOrDeletePred). - Watches(&corev1.Secret{}, r.enqueueForStorageSecret(), createUpdateOrDeletePred) + Watches(&corev1.Secret{}, r.enqueueForStorageSecret(), createUpdateOrDeletePred). 
+ Watches(&corev1.ConfigMap{}, r.enqueueForStorageCA(), createUpdateOrDeletePred) if r.FeatureGates.LokiStackAlerts { bld = bld.Owns(&monitoringv1.PrometheusRule{}, updateOrDeleteOnlyPred) @@ -323,3 +320,35 @@ func (r *LokiStackReconciler) enqueueForStorageSecret() handler.EventHandler { return requests }) } + +func (r *LokiStackReconciler) enqueueForStorageCA() handler.EventHandler { + return handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []reconcile.Request { + lokiStacks := &lokiv1.LokiStackList{} + if err := r.Client.List(ctx, lokiStacks, client.InNamespace(obj.GetNamespace())); err != nil { + r.Log.Error(err, "Error listing LokiStack resources for storage CA update") + return nil + } + + var requests []reconcile.Request + for _, stack := range lokiStacks.Items { + if stack.Spec.Storage.TLS == nil { + continue + } + + storageTLS := stack.Spec.Storage.TLS + if obj.GetName() != storageTLS.CA { + continue + } + + requests = append(requests, reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: stack.Namespace, + Name: stack.Name, + }, + }) + r.Log.Info("Enqueued request for LokiStack because of Storage CA resource change", "LokiStack", stack.Name, "ConfigMap", obj.GetName()) + } + + return requests + }) +} diff --git a/operator/controllers/loki/lokistack_controller_test.go b/operator/controllers/loki/lokistack_controller_test.go index d8eae5a1ec66f..7421b63331b5d 100644 --- a/operator/controllers/loki/lokistack_controller_test.go +++ b/operator/controllers/loki/lokistack_controller_test.go @@ -203,8 +203,8 @@ func TestLokiStackController_RegisterWatchedResources(t *testing.T) { table := []test{ { src: &openshiftconfigv1.APIServer{}, - index: 2, - watchesCallsCount: 3, + index: 3, + watchesCallsCount: 4, featureGates: configv1.FeatureGates{ OpenShift: configv1.OpenShiftFeatureGates{ ClusterTLSPolicy: true, @@ -214,8 +214,8 @@ func TestLokiStackController_RegisterWatchedResources(t *testing.T) { }, { src: &openshiftconfigv1.Proxy{}, - index: 2, - watchesCallsCount: 3, + index: 3, + watchesCallsCount: 4, featureGates: configv1.FeatureGates{ OpenShift: configv1.OpenShiftFeatureGates{ ClusterProxy: true, @@ -226,14 +226,21 @@ func TestLokiStackController_RegisterWatchedResources(t *testing.T) { { src: &corev1.Service{}, index: 0, - watchesCallsCount: 2, + watchesCallsCount: 3, featureGates: configv1.FeatureGates{}, pred: createUpdateOrDeletePred, }, { src: &corev1.Secret{}, index: 1, - watchesCallsCount: 2, + watchesCallsCount: 3, + featureGates: configv1.FeatureGates{}, + pred: createUpdateOrDeletePred, + }, + { + src: &corev1.ConfigMap{}, + index: 2, + watchesCallsCount: 3, featureGates: configv1.FeatureGates{}, pred: createUpdateOrDeletePred, }, diff --git a/operator/internal/handlers/internal/gateway/base_domain.go b/operator/internal/handlers/internal/gateway/base_domain.go index 5bdea31658d1d..893659ca5d29b 100644 --- a/operator/internal/handlers/internal/gateway/base_domain.go +++ b/operator/internal/handlers/internal/gateway/base_domain.go @@ -6,7 +6,6 @@ import ( "github.com/ViaQ/logerr/v2/kverrors" configv1 "github.com/openshift/api/config/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" @@ -14,11 +13,11 @@ import ( "github.com/grafana/loki/operator/internal/status" ) -// GetOpenShiftBaseDomain returns the cluster DNS base domain on OpenShift +// getOpenShiftBaseDomain returns the cluster DNS 
base domain on OpenShift // clusters to auto-create redirect URLs for OpenShift Auth or an error. // If the config.openshift.io/DNS object is not found the whole lokistack // resoure is set to a degraded state. -func GetOpenShiftBaseDomain(ctx context.Context, k k8s.Client, req ctrl.Request) (string, error) { +func getOpenShiftBaseDomain(ctx context.Context, k k8s.Client) (string, error) { var cluster configv1.DNS key := client.ObjectKey{Name: "cluster"} if err := k.Get(ctx, key, &cluster); err != nil { diff --git a/operator/internal/handlers/internal/gateway/gateway.go b/operator/internal/handlers/internal/gateway/gateway.go new file mode 100644 index 0000000000000..0b05801f2e9aa --- /dev/null +++ b/operator/internal/handlers/internal/gateway/gateway.go @@ -0,0 +1,87 @@ +package gateway + +import ( + "context" + "fmt" + + "github.com/go-logr/logr" + + configv1 "github.com/grafana/loki/operator/apis/config/v1" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/handlers/internal/openshift" + "github.com/grafana/loki/operator/internal/manifests" + "github.com/grafana/loki/operator/internal/status" +) + +// BuildOptions returns the options needed to generate Kubernetes resource +// manifests for the lokistack-gateway. +// The returned error can be a status.DegradedError in the following cases: +// - The tenants spec is missing. +// - The tenants spec is invalid. +func BuildOptions(ctx context.Context, log logr.Logger, k k8s.Client, stack *lokiv1.LokiStack, fg configv1.FeatureGates) (string, manifests.Tenants, error) { + var ( + err error + baseDomain string + secrets []*manifests.TenantSecrets + configs map[string]manifests.TenantConfig + tenants manifests.Tenants + ) + + if !fg.LokiStackGateway { + return "", tenants, nil + } + + if stack.Spec.Tenants == nil { + return "", tenants, &status.DegradedError{ + Message: "Invalid tenants configuration: TenantsSpec cannot be nil when gateway flag is enabled", + Reason: lokiv1.ReasonInvalidTenantsConfiguration, + Requeue: false, + } + } + + if err = validateModes(stack); err != nil { + return "", tenants, &status.DegradedError{ + Message: fmt.Sprintf("Invalid tenants configuration: %s", err), + Reason: lokiv1.ReasonInvalidTenantsConfiguration, + Requeue: false, + } + } + + switch stack.Spec.Tenants.Mode { + case lokiv1.OpenshiftLogging, lokiv1.OpenshiftNetwork: + baseDomain, err = getOpenShiftBaseDomain(ctx, k) + if err != nil { + return "", tenants, err + } + + if stack.Spec.Proxy == nil { + // If the LokiStack has no proxy set but there is a cluster-wide proxy setting, + // set the LokiStack proxy to that. + ocpProxy, proxyErr := openshift.GetProxy(ctx, k) + if proxyErr != nil { + return "", tenants, proxyErr + } + + stack.Spec.Proxy = ocpProxy + } + default: + secrets, err = getTenantSecrets(ctx, k, stack) + if err != nil { + return "", tenants, err + } + } + + // extract the existing tenant's id, cookieSecret if exists, otherwise create new. 
+ configs, err = getTenantConfigFromSecret(ctx, k, stack) + if err != nil { + log.Error(err, "error in getting tenant secret data") + } + + tenants = manifests.Tenants{ + Secrets: secrets, + Configs: configs, + } + + return baseDomain, tenants, nil +} diff --git a/operator/internal/handlers/internal/gateway/gateway_test.go b/operator/internal/handlers/internal/gateway/gateway_test.go new file mode 100644 index 0000000000000..2c8f846f55825 --- /dev/null +++ b/operator/internal/handlers/internal/gateway/gateway_test.go @@ -0,0 +1,390 @@ +package gateway + +import ( + "context" + "io" + "testing" + + "github.com/ViaQ/logerr/v2/log" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + configv1 "github.com/grafana/loki/operator/apis/config/v1" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" + "github.com/grafana/loki/operator/internal/status" +) + +var ( + logger = log.NewLogger("testing", log.WithOutput(io.Discard)) + + defaultSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "endpoint": []byte("s3://your-endpoint"), + "region": []byte("a-region"), + "bucketnames": []byte("bucket1,bucket2"), + "access_key_id": []byte("a-secret-id"), + "access_key_secret": []byte("a-secret-key"), + }, + } + + defaultGatewaySecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-gateway-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "clientID": []byte("client-123"), + "clientSecret": []byte("client-secret-xyz"), + "issuerCAPath": []byte("/tmp/test/ca.pem"), + }, + } + + invalidSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{}, + } +) + +func TestBuildOptions_WhenInvalidTenantsConfiguration_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid tenants configuration: mandatory configuration - missing OPA Url", + Reason: lokiv1.ReasonInvalidTenantsConfiguration, + Requeue: false, + } + + fg := configv1.FeatureGates{ + LokiStackGateway: true, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: nil, + }, + }, + } + + k.GetStub = func(_ 
context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, err := BuildOptions(context.TODO(), logger, k, stack, fg) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenMissingGatewaySecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing secrets for tenant test", + Reason: lokiv1.ReasonMissingGatewayTenantSecret, + Requeue: true, + } + + fg := configv1.FeatureGates{ + LokiStackGateway: true, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + o, ok := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && ok { + k.SetClientObject(o, stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, err := BuildOptions(context.TODO(), logger, k, stack, fg) + + // make sure error is returned to re-trigger reconciliation + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenInvalidGatewaySecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid gateway tenant secret contents", + Reason: lokiv1.ReasonInvalidGatewayTenantSecret, + Requeue: true, + } + + fg := configv1.FeatureGates{ + LokiStackGateway: true, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + 
Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: invalidSecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + o, ok := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && ok { + k.SetClientObject(o, stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + if name.Name == invalidSecret.Name { + k.SetClientObject(object, &invalidSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, err := BuildOptions(context.TODO(), logger, k, stack, fg) + + // make sure error is returned to re-trigger reconciliation + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_MissingTenantsSpec_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid tenants configuration: TenantsSpec cannot be nil when gateway flag is enabled", + Reason: lokiv1.ReasonInvalidTenantsConfiguration, + Requeue: false, + } + + fg := configv1.FeatureGates{ + LokiStackGateway: true, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Tenants: nil, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + o, ok := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && ok { + k.SetClientObject(o, stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, err := BuildOptions(context.TODO(), logger, k, stack, fg) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} diff --git a/operator/internal/handlers/internal/gateway/modes.go b/operator/internal/handlers/internal/gateway/modes.go index fd6bf5fae3515..8fd9855b352dc 100644 --- a/operator/internal/handlers/internal/gateway/modes.go +++ 
b/operator/internal/handlers/internal/gateway/modes.go @@ -6,8 +6,7 @@ import ( lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" ) -// ValidateModes validates the tenants mode specification. -func ValidateModes(stack lokiv1.LokiStack) error { +func validateModes(stack *lokiv1.LokiStack) error { if stack.Spec.Tenants.Mode == lokiv1.Static { if stack.Spec.Tenants.Authentication == nil { return kverrors.New("mandatory configuration - missing tenants' authentication configuration") diff --git a/operator/internal/handlers/internal/gateway/modes_test.go b/operator/internal/handlers/internal/gateway/modes_test.go index f54d348f6b25f..f7899c1eae85c 100644 --- a/operator/internal/handlers/internal/gateway/modes_test.go +++ b/operator/internal/handlers/internal/gateway/modes_test.go @@ -13,13 +13,13 @@ func TestValidateModes_StaticMode(t *testing.T) { type test struct { name string wantErr string - stack lokiv1.LokiStack + stack *lokiv1.LokiStack } table := []test{ { name: "missing authentication spec", wantErr: "mandatory configuration - missing tenants' authentication configuration", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -39,7 +39,7 @@ func TestValidateModes_StaticMode(t *testing.T) { { name: "missing roles spec", wantErr: "mandatory configuration - missing roles configuration", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -74,7 +74,7 @@ func TestValidateModes_StaticMode(t *testing.T) { { name: "missing role bindings spec", wantErr: "mandatory configuration - missing role bindings configuration", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -117,7 +117,7 @@ func TestValidateModes_StaticMode(t *testing.T) { { name: "incompatible OPA URL provided", wantErr: "incompatible configuration - OPA URL not required for mode static", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -174,7 +174,7 @@ func TestValidateModes_StaticMode(t *testing.T) { { name: "all set", wantErr: "", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -231,7 +231,7 @@ func TestValidateModes_StaticMode(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - err := ValidateModes(tst.stack) + err := validateModes(tst.stack) if tst.wantErr != "" { require.EqualError(t, err, tst.wantErr) } @@ -243,13 +243,13 @@ func TestValidateModes_DynamicMode(t *testing.T) { type test struct { name string wantErr string - stack lokiv1.LokiStack + stack *lokiv1.LokiStack } table := []test{ { name: "missing authentication spec", wantErr: "mandatory configuration - missing tenants configuration", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -269,7 +269,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { { name: "missing OPA URL spec", wantErr: "mandatory configuration - missing OPA Url", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -304,7 +304,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { { name: "incompatible roles configuration provided", wantErr: "incompatible configuration - static roles not required for mode dynamic", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -349,7 +349,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { { 
name: "incompatible roleBindings configuration provided", wantErr: "incompatible configuration - static roleBindings not required for mode dynamic", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -398,7 +398,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { { name: "all set", wantErr: "", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -438,7 +438,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - err := ValidateModes(tst.stack) + err := validateModes(tst.stack) if tst.wantErr != "" { require.EqualError(t, err, tst.wantErr) } @@ -450,13 +450,13 @@ func TestValidateModes_OpenshiftLoggingMode(t *testing.T) { type test struct { name string wantErr string - stack lokiv1.LokiStack + stack *lokiv1.LokiStack } table := []test{ { name: "incompatible authentication spec provided", wantErr: "incompatible configuration - custom tenants configuration not required", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -488,7 +488,7 @@ func TestValidateModes_OpenshiftLoggingMode(t *testing.T) { { name: "incompatible authorization spec provided", wantErr: "incompatible configuration - custom tenants configuration not required", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -514,7 +514,7 @@ func TestValidateModes_OpenshiftLoggingMode(t *testing.T) { { name: "all set", wantErr: "", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -537,7 +537,7 @@ func TestValidateModes_OpenshiftLoggingMode(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - err := ValidateModes(tst.stack) + err := validateModes(tst.stack) if tst.wantErr != "" { require.EqualError(t, err, tst.wantErr) } diff --git a/operator/internal/handlers/internal/gateway/tenant_configsecret.go b/operator/internal/handlers/internal/gateway/tenant_configsecret.go index c5b06c9c5c87b..f4e6c493bc069 100644 --- a/operator/internal/handlers/internal/gateway/tenant_configsecret.go +++ b/operator/internal/handlers/internal/gateway/tenant_configsecret.go @@ -6,10 +6,10 @@ import ( "github.com/ViaQ/logerr/v2/kverrors" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/json" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/yaml" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/manifests" ) @@ -35,11 +35,11 @@ type openShiftSpec struct { CookieSecret string `json:"cookieSecret"` } -// GetTenantConfigSecretData returns the tenantName, tenantId, cookieSecret +// getTenantConfigFromSecret returns the tenantName, tenantId, cookieSecret // clusters to auto-create redirect URLs for OpenShift Auth or an error. 
-func GetTenantConfigSecretData(ctx context.Context, k k8s.Client, req ctrl.Request) (map[string]manifests.TenantConfig, error) { +func getTenantConfigFromSecret(ctx context.Context, k k8s.Client, stack *lokiv1.LokiStack) (map[string]manifests.TenantConfig, error) { var tenantSecret corev1.Secret - key := client.ObjectKey{Name: manifests.GatewayName(req.Name), Namespace: req.Namespace} + key := client.ObjectKey{Name: manifests.GatewayName(stack.Name), Namespace: stack.Namespace} if err := k.Get(ctx, key, &tenantSecret); err != nil { return nil, kverrors.Wrap(err, "couldn't find tenant secret.") } diff --git a/operator/internal/handlers/internal/gateway/tenant_configsecret_test.go b/operator/internal/handlers/internal/gateway/tenant_configsecret_test.go index f0035a89a16ff..15e85a2295465 100644 --- a/operator/internal/handlers/internal/gateway/tenant_configsecret_test.go +++ b/operator/internal/handlers/internal/gateway/tenant_configsecret_test.go @@ -10,9 +10,9 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" "github.com/grafana/loki/operator/internal/manifests" ) @@ -38,8 +38,8 @@ tenants: func TestGetTenantConfigSecretData_SecretExist(t *testing.T) { k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ + s := &lokiv1.LokiStack{ + ObjectMeta: metav1.ObjectMeta{ Name: "lokistack-dev", Namespace: "some-ns", }, @@ -60,7 +60,7 @@ func TestGetTenantConfigSecretData_SecretExist(t *testing.T) { return nil } - ts, err := GetTenantConfigSecretData(context.TODO(), k, r) + ts, err := getTenantConfigFromSecret(context.TODO(), k, s) require.NotNil(t, ts) require.NoError(t, err) @@ -86,8 +86,8 @@ func TestGetTenantConfigSecretData_SecretExist(t *testing.T) { func TestGetTenantConfigSecretData_SecretNotExist(t *testing.T) { k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ + s := &lokiv1.LokiStack{ + ObjectMeta: metav1.ObjectMeta{ Name: "lokistack-dev", Namespace: "some-ns", }, @@ -97,7 +97,7 @@ func TestGetTenantConfigSecretData_SecretNotExist(t *testing.T) { return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") } - ts, err := GetTenantConfigSecretData(context.TODO(), k, r) + ts, err := getTenantConfigFromSecret(context.TODO(), k, s) require.Nil(t, ts) require.Error(t, err) } diff --git a/operator/internal/handlers/internal/gateway/tenant_secrets.go b/operator/internal/handlers/internal/gateway/tenant_secrets.go index fd2775dfa06ac..6cc39ae05e254 100644 --- a/operator/internal/handlers/internal/gateway/tenant_secrets.go +++ b/operator/internal/handlers/internal/gateway/tenant_secrets.go @@ -7,7 +7,6 @@ import ( "github.com/ViaQ/logerr/v2/kverrors" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" @@ -16,14 +15,13 @@ import ( "github.com/grafana/loki/operator/internal/status" ) -// GetTenantSecrets returns the list to gateway tenant secrets for a tenant mode. +// getTenantSecrets returns the list to gateway tenant secrets for a tenant mode. // For modes static and dynamic the secrets are fetched from external provided // secrets. 
For modes openshift-logging and openshift-network a secret per default tenants are created. // All secrets live in the same namespace as the lokistack request. -func GetTenantSecrets( +func getTenantSecrets( ctx context.Context, k k8s.Client, - req ctrl.Request, stack *lokiv1.LokiStack, ) ([]*manifests.TenantSecrets, error) { var ( @@ -34,7 +32,7 @@ func GetTenantSecrets( for _, tenant := range stack.Spec.Tenants.Authentication { switch { case tenant.OIDC != nil: - key := client.ObjectKey{Name: tenant.OIDC.Secret.Name, Namespace: req.Namespace} + key := client.ObjectKey{Name: tenant.OIDC.Secret.Name, Namespace: stack.Namespace} if err := k.Get(ctx, key, &gatewaySecret); err != nil { if apierrors.IsNotFound(err) { return nil, &status.DegradedError{ @@ -60,7 +58,7 @@ func GetTenantSecrets( OIDCSecret: oidcSecret, } if tenant.OIDC.IssuerCA != nil { - caPath, err := extractCAPath(ctx, k, req.Namespace, tenant.TenantName, tenant.OIDC.IssuerCA) + caPath, err := extractCAPath(ctx, k, stack.Namespace, tenant.TenantName, tenant.OIDC.IssuerCA) if err != nil { return nil, err } @@ -68,7 +66,7 @@ func GetTenantSecrets( } tenantSecrets = append(tenantSecrets, tennantSecret) case tenant.MTLS != nil: - caPath, err := extractCAPath(ctx, k, req.Namespace, tenant.TenantName, tenant.MTLS.CA) + caPath, err := extractCAPath(ctx, k, stack.Namespace, tenant.TenantName, tenant.MTLS.CA) if err != nil { return nil, err } diff --git a/operator/internal/handlers/internal/gateway/tenant_secrets_test.go b/operator/internal/handlers/internal/gateway/tenant_secrets_test.go index d0292108d8290..d0ccc0962e0b9 100644 --- a/operator/internal/handlers/internal/gateway/tenant_secrets_test.go +++ b/operator/internal/handlers/internal/gateway/tenant_secrets_test.go @@ -9,7 +9,6 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" @@ -93,13 +92,6 @@ func TestGetTenantSecrets(t *testing.T) { } { t.Run(strings.Join([]string{string(mode), tc.name}, "_"), func(t *testing.T) { k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - s := &lokiv1.LokiStack{ ObjectMeta: metav1.ObjectMeta{ Name: "mystack", @@ -119,7 +111,7 @@ func TestGetTenantSecrets(t *testing.T) { } return nil } - ts, err := GetTenantSecrets(context.TODO(), k, r, s) + ts, err := getTenantSecrets(context.TODO(), k, s) require.NoError(t, err) require.ElementsMatch(t, ts, tc.expected) }) diff --git a/operator/internal/handlers/internal/rules/cleanup.go b/operator/internal/handlers/internal/rules/cleanup.go index 81805947efddf..abd5bacd5c032 100644 --- a/operator/internal/handlers/internal/rules/cleanup.go +++ b/operator/internal/handlers/internal/rules/cleanup.go @@ -4,25 +4,49 @@ import ( "context" "github.com/ViaQ/logerr/v2/kverrors" + "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + v1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/manifests" ) -// RemoveRulesConfigMap removes the rules configmaps if any exists. 
-func RemoveRulesConfigMap(ctx context.Context, req ctrl.Request, c client.Client) error { +// Cleanup removes the ruler component's statefulset and configmaps if available, or +// else it returns an error to retry the reconciliation loop. +func Cleanup(ctx context.Context, log logr.Logger, k k8s.Client, stack *v1.LokiStack) error { + if stack.Spec.Rules != nil && stack.Spec.Rules.Enabled { + return nil + } + + stackKey := client.ObjectKeyFromObject(stack) + + // Clean up ruler resources + if err := removeRulesConfigMap(ctx, k, stackKey); err != nil { + log.Error(err, "failed to remove rules ConfigMap") + return err + } + + if err := removeRuler(ctx, k, stackKey); err != nil { + log.Error(err, "failed to remove ruler StatefulSet") + return err + } + + return nil +} + +func removeRulesConfigMap(ctx context.Context, c client.Client, key client.ObjectKey) error { var rulesCmList corev1.ConfigMapList err := c.List(ctx, &rulesCmList, &client.ListOptions{ - Namespace: req.Namespace, + Namespace: key.Namespace, LabelSelector: labels.SelectorFromSet(labels.Set{ "app.kubernetes.io/component": manifests.LabelRulerComponent, - "app.kubernetes.io/instance": req.Name, + "app.kubernetes.io/instance": key.Name, }), }) if err != nil { @@ -41,10 +65,9 @@ func RemoveRulesConfigMap(ctx context.Context, req ctrl.Request, c client.Client return nil } -// RemoveRuler removes the ruler statefulset if it exists. -func RemoveRuler(ctx context.Context, req ctrl.Request, c client.Client) error { +func removeRuler(ctx context.Context, c client.Client, stack client.ObjectKey) error { // Check if the Statefulset exists before proceeding. - key := client.ObjectKey{Name: manifests.RulerName(req.Name), Namespace: req.Namespace} + key := client.ObjectKey{Name: manifests.RulerName(stack.Name), Namespace: stack.Namespace} var ruler appsv1.StatefulSet if err := c.Get(ctx, key, &ruler); err != nil { diff --git a/operator/internal/handlers/internal/rules/cleanup_test.go b/operator/internal/handlers/internal/rules/cleanup_test.go new file mode 100644 index 0000000000000..5ecd9f69c345f --- /dev/null +++ b/operator/internal/handlers/internal/rules/cleanup_test.go @@ -0,0 +1,223 @@ +package rules + +import ( + "context" + "io" + "testing" + + "github.com/ViaQ/logerr/v2/log" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" +) + +var ( + logger = log.NewLogger("testing", log.WithOutput(io.Discard)) + + defaultSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "endpoint": []byte("s3://your-endpoint"), + "region": []byte("a-region"), + "bucketnames": []byte("bucket1,bucket2"), + "access_key_id": []byte("a-secret-id"), + "access_key_secret": []byte("a-secret-key"), + }, + } + + defaultGatewaySecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-gateway-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "clientID": []byte("client-123"), + "clientSecret": []byte("client-secret-xyz"), + "issuerCAPath": []byte("/tmp/test/ca.pem"), + }, + } + + 
rulesCM = corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack-rules-0", + Namespace: "some-ns", + }, + } + + rulerSS = appsv1.StatefulSet{ + TypeMeta: metav1.TypeMeta{ + Kind: "StatefulSet", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack-ruler", + Namespace: "some-ns", + }, + } +) + +func TestCleanup_RemovesRulerResourcesWhenDisabled(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + stack := lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Rules: &lokiv1.RulesSpec{ + Enabled: true, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { + _, ok := out.(*lokiv1.RulerConfig) + if ok { + return apierrors.NewNotFound(schema.GroupResource{}, "no ruler config") + } + + _, isLokiStack := out.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(out, &stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(out, &defaultSecret) + return nil + } + if defaultGatewaySecret.Name == name.Name { + k.SetClientObject(out, &defaultGatewaySecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") + } + + k.CreateStub = func(_ context.Context, o client.Object, _ ...client.CreateOption) error { + assert.Equal(t, r.Namespace, o.GetNamespace()) + return nil + } + + k.StatusStub = func() client.StatusWriter { return sw } + + k.DeleteStub = func(_ context.Context, o client.Object, _ ...client.DeleteOption) error { + assert.Equal(t, r.Namespace, o.GetNamespace()) + return nil + } + + k.ListStub = func(_ context.Context, list client.ObjectList, options ...client.ListOption) error { + switch list.(type) { + case *corev1.ConfigMapList: + k.SetClientObjectList(list, &corev1.ConfigMapList{ + Items: []corev1.ConfigMap{ + rulesCM, + }, + }) + } + return nil + } + + err := Cleanup(context.TODO(), logger, k, &stack) + require.NoError(t, err) + + // make sure delete not called + require.Zero(t, k.DeleteCallCount()) + + // Disable the ruler + stack.Spec.Rules.Enabled = false + + // Get should return ruler resources + k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { + _, ok := out.(*lokiv1.RulerConfig) + if ok { + return apierrors.NewNotFound(schema.GroupResource{}, "no ruler config") + } + if rulesCM.Name == name.Name { + k.SetClientObject(out, &rulesCM) + return nil + } + if 
rulerSS.Name == name.Name { + k.SetClientObject(out, &rulerSS) + return nil + } + + _, isLokiStack := out.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(out, &stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(out, &defaultSecret) + return nil + } + if defaultGatewaySecret.Name == name.Name { + k.SetClientObject(out, &defaultGatewaySecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") + } + + err = Cleanup(context.TODO(), logger, k, &stack) + require.NoError(t, err) + + // make sure delete was called twice (delete rules configmap and ruler statefulset) + require.Equal(t, 2, k.DeleteCallCount()) +} diff --git a/operator/internal/handlers/internal/rules/config.go b/operator/internal/handlers/internal/rules/config.go index f66b92ee06c11..ec4413fc49ecd 100644 --- a/operator/internal/handlers/internal/rules/config.go +++ b/operator/internal/handlers/internal/rules/config.go @@ -5,19 +5,16 @@ import ( "github.com/ViaQ/logerr/v2/kverrors" apierrors "k8s.io/apimachinery/pkg/api/errors" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s" ) -// GetRulerConfig returns the ruler config spec for a lokistack resource or an error. +// getRulerConfig returns the ruler config spec for a lokistack resource or an error. // If the config is not found, we skip without an error. -func GetRulerConfig(ctx context.Context, k k8s.Client, req ctrl.Request) (*lokiv1.RulerConfigSpec, error) { +func getRulerConfig(ctx context.Context, k k8s.Client, key client.ObjectKey) (*lokiv1.RulerConfigSpec, error) { var rc lokiv1.RulerConfig - - key := client.ObjectKey{Name: req.Name, Namespace: req.Namespace} if err := k.Get(ctx, key, &rc); err != nil { if apierrors.IsNotFound(err) { return nil, nil diff --git a/operator/internal/handlers/internal/rules/rules.go b/operator/internal/handlers/internal/rules/rules.go index ac4a6d78f0306..e21335e98c095 100644 --- a/operator/internal/handlers/internal/rules/rules.go +++ b/operator/internal/handlers/internal/rules/rules.go @@ -4,20 +4,114 @@ import ( "context" "github.com/ViaQ/logerr/v2/kverrors" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/handlers/internal/openshift" + "github.com/grafana/loki/operator/internal/manifests" + manifestsocp "github.com/grafana/loki/operator/internal/manifests/openshift" + "github.com/grafana/loki/operator/internal/status" ) -// List returns a slice of AlertingRules and a slice of RecordingRules for the given spec or an error. Three cases apply: -// - Return only matching rules in the stack namespace if no namespace selector given. -// - Return only matching rules in the stack namespace and in namespaces matching the namespace selector. -// - Return no rules if rules selector does not apply at all. -func List(ctx context.Context, k k8s.Client, stackNs string, rs *lokiv1.RulesSpec) ([]lokiv1.AlertingRule, []lokiv1.RecordingRule, error) { +// BuildOptions returns the ruler options needed to generate Kubernetes resource manifests. 
+// The returned error can be a status.DegradedError in the following cases: +// - When remote write is enabled and the authorization Secret is missing. +// - When remote write is enabled and the authorization Secret data is invalid. +func BuildOptions( + ctx context.Context, + log logr.Logger, + k k8s.Client, + stack *lokiv1.LokiStack, +) ([]lokiv1.AlertingRule, []lokiv1.RecordingRule, manifests.Ruler, manifestsocp.Options, error) { + if stack.Spec.Rules == nil || !stack.Spec.Rules.Enabled { + return nil, nil, manifests.Ruler{}, manifestsocp.Options{}, nil + } + + var ( + err error + alertingRules []lokiv1.AlertingRule + recordingRules []lokiv1.RecordingRule + rulerConfig *lokiv1.RulerConfigSpec + rulerSecret *manifests.RulerSecret + ruler manifests.Ruler + ocpOpts manifestsocp.Options + + stackKey = client.ObjectKeyFromObject(stack) + ) + + alertingRules, recordingRules, err = listRules(ctx, k, stack.Namespace, stack.Spec.Rules) + if err != nil { + log.Error(err, "failed to lookup rules", "spec", stack.Spec.Rules) + } + + rulerConfig, err = getRulerConfig(ctx, k, stackKey) + if err != nil { + log.Error(err, "failed to lookup ruler config", "key", stackKey) + } + + if rulerConfig != nil && rulerConfig.RemoteWriteSpec != nil && rulerConfig.RemoteWriteSpec.ClientSpec != nil { + var rs corev1.Secret + key := client.ObjectKey{Name: rulerConfig.RemoteWriteSpec.ClientSpec.AuthorizationSecretName, Namespace: stack.Namespace} + if err = k.Get(ctx, key, &rs); err != nil { + if apierrors.IsNotFound(err) { + return nil, nil, ruler, ocpOpts, &status.DegradedError{ + Message: "Missing ruler remote write authorization secret", + Reason: lokiv1.ReasonMissingRulerSecret, + Requeue: false, + } + } + return nil, nil, ruler, ocpOpts, kverrors.Wrap(err, "failed to lookup lokistack ruler secret", "name", key) + } + + rulerSecret, err = ExtractRulerSecret(&rs, rulerConfig.RemoteWriteSpec.ClientSpec.AuthorizationType) + if err != nil { + return nil, nil, ruler, ocpOpts, &status.DegradedError{ + Message: "Invalid ruler remote write authorization secret contents", + Reason: lokiv1.ReasonInvalidRulerSecret, + Requeue: false, + } + } + } + + ocpAmEnabled, err := openshift.AlertManagerSVCExists(ctx, stack.Spec, k) + if err != nil { + log.Error(err, "failed to check OCP AlertManager") + return nil, nil, ruler, ocpOpts, err + } + + ocpUWAmEnabled, err := openshift.UserWorkloadAlertManagerSVCExists(ctx, stack.Spec, k) + if err != nil { + log.Error(err, "failed to check OCP User Workload AlertManager") + return nil, nil, ruler, ocpOpts, err + } + + ruler = manifests.Ruler{ + Spec: rulerConfig, + Secret: rulerSecret, + } + + ocpOpts = manifestsocp.Options{ + BuildOpts: manifestsocp.BuildOptions{ + AlertManagerEnabled: ocpAmEnabled, + UserWorkloadAlertManagerEnabled: ocpUWAmEnabled, + }, + } + + return alertingRules, recordingRules, ruler, ocpOpts, nil +} + +// listRules returns a slice of AlertingRules and a slice of RecordingRules for the given spec or an error. +// Three cases apply: +// - Return only matching rules in the stack namespace if no namespace selector is given. +// - Return only matching rules in the stack namespace and in namespaces matching the namespace selector. +// - Return no rules if rules selector does not apply at all. 
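For orientation, the three selection cases above can be driven by a spec along the following lines. This is only a sketch: the Selector/NamespaceSelector field names are assumed from the lokiv1 rules API (they are not shown in this diff), and the label keys and values are invented.

	// Assumed imports: metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" and
	// lokiv1 "github.com/grafana/loki/operator/apis/loki/v1".
	rs := &lokiv1.RulesSpec{
		Enabled: true,
		// Only AlertingRule/RecordingRule objects carrying this label are selected.
		Selector: &metav1.LabelSelector{
			MatchLabels: map[string]string{"loki": "enabled"},
		},
		// With a namespace selector, matching namespaces are searched in addition
		// to the stack namespace (the second case above); without one, only the
		// stack namespace is searched (the first case).
		NamespaceSelector: &metav1.LabelSelector{
			MatchLabels: map[string]string{"team": "observability"},
		},
	}
	alerting, recording, err := listRules(ctx, k, stack.Namespace, rs)
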
+func listRules(ctx context.Context, k k8s.Client, stackNs string, rs *lokiv1.RulesSpec) ([]lokiv1.AlertingRule, []lokiv1.RecordingRule, error) { nsl, err := selectRulesNamespaces(ctx, k, stackNs, rs) if err != nil { return nil, nil, err diff --git a/operator/internal/handlers/internal/rules/rules_test.go b/operator/internal/handlers/internal/rules/rules_test.go index 8bc52afb6a9a4..e33a2ac928a6a 100644 --- a/operator/internal/handlers/internal/rules/rules_test.go +++ b/operator/internal/handlers/internal/rules/rules_test.go @@ -1,4 +1,4 @@ -package rules_test +package rules import ( "context" @@ -11,13 +11,252 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" - "github.com/grafana/loki/operator/internal/handlers/internal/rules" + "github.com/grafana/loki/operator/internal/status" ) +func TestBuildOptions_WhenMissingRemoteWriteSecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + stack := lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Rules: &lokiv1.RulesSpec{ + Enabled: true, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + rulerCfg := &lokiv1.RulerConfig{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.RulerConfigSpec{ + RemoteWriteSpec: &lokiv1.RemoteWriteSpec{ + Enabled: true, + ClientSpec: &lokiv1.RemoteWriteClientSpec{ + AuthorizationType: lokiv1.BasicAuthorization, + AuthorizationSecretName: "test", + }, + }, + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing ruler remote write authorization secret", + Reason: lokiv1.ReasonMissingRulerSecret, + Requeue: false, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { + _, isRulerConfig := out.(*lokiv1.RulerConfig) + if r.Name == name.Name && r.Namespace == name.Namespace && isRulerConfig { + k.SetClientObject(out, rulerCfg) + return nil + } + + _, isLokiStack := out.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(out, &stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(out, &defaultSecret) + return nil + } + if defaultGatewaySecret.Name == name.Name { + 
k.SetClientObject(out, &defaultGatewaySecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, _, _, err := BuildOptions(context.TODO(), logger, k, &stack) + + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenInvalidRemoteWriteSecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + stack := lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Rules: &lokiv1.RulesSpec{ + Enabled: true, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + rulerCfg := &lokiv1.RulerConfig{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.RulerConfigSpec{ + RemoteWriteSpec: &lokiv1.RemoteWriteSpec{ + Enabled: true, + ClientSpec: &lokiv1.RemoteWriteClientSpec{ + AuthorizationType: lokiv1.BasicAuthorization, + AuthorizationSecretName: "some-client-secret", + }, + }, + }, + } + + invalidSecret := corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-client-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{}, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid ruler remote write authorization secret contents", + Reason: lokiv1.ReasonInvalidRulerSecret, + Requeue: false, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { + _, isRulerConfig := out.(*lokiv1.RulerConfig) + if r.Name == name.Name && r.Namespace == name.Namespace && isRulerConfig { + k.SetClientObject(out, rulerCfg) + return nil + } + + _, isLokiStack := out.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(out, &stack) + return nil + } + if invalidSecret.Name == name.Name { + k.SetClientObject(out, &invalidSecret) + return nil + } + if defaultGatewaySecret.Name == name.Name { + k.SetClientObject(out, &defaultGatewaySecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, _, _, err := BuildOptions(context.TODO(), logger, k, &stack) + + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + func TestList_AlertingRulesMatchSelector_WithDefaultStackNamespaceRules(t *testing.T) { const stackNs = "some-ns" @@ -83,7 +322,7 @@ func 
TestList_AlertingRulesMatchSelector_WithDefaultStackNamespaceRules(t *testi return nil } - rules, _, err := rules.List(context.TODO(), k, stackNs, rs) + rules, _, err := listRules(context.TODO(), k, stackNs, rs) require.NoError(t, err) require.NotEmpty(t, rules) @@ -185,7 +424,7 @@ func TestList_AlertingRulesMatchSelector_FilteredByNamespaceSelector(t *testing. return nil } - rules, _, err := rules.List(context.TODO(), k, stackNs, rs) + rules, _, err := listRules(context.TODO(), k, stackNs, rs) require.NoError(t, err) require.NotEmpty(t, rules) @@ -257,7 +496,7 @@ func TestList_RecordingRulesMatchSelector_WithDefaultStackNamespaceRules(t *test return nil } - _, rules, err := rules.List(context.TODO(), k, stackNs, rs) + _, rules, err := listRules(context.TODO(), k, stackNs, rs) require.NoError(t, err) require.NotEmpty(t, rules) @@ -358,7 +597,7 @@ func TestList_RecordingRulesMatchSelector_FilteredByNamespaceSelector(t *testing return nil } - _, rules, err := rules.List(context.TODO(), k, stackNs, rs) + _, rules, err := listRules(context.TODO(), k, stackNs, rs) require.NoError(t, err) require.NotEmpty(t, rules) diff --git a/operator/internal/handlers/internal/storage/ca_configmap.go b/operator/internal/handlers/internal/storage/ca_configmap.go index ccb4f93d06a34..904e63373a207 100644 --- a/operator/internal/handlers/internal/storage/ca_configmap.go +++ b/operator/internal/handlers/internal/storage/ca_configmap.go @@ -1,9 +1,67 @@ package storage -import corev1 "k8s.io/api/core/v1" +import ( + "context" + "crypto/sha1" + "fmt" -// IsValidCAConfigMap checks if the given CA configMap has an -// non-empty entry for the key -func IsValidCAConfigMap(cm *corev1.ConfigMap, key string) bool { - return cm.Data[key] != "" + "github.com/ViaQ/logerr/v2/kverrors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" + + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/status" +) + +const ( + defaultCAKey = "service-ca.crt" +) + +type caKeyError string + +func (e caKeyError) Error() string { + return fmt.Sprintf("key not present or data empty: %s", string(e)) +} + +func getCAConfigMap(ctx context.Context, k k8s.Client, stack *lokiv1.LokiStack, name string) (*corev1.ConfigMap, error) { + var cm corev1.ConfigMap + key := client.ObjectKey{Name: name, Namespace: stack.Namespace} + if err := k.Get(ctx, key, &cm); err != nil { + if apierrors.IsNotFound(err) { + return nil, &status.DegradedError{ + Message: "Missing object storage CA config map", + Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, + Requeue: false, + } + } + return nil, kverrors.Wrap(err, "failed to lookup lokistack object storage CA config map", "name", key) + } + + return &cm, nil +} + +// checkCAConfigMap checks if the given CA configMap has an non-empty entry for the key used as CA certificate. +// If the key is present it will return a hash of the current key name and contents. 
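For orientation, a minimal usage sketch of this helper, reusing the ConfigMap contents and expected digest from TestCheckValidConfigMap further down. The returned hash is what storage.BuildOptions later appends to Options.SecretSHA1, so a changed CA certificate also changes the recorded secret hash.

	cm := &corev1.ConfigMap{
		Data: map[string]string{"service-ca.crt": "has-some-data"},
	}
	hash, err := checkCAConfigMap(cm, "service-ca.crt")
	// err == nil and hash == "de6ae206d4920549d21c24ad9721e87a9b1ec7dc":
	// a hex SHA-1 over the key name, the "," separator, and the key's data.
	// A missing key or empty value instead yields a caKeyError:
	//   "key not present or data empty: service-ca.crt"
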
+func checkCAConfigMap(cm *corev1.ConfigMap, key string) (string, error) { + data := cm.Data[key] + if data == "" { + return "", caKeyError(key) + } + + h := sha1.New() + if _, err := h.Write([]byte(key)); err != nil { + return "", err + } + + if _, err := h.Write(hashSeparator); err != nil { + return "", err + } + + if _, err := h.Write([]byte(data)); err != nil { + return "", err + } + + return fmt.Sprintf("%x", h.Sum(nil)), nil } diff --git a/operator/internal/handlers/internal/storage/ca_configmap_test.go b/operator/internal/handlers/internal/storage/ca_configmap_test.go index 1e164f5a25413..33d5156defe95 100644 --- a/operator/internal/handlers/internal/storage/ca_configmap_test.go +++ b/operator/internal/handlers/internal/storage/ca_configmap_test.go @@ -1,19 +1,18 @@ -package storage_test +package storage import ( "testing" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" - - "github.com/grafana/loki/operator/internal/handlers/internal/storage" ) -func TestIsValidConfigMap(t *testing.T) { +func TestCheckValidConfigMap(t *testing.T) { type test struct { - name string - cm *corev1.ConfigMap - valid bool + name string + cm *corev1.ConfigMap + wantHash string + wantErrorMsg string } table := []test{ { @@ -23,11 +22,13 @@ func TestIsValidConfigMap(t *testing.T) { "service-ca.crt": "has-some-data", }, }, - valid: true, + wantHash: "de6ae206d4920549d21c24ad9721e87a9b1ec7dc", + wantErrorMsg: "", }, { - name: "missing `service-ca.crt` key", - cm: &corev1.ConfigMap{}, + name: "missing `service-ca.crt` key", + cm: &corev1.ConfigMap{}, + wantErrorMsg: "key not present or data empty: service-ca.crt", }, { name: "missing CA content", @@ -36,6 +37,7 @@ func TestIsValidConfigMap(t *testing.T) { "service-ca.crt": "", }, }, + wantErrorMsg: "key not present or data empty: service-ca.crt", }, } for _, tst := range table { @@ -43,8 +45,14 @@ func TestIsValidConfigMap(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - ok := storage.IsValidCAConfigMap(tst.cm, "service-ca.crt") - require.Equal(t, tst.valid, ok) + hash, err := checkCAConfigMap(tst.cm, "service-ca.crt") + + require.Equal(t, tst.wantHash, hash) + if tst.wantErrorMsg == "" { + require.NoError(t, err) + } else { + require.EqualError(t, err, tst.wantErrorMsg) + } }) } } diff --git a/operator/internal/handlers/internal/storage/secrets.go b/operator/internal/handlers/internal/storage/secrets.go index 1341728e7cecf..0ef5f197a625e 100644 --- a/operator/internal/handlers/internal/storage/secrets.go +++ b/operator/internal/handlers/internal/storage/secrets.go @@ -1,24 +1,46 @@ package storage import ( + "context" "crypto/sha1" "fmt" "sort" "github.com/ViaQ/logerr/v2/kverrors" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/manifests/storage" + "github.com/grafana/loki/operator/internal/status" ) var hashSeparator = []byte(",") -// ExtractSecret reads a k8s secret into a manifest object storage struct if valid. 
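The hunk below splits the old exported ExtractSecret into two package-private steps: getSecret fetches the secret and converts a NotFound into a status.DegradedError, while extractSecret only parses it and now returns storage.Options by value. Roughly, and with error handling elided, callers inside the package look like this sketch:

	s, err := getSecret(ctx, k, stack) // DegradedError if the referenced secret is missing
	opts, err := extractSecret(s, stack.Spec.Storage.Secret.Type)
	// opts is a storage.Options value, so callers no longer deal with a nil pointer
	// on the error path.
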
-func ExtractSecret(s *corev1.Secret, secretType lokiv1.ObjectStorageSecretType) (*storage.Options, error) { +func getSecret(ctx context.Context, k k8s.Client, stack *lokiv1.LokiStack) (*corev1.Secret, error) { + var storageSecret corev1.Secret + key := client.ObjectKey{Name: stack.Spec.Storage.Secret.Name, Namespace: stack.Namespace} + if err := k.Get(ctx, key, &storageSecret); err != nil { + if apierrors.IsNotFound(err) { + return nil, &status.DegradedError{ + Message: "Missing object storage secret", + Reason: lokiv1.ReasonMissingObjectStorageSecret, + Requeue: false, + } + } + return nil, kverrors.Wrap(err, "failed to lookup lokistack storage secret", "name", key) + } + + return &storageSecret, nil +} + +// extractSecret reads a k8s secret into a manifest object storage struct if valid. +func extractSecret(s *corev1.Secret, secretType lokiv1.ObjectStorageSecretType) (storage.Options, error) { hash, err := hashSecretData(s) if err != nil { - return nil, kverrors.Wrap(err, "error calculating hash for secret", "type", secretType) + return storage.Options{}, kverrors.Wrap(err, "error calculating hash for secret", "type", secretType) } storageOpts := storage.Options{ @@ -39,13 +61,13 @@ func ExtractSecret(s *corev1.Secret, secretType lokiv1.ObjectStorageSecretType) case lokiv1.ObjectStorageSecretAlibabaCloud: storageOpts.AlibabaCloud, err = extractAlibabaCloudConfigSecret(s) default: - return nil, kverrors.New("unknown secret type", "type", secretType) + return storage.Options{}, kverrors.New("unknown secret type", "type", secretType) } if err != nil { - return nil, err + return storage.Options{}, err } - return &storageOpts, nil + return storageOpts, nil } func hashSecretData(s *corev1.Secret) (string, error) { diff --git a/operator/internal/handlers/internal/storage/secrets_test.go b/operator/internal/handlers/internal/storage/secrets_test.go index 46ddc133f9f42..c72c63ea1ee12 100644 --- a/operator/internal/handlers/internal/storage/secrets_test.go +++ b/operator/internal/handlers/internal/storage/secrets_test.go @@ -135,7 +135,7 @@ func TestAzureExtract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - opts, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretAzure) + opts, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretAzure) if !tst.wantErr { require.NoError(t, err) require.NotEmpty(t, opts.SecretName) @@ -186,7 +186,7 @@ func TestGCSExtract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - _, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretGCS) + _, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretGCS) if !tst.wantErr { require.NoError(t, err) } @@ -360,7 +360,7 @@ func TestS3Extract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - opts, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretS3) + opts, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretS3) if !tst.wantErr { require.NoError(t, err) require.NotEmpty(t, opts.SecretName) @@ -509,7 +509,7 @@ func TestSwiftExtract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - opts, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretSwift) + opts, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretSwift) if !tst.wantErr { require.NoError(t, err) require.NotEmpty(t, opts.SecretName) @@ -583,7 +583,7 @@ func TestAlibabaCloudExtract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - opts, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretAlibabaCloud) + opts, err := 
extractSecret(tst.secret, lokiv1.ObjectStorageSecretAlibabaCloud) if !tst.wantErr { require.NoError(t, err) require.NotEmpty(t, opts.SecretName) diff --git a/operator/internal/handlers/internal/storage/storage.go b/operator/internal/handlers/internal/storage/storage.go new file mode 100644 index 0000000000000..e1657121ccd6d --- /dev/null +++ b/operator/internal/handlers/internal/storage/storage.go @@ -0,0 +1,91 @@ +package storage + +import ( + "context" + "fmt" + "time" + + configv1 "github.com/grafana/loki/operator/apis/config/v1" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/manifests/storage" + "github.com/grafana/loki/operator/internal/status" +) + +// BuildOptions returns the object storage options to generate Kubernetes resource manifests +// which require access to object storage buckets. +// The returned error can be a status.DegradedError in the following cases: +// - The user-provided object storage secret is missing. +// - The object storage Secret data is invalid. +// - The object storage schema config is invalid. +// - The object storage CA ConfigMap is missing if one referenced. +// - The object storage CA ConfigMap data is invalid. +func BuildOptions(ctx context.Context, k k8s.Client, stack *lokiv1.LokiStack, fg configv1.FeatureGates) (storage.Options, error) { + storageSecret, err := getSecret(ctx, k, stack) + if err != nil { + return storage.Options{}, err + } + + objStore, err := extractSecret(storageSecret, stack.Spec.Storage.Secret.Type) + if err != nil { + return storage.Options{}, &status.DegradedError{ + Message: fmt.Sprintf("Invalid object storage secret contents: %s", err), + Reason: lokiv1.ReasonInvalidObjectStorageSecret, + Requeue: false, + } + } + objStore.OpenShiftEnabled = fg.OpenShift.Enabled + + storageSchemas, err := storage.BuildSchemaConfig( + time.Now().UTC(), + stack.Spec.Storage, + stack.Status.Storage, + ) + if err != nil { + return storage.Options{}, &status.DegradedError{ + Message: fmt.Sprintf("Invalid object storage schema contents: %s", err), + Reason: lokiv1.ReasonInvalidObjectStorageSchema, + Requeue: false, + } + } + + objStore.Schemas = storageSchemas + + if stack.Spec.Storage.TLS == nil { + return objStore, nil + } + + tlsConfig := stack.Spec.Storage.TLS + if tlsConfig.CA == "" { + return storage.Options{}, &status.DegradedError{ + Message: "Missing object storage CA config map", + Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, + Requeue: false, + } + } + + cm, err := getCAConfigMap(ctx, k, stack, tlsConfig.CA) + if err != nil { + return storage.Options{}, err + } + + caKey := defaultCAKey + if tlsConfig.CAKey != "" { + caKey = tlsConfig.CAKey + } + + var caHash string + caHash, err = checkCAConfigMap(cm, caKey) + if err != nil { + return storage.Options{}, &status.DegradedError{ + Message: fmt.Sprintf("Invalid object storage CA configmap contents: %s", err), + Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, + Requeue: false, + } + } + + objStore.SecretSHA1 = fmt.Sprintf("%s;%s", objStore.SecretSHA1, caHash) + objStore.TLS = &storage.TLSConfig{CA: cm.Name, Key: caKey} + + return objStore, nil +} diff --git a/operator/internal/handlers/internal/storage/storage_test.go b/operator/internal/handlers/internal/storage/storage_test.go new file mode 100644 index 0000000000000..f56e446d6da8f --- /dev/null +++ b/operator/internal/handlers/internal/storage/storage_test.go @@ -0,0 +1,477 @@ +package storage + +import ( + 
"context" + "testing" + + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + configv1 "github.com/grafana/loki/operator/apis/config/v1" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" + "github.com/grafana/loki/operator/internal/status" +) + +var ( + featureGates = configv1.FeatureGates{ + ServiceMonitors: false, + ServiceMonitorTLSEndpoints: false, + BuiltInCertManagement: configv1.BuiltInCertManagement{ + Enabled: true, + CACertValidity: "10m", + CACertRefresh: "5m", + CertValidity: "2m", + CertRefresh: "1m", + }, + } + + defaultSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "endpoint": []byte("s3://your-endpoint"), + "region": []byte("a-region"), + "bucketnames": []byte("bucket1,bucket2"), + "access_key_id": []byte("a-secret-id"), + "access_key_secret": []byte("a-secret-key"), + }, + } + + invalidSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{}, + } + + invalidCAConfigMap = corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-ca-configmap", + Namespace: "some-ns", + }, + Data: map[string]string{}, + } +) + +func TestBuildOptions_WhenMissingSecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing object storage secret", + Reason: lokiv1.ReasonMissingObjectStorageSecret, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenInvalidSecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid object storage secret contents: missing secret field", + 
Reason: lokiv1.ReasonInvalidObjectStorageSecret, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: invalidSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + if name.Name == invalidSecret.Name { + k.SetClientObject(object, &invalidSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WithInvalidStorageSchema_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid object storage schema contents: spec does not contain any schemas", + Reason: lokiv1.ReasonInvalidObjectStorageSchema, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{}, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + }, + Status: lokiv1.LokiStackStatus{ + Storage: lokiv1.LokiStackStorageStatus{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + { + Version: lokiv1.ObjectStorageSchemaV12, + EffectiveDate: "2021-10-11", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + if name.Name == defaultSecret.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenMissingCAConfigMap_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + 
} + + degradedErr := &status.DegradedError{ + Message: "Missing object storage CA config map", + Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + TLS: &lokiv1.ObjectStorageTLSSpec{ + CASpec: lokiv1.CASpec{ + CA: "not-existing", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + + if name.Name == defaultSecret.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenEmptyCAConfigMapName_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing object storage CA config map", + Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + TLS: &lokiv1.ObjectStorageTLSSpec{ + CASpec: lokiv1.CASpec{ + CA: "", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + + if name.Name == defaultSecret.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenInvalidCAConfigMap_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: 
"my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid object storage CA configmap contents: key not present or data empty: service-ca.crt", + Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + TLS: &lokiv1.ObjectStorageTLSSpec{ + CASpec: lokiv1.CASpec{ + CA: invalidCAConfigMap.Name, + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + if name.Name == defaultSecret.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + + if name.Name == invalidCAConfigMap.Name { + k.SetClientObject(object, &invalidCAConfigMap) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} diff --git a/operator/internal/handlers/lokistack_create_or_update.go b/operator/internal/handlers/lokistack_create_or_update.go index 49c84af4dcf4b..b64713f2d0fda 100644 --- a/operator/internal/handlers/lokistack_create_or_update.go +++ b/operator/internal/handlers/lokistack_create_or_update.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "os" - "time" "github.com/ViaQ/logerr/v2/kverrors" "github.com/go-logr/logr" @@ -20,22 +19,15 @@ import ( lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/handlers/internal/gateway" - "github.com/grafana/loki/operator/internal/handlers/internal/openshift" "github.com/grafana/loki/operator/internal/handlers/internal/rules" "github.com/grafana/loki/operator/internal/handlers/internal/serviceaccounts" "github.com/grafana/loki/operator/internal/handlers/internal/storage" "github.com/grafana/loki/operator/internal/handlers/internal/tlsprofile" "github.com/grafana/loki/operator/internal/manifests" - manifests_openshift "github.com/grafana/loki/operator/internal/manifests/openshift" - storageoptions "github.com/grafana/loki/operator/internal/manifests/storage" "github.com/grafana/loki/operator/internal/metrics" "github.com/grafana/loki/operator/internal/status" ) -const ( - defaultCAKey = "service-ca.crt" -) - // CreateOrUpdateLokiStack handles LokiStack create and update events. 
func CreateOrUpdateLokiStack( ctx context.Context, @@ -67,202 +59,23 @@ func CreateOrUpdateLokiStack( gwImg = manifests.DefaultLokiStackGatewayImage } - var storageSecret corev1.Secret - key := client.ObjectKey{Name: stack.Spec.Storage.Secret.Name, Namespace: stack.Namespace} - if err := k.Get(ctx, key, &storageSecret); err != nil { - if apierrors.IsNotFound(err) { - return &status.DegradedError{ - Message: "Missing object storage secret", - Reason: lokiv1.ReasonMissingObjectStorageSecret, - Requeue: false, - } - } - return kverrors.Wrap(err, "failed to lookup lokistack storage secret", "name", key) - } - - objStore, err := storage.ExtractSecret(&storageSecret, stack.Spec.Storage.Secret.Type) + objStore, err := storage.BuildOptions(ctx, k, &stack, fg) if err != nil { - return &status.DegradedError{ - Message: fmt.Sprintf("Invalid object storage secret contents: %s", err), - Reason: lokiv1.ReasonInvalidObjectStorageSecret, - Requeue: false, - } + return err } - objStore.OpenShiftEnabled = fg.OpenShift.Enabled - storageSchemas, err := storageoptions.BuildSchemaConfig( - time.Now().UTC(), - stack.Spec.Storage, - stack.Status.Storage, - ) + baseDomain, tenants, err := gateway.BuildOptions(ctx, ll, k, &stack, fg) if err != nil { - return &status.DegradedError{ - Message: fmt.Sprintf("Invalid object storage schema contents: %s", err), - Reason: lokiv1.ReasonInvalidObjectStorageSchema, - Requeue: false, - } - } - - objStore.Schemas = storageSchemas - - if stack.Spec.Storage.TLS != nil { - tlsConfig := stack.Spec.Storage.TLS - - if tlsConfig.CA == "" { - return &status.DegradedError{ - Message: "Missing object storage CA config map", - Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, - Requeue: false, - } - } - - var cm corev1.ConfigMap - key := client.ObjectKey{Name: tlsConfig.CA, Namespace: stack.Namespace} - if err = k.Get(ctx, key, &cm); err != nil { - if apierrors.IsNotFound(err) { - return &status.DegradedError{ - Message: "Missing object storage CA config map", - Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, - Requeue: false, - } - } - return kverrors.Wrap(err, "failed to lookup lokistack object storage CA config map", "name", key) - } - - caKey := defaultCAKey - if tlsConfig.CAKey != "" { - caKey = tlsConfig.CAKey - } - - if !storage.IsValidCAConfigMap(&cm, caKey) { - return &status.DegradedError{ - Message: "Invalid object storage CA configmap contents: missing key or no contents", - Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, - Requeue: false, - } - } - - objStore.TLS = &storageoptions.TLSConfig{CA: cm.Name, Key: caKey} + return err } - var ( - baseDomain string - tenantSecrets []*manifests.TenantSecrets - tenantConfigs map[string]manifests.TenantConfig - ) - if fg.LokiStackGateway && stack.Spec.Tenants == nil { - return &status.DegradedError{ - Message: "Invalid tenants configuration - TenantsSpec cannot be nil when gateway flag is enabled", - Reason: lokiv1.ReasonInvalidTenantsConfiguration, - Requeue: false, - } - } else if fg.LokiStackGateway && stack.Spec.Tenants != nil { - if err = gateway.ValidateModes(stack); err != nil { - return &status.DegradedError{ - Message: fmt.Sprintf("Invalid tenants configuration: %s", err), - Reason: lokiv1.ReasonInvalidTenantsConfiguration, - Requeue: false, - } - } - - switch stack.Spec.Tenants.Mode { - case lokiv1.OpenshiftLogging, lokiv1.OpenshiftNetwork: - baseDomain, err = gateway.GetOpenShiftBaseDomain(ctx, k, req) - if err != nil { - return err - } - - if stack.Spec.Proxy == nil { - // If the LokiStack has no proxy set 
but there is a cluster-wide proxy setting, - // set the LokiStack proxy to that. - ocpProxy, proxyErr := openshift.GetProxy(ctx, k) - if proxyErr != nil { - return proxyErr - } - - stack.Spec.Proxy = ocpProxy - } - default: - tenantSecrets, err = gateway.GetTenantSecrets(ctx, k, req, &stack) - if err != nil { - return err - } - } - - // extract the existing tenant's id, cookieSecret if exists, otherwise create new. - tenantConfigs, err = gateway.GetTenantConfigSecretData(ctx, k, req) - if err != nil { - ll.Error(err, "error in getting tenant secret data") - } + if err = rules.Cleanup(ctx, ll, k, &stack); err != nil { + return err } - var ( - alertingRules []lokiv1.AlertingRule - recordingRules []lokiv1.RecordingRule - rulerConfig *lokiv1.RulerConfigSpec - rulerSecret *manifests.RulerSecret - ocpAmEnabled bool - ocpUWAmEnabled bool - ) - if stack.Spec.Rules != nil && stack.Spec.Rules.Enabled { - alertingRules, recordingRules, err = rules.List(ctx, k, req.Namespace, stack.Spec.Rules) - if err != nil { - ll.Error(err, "failed to lookup rules", "spec", stack.Spec.Rules) - } - - rulerConfig, err = rules.GetRulerConfig(ctx, k, req) - if err != nil { - ll.Error(err, "failed to lookup ruler config", "key", req.NamespacedName) - } - - if rulerConfig != nil && rulerConfig.RemoteWriteSpec != nil && rulerConfig.RemoteWriteSpec.ClientSpec != nil { - var rs corev1.Secret - key := client.ObjectKey{Name: rulerConfig.RemoteWriteSpec.ClientSpec.AuthorizationSecretName, Namespace: stack.Namespace} - if err = k.Get(ctx, key, &rs); err != nil { - if apierrors.IsNotFound(err) { - return &status.DegradedError{ - Message: "Missing ruler remote write authorization secret", - Reason: lokiv1.ReasonMissingRulerSecret, - Requeue: false, - } - } - return kverrors.Wrap(err, "failed to lookup lokistack ruler secret", "name", key) - } - - rulerSecret, err = rules.ExtractRulerSecret(&rs, rulerConfig.RemoteWriteSpec.ClientSpec.AuthorizationType) - if err != nil { - return &status.DegradedError{ - Message: "Invalid ruler remote write authorization secret contents", - Reason: lokiv1.ReasonInvalidRulerSecret, - Requeue: false, - } - } - } - - ocpAmEnabled, err = openshift.AlertManagerSVCExists(ctx, stack.Spec, k) - if err != nil { - ll.Error(err, "failed to check OCP AlertManager") - return err - } - - ocpUWAmEnabled, err = openshift.UserWorkloadAlertManagerSVCExists(ctx, stack.Spec, k) - if err != nil { - ll.Error(err, "failed to check OCP User Workload AlertManager") - return err - } - } else { - // Clean up ruler resources - err = rules.RemoveRulesConfigMap(ctx, req, k) - if err != nil { - ll.Error(err, "failed to remove rules ConfigMap") - return err - } - - err = rules.RemoveRuler(ctx, req, k) - if err != nil { - ll.Error(err, "failed to remove ruler StatefulSet") - return err - } + alertingRules, recordingRules, ruler, ocpOptions, err := rules.BuildOptions(ctx, ll, k, &stack) + if err != nil { + return err } certRotationRequiredAt := "" @@ -289,25 +102,14 @@ func CreateOrUpdateLokiStack( GatewayBaseDomain: baseDomain, Stack: stack.Spec, Gates: fg, - ObjectStorage: *objStore, + ObjectStorage: objStore, CertRotationRequiredAt: certRotationRequiredAt, AlertingRules: alertingRules, RecordingRules: recordingRules, - Ruler: manifests.Ruler{ - Spec: rulerConfig, - Secret: rulerSecret, - }, - Timeouts: timeoutConfig, - Tenants: manifests.Tenants{ - Secrets: tenantSecrets, - Configs: tenantConfigs, - }, - OpenShiftOptions: manifests_openshift.Options{ - BuildOpts: manifests_openshift.BuildOptions{ - AlertManagerEnabled: 
ocpAmEnabled, - UserWorkloadAlertManagerEnabled: ocpUWAmEnabled, - }, - }, + Ruler: ruler, + Timeouts: timeoutConfig, + Tenants: tenants, + OpenShiftOptions: ocpOptions, } ll.Info("begin building manifests") @@ -354,7 +156,7 @@ func CreateOrUpdateLokiStack( // updated and another resource is not. This would cause the status to // be possibly misaligned with the configmap, which could lead to // a user possibly being unable to read logs. - if err := status.SetStorageSchemaStatus(ctx, k, req, storageSchemas); err != nil { + if err := status.SetStorageSchemaStatus(ctx, k, req, objStore.Schemas); err != nil { ll.Error(err, "failed to set storage schema status") return err } diff --git a/operator/internal/handlers/lokistack_create_or_update_test.go b/operator/internal/handlers/lokistack_create_or_update_test.go index 79928b4a82e50..ad80f45b817a1 100644 --- a/operator/internal/handlers/lokistack_create_or_update_test.go +++ b/operator/internal/handlers/lokistack_create_or_update_test.go @@ -13,7 +13,6 @@ import ( routev1 "github.com/openshift/api/route/v1" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -73,42 +72,6 @@ var ( "issuerCAPath": []byte("/tmp/test/ca.pem"), }, } - - rulesCM = corev1.ConfigMap{ - TypeMeta: metav1.TypeMeta{ - Kind: "ConfigMap", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack-rules-0", - Namespace: "some-ns", - }, - } - - rulerSS = appsv1.StatefulSet{ - TypeMeta: metav1.TypeMeta{ - Kind: "StatefulSet", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack-ruler", - Namespace: "some-ns", - }, - } - - invalidSecret = corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: "some-stack-secret", - Namespace: "some-ns", - }, - Data: map[string][]byte{}, - } - - invalidCAConfigMap = corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "some-stack-ca-configmap", - Namespace: "some-ns", - }, - Data: map[string]string{}, - } ) func TestMain(m *testing.M) { @@ -573,8 +536,6 @@ func TestCreateOrUpdateLokiStack_WhenCreateReturnsError_ContinueWithOtherObjects }, } - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { _, isLokiStack := object.(*lokiv1.LokiStack) if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { @@ -681,8 +642,6 @@ func TestCreateOrUpdateLokiStack_WhenUpdateReturnsError_ContinueWithOtherObjects }, } - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
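Taken together, the handler hunks above reduce CreateOrUpdateLokiStack to a thin orchestration layer. A condensed sketch of the added lines, with error handling elided:

	objStore, err := storage.BuildOptions(ctx, k, &stack, fg)                // storage secret, schema config, optional CA
	baseDomain, tenants, err := gateway.BuildOptions(ctx, ll, k, &stack, fg) // tenant secrets/configs, OCP base domain
	err = rules.Cleanup(ctx, ll, k, &stack)                                  // drops ruler ConfigMaps/StatefulSet when rules are disabled
	alertingRules, recordingRules, ruler, ocpOptions, err := rules.BuildOptions(ctx, ll, k, &stack)
	// Each helper returns a *status.DegradedError where the old inline code set one,
	// and the handler now returns that error unchanged.
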
k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { _, isLokiStack := object.(*lokiv1.LokiStack) if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { @@ -710,69 +669,7 @@ func TestCreateOrUpdateLokiStack_WhenUpdateReturnsError_ContinueWithOtherObjects require.Error(t, err) } -func TestCreateOrUpdateLokiStack_WhenMissingSecret_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Missing object storage secret", - Reason: lokiv1.ReasonMissingObjectStorageSecret, - Requeue: false, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. - k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(object, stack) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WhenInvalidSecret_SetDegraded(t *testing.T) { +func TestCreateOrUpdateLokiStack_WhenInvalidQueryTimeout_SetDegraded(t *testing.T) { sw := &k8sfakes.FakeStatusWriter{} k := &k8sfakes.FakeClient{} r := ctrl.Request{ @@ -783,8 +680,8 @@ func TestCreateOrUpdateLokiStack_WhenInvalidSecret_SetDegraded(t *testing.T) { } degradedErr := &status.DegradedError{ - Message: "Invalid object storage secret contents: missing secret field", - Reason: lokiv1.ReasonInvalidObjectStorageSecret, + Message: `Error parsing query timeout: time: invalid duration "invalid"`, + Reason: lokiv1.ReasonQueryTimeoutInvalid, Requeue: false, } @@ -802,179 +699,37 @@ func TestCreateOrUpdateLokiStack_WhenInvalidSecret_SetDegraded(t *testing.T) { Storage: lokiv1.ObjectStorageSpec{ Schemas: []lokiv1.ObjectStorageSchema{ { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", + Version: lokiv1.ObjectStorageSchemaV12, + EffectiveDate: "2023-05-22", }, }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: invalidSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(object, stack) - return nil - } - if name.Name == invalidSecret.Name { - k.SetClientObject(object, &invalidSecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WithInvalidStorageSchema_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Invalid object storage schema contents: spec does not contain any schemas", - Reason: lokiv1.ReasonInvalidObjectStorageSchema, - Requeue: false, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{}, Secret: lokiv1.ObjectStorageSecretSpec{ Name: defaultSecret.Name, Type: lokiv1.ObjectStorageSecretS3, }, }, - }, - Status: lokiv1.LokiStackStatus{ - Storage: lokiv1.LokiStackStorageStatus{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - { - Version: lokiv1.ObjectStorageSchemaV12, - EffectiveDate: "2021-10-11", - }, - }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "openshift", }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
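// The storage-schema test above expects a degraded status ("spec does not contain any
// schemas") when the spec lists no schemas while the status still records previously
// applied ones. Below is a rough, illustrative sketch of that kind of check (non-empty
// schema list, parseable effective dates); it is not the operator's actual validation code.

package main

import (
	"errors"
	"fmt"
	"time"
)

// schema mirrors the two fields used in these specs: a version label and an
// effective date in YYYY-MM-DD form. Purely illustrative.
type schema struct {
	Version       string
	EffectiveDate string
}

// validateSchemas is a rough stand-in for the validation this test exercised:
// reject an empty schema list and unparseable effective dates.
func validateSchemas(schemas []schema) error {
	if len(schemas) == 0 {
		return errors.New("spec does not contain any schemas")
	}
	for _, s := range schemas {
		if _, err := time.Parse("2006-01-02", s.EffectiveDate); err != nil {
			return fmt.Errorf("invalid effective date %q: %w", s.EffectiveDate, err)
		}
	}
	return nil
}

func main() {
	fmt.Println(validateSchemas(nil))
	fmt.Println(validateSchemas([]schema{{Version: "v12", EffectiveDate: "2021-10-11"}}))
}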
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(object, stack) - return nil - } - if name.Name == defaultSecret.Name { - k.SetClientObject(object, &defaultSecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WhenMissingCAConfigMap_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Missing object storage CA config map", - Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, - Requeue: false, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - TLS: &lokiv1.ObjectStorageTLSSpec{ - CASpec: lokiv1.CASpec{ - CA: "not-existing", + Limits: &lokiv1.LimitsSpec{ + Global: &lokiv1.LimitsTemplateSpec{ + QueryLimits: &lokiv1.QueryLimitSpec{ + QueryTimeout: "invalid", }, }, }, }, } - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
+ // Create looks up the CR first, so we need to return our fake stack k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + if r.Name == name.Name && r.Namespace == name.Namespace { k.SetClientObject(object, stack) - return nil } - - if name.Name == defaultSecret.Name { + if defaultSecret.Name == name.Name { k.SetClientObject(object, &defaultSecret) - return nil } - - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + return nil } k.StatusStub = func() client.StatusWriter { return sw } @@ -985,642 +740,3 @@ func TestCreateOrUpdateLokiStack_WhenMissingCAConfigMap_SetDegraded(t *testing.T require.Error(t, err) require.Equal(t, degradedErr, err) } - -func TestCreateOrUpdateLokiStack_WhenInvalidCAConfigMap_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Invalid object storage CA configmap contents: missing key or no contents", - Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, - Requeue: false, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - TLS: &lokiv1.ObjectStorageTLSSpec{ - CASpec: lokiv1.CASpec{ - CA: invalidCAConfigMap.Name, - }, - }, - }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
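// The GetStub closures in these tests all share one lookup shape: hand back the fake
// LokiStack when the reconcile request's name and namespace match, hand back stubbed
// secrets or config maps by name, and otherwise report NotFound so the handler under
// test falls through to its create path. A minimal, self-contained sketch of that
// pattern in plain Go follows; the key type and object values are illustrative
// stand-ins, not the controller-runtime or operator API.

package main

import (
	"errors"
	"fmt"
)

type objectKey struct{ Namespace, Name string }

var errNotFound = errors.New("not found")

// fakeGetter serves the given objects by key and reports errNotFound for
// everything else, which is what lets a handler take its "create" path.
func fakeGetter(objects map[objectKey]string) func(objectKey) (string, error) {
	return func(k objectKey) (string, error) {
		if obj, ok := objects[k]; ok {
			return obj, nil
		}
		return "", errNotFound
	}
}

func main() {
	get := fakeGetter(map[objectKey]string{
		{Namespace: "some-ns", Name: "my-stack"}:          "LokiStack",
		{Namespace: "some-ns", Name: "some-stack-secret"}: "Secret",
	})

	if obj, err := get(objectKey{Namespace: "some-ns", Name: "my-stack"}); err == nil {
		fmt.Println("found:", obj)
	}
	if _, err := get(objectKey{Namespace: "some-ns", Name: "missing"}); errors.Is(err, errNotFound) {
		fmt.Println("anything else is NotFound, so the handler creates the object")
	}
}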
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(object, stack) - return nil - } - if name.Name == defaultSecret.Name { - k.SetClientObject(object, &defaultSecret) - return nil - } - - if name.Name == invalidCAConfigMap.Name { - k.SetClientObject(object, &invalidCAConfigMap) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WhenInvalidTenantsConfiguration_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Invalid tenants configuration: mandatory configuration - missing OPA Url", - Reason: lokiv1.ReasonInvalidTenantsConfiguration, - Requeue: false, - } - - ff := configv1.FeatureGates{ - LokiStackGateway: true, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - Tenants: &lokiv1.TenantsSpec{ - Mode: "dynamic", - Authentication: []lokiv1.AuthenticationSpec{ - { - TenantName: "test", - TenantID: "1234", - OIDC: &lokiv1.OIDCSpec{ - Secret: &lokiv1.TenantSecretSpec{ - Name: defaultGatewaySecret.Name, - }, - }, - }, - }, - Authorization: nil, - }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(object, stack) - return nil - } - if defaultSecret.Name == name.Name { - k.SetClientObject(object, &defaultSecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, ff) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WhenMissingGatewaySecret_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Missing secrets for tenant test", - Reason: lokiv1.ReasonMissingGatewayTenantSecret, - Requeue: true, - } - - ff := configv1.FeatureGates{ - LokiStackGateway: true, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - Tenants: &lokiv1.TenantsSpec{ - Mode: "dynamic", - Authentication: []lokiv1.AuthenticationSpec{ - { - TenantName: "test", - TenantID: "1234", - OIDC: &lokiv1.OIDCSpec{ - Secret: &lokiv1.TenantSecretSpec{ - Name: defaultGatewaySecret.Name, - }, - }, - }, - }, - Authorization: &lokiv1.AuthorizationSpec{ - OPA: &lokiv1.OPASpec{ - URL: "some-url", - }, - }, - }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - o, ok := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && ok { - k.SetClientObject(o, stack) - return nil - } - if defaultSecret.Name == name.Name { - k.SetClientObject(object, &defaultSecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, ff) - - // make sure error is returned to re-trigger reconciliation - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WhenInvalidGatewaySecret_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Invalid gateway tenant secret contents", - Reason: lokiv1.ReasonInvalidGatewayTenantSecret, - Requeue: true, - } - - ff := configv1.FeatureGates{ - LokiStackGateway: true, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - Tenants: &lokiv1.TenantsSpec{ - Mode: "dynamic", - Authentication: []lokiv1.AuthenticationSpec{ - { - TenantName: "test", - TenantID: "1234", - OIDC: &lokiv1.OIDCSpec{ - Secret: &lokiv1.TenantSecretSpec{ - Name: invalidSecret.Name, - }, - }, - }, - }, - Authorization: &lokiv1.AuthorizationSpec{ - OPA: &lokiv1.OPASpec{ - URL: "some-url", - }, - }, - }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
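// The degraded-path tests above all build a *status.DegradedError with a Message, a
// Reason and a Requeue flag, and assert that CreateOrUpdateLokiStack returns exactly
// that value. A self-contained sketch of the typed-error pattern follows; the field
// names match the tests above, everything else (reason value, reconcile function) is
// illustrative.

package main

import (
	"errors"
	"fmt"
)

// DegradedError carries the fields the tests assert on; this is a sketch,
// not the operator's status package.
type DegradedError struct {
	Message string
	Reason  string
	Requeue bool
}

func (e *DegradedError) Error() string {
	return fmt.Sprintf("cluster degraded: %s (reason %s)", e.Message, e.Reason)
}

// reconcile stands in for the handler: a missing tenant secret is surfaced
// as a typed error so the caller can decide whether to requeue.
func reconcile(secretPresent bool) error {
	if !secretPresent {
		return &DegradedError{
			Message: "Missing secrets for tenant test",
			Reason:  "MissingGatewayTenantSecret",
			Requeue: true,
		}
	}
	return nil
}

func main() {
	err := reconcile(false)

	var degraded *DegradedError
	if errors.As(err, &degraded) && degraded.Requeue {
		fmt.Println("requeue reconciliation:", degraded.Message)
	}
}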
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - o, ok := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && ok { - k.SetClientObject(o, stack) - return nil - } - if defaultSecret.Name == name.Name { - k.SetClientObject(object, &defaultSecret) - return nil - } - if name.Name == invalidSecret.Name { - k.SetClientObject(object, &invalidSecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, ff) - - // make sure error is returned to re-trigger reconciliation - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_MissingTenantsSpec_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Invalid tenants configuration - TenantsSpec cannot be nil when gateway flag is enabled", - Reason: lokiv1.ReasonInvalidTenantsConfiguration, - Requeue: false, - } - - ff := configv1.FeatureGates{ - LokiStackGateway: true, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - Tenants: nil, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - o, ok := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && ok { - k.SetClientObject(o, stack) - return nil - } - if defaultSecret.Name == name.Name { - k.SetClientObject(object, &defaultSecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, ff) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WhenInvalidQueryTimeout_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: `Error parsing query timeout: time: invalid duration "invalid"`, - Reason: lokiv1.ReasonQueryTimeoutInvalid, - Requeue: false, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV12, - EffectiveDate: "2023-05-22", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - Tenants: &lokiv1.TenantsSpec{ - Mode: "openshift", - }, - Limits: &lokiv1.LimitsSpec{ - Global: &lokiv1.LimitsTemplateSpec{ - QueryLimits: &lokiv1.QueryLimitSpec{ - QueryTimeout: "invalid", - }, - }, - }, - }, - } - - // Create looks up the CR first, so we need to return our fake stack - k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - if r.Name == name.Name && r.Namespace == name.Namespace { - k.SetClientObject(object, stack) - } - if defaultSecret.Name == name.Name { - k.SetClientObject(object, &defaultSecret) - } - return nil - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_RemovesRulerResourcesWhenDisabled(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - stack := lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - Rules: &lokiv1.RulesSpec{ - Enabled: true, - }, - Tenants: &lokiv1.TenantsSpec{ - Mode: "dynamic", - Authentication: []lokiv1.AuthenticationSpec{ - { - 
TenantName: "test", - TenantID: "1234", - OIDC: &lokiv1.OIDCSpec{ - Secret: &lokiv1.TenantSecretSpec{ - Name: defaultGatewaySecret.Name, - }, - }, - }, - }, - Authorization: &lokiv1.AuthorizationSpec{ - OPA: &lokiv1.OPASpec{ - URL: "some-url", - }, - }, - }, - }, - } - - k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { - _, ok := out.(*lokiv1.RulerConfig) - if ok { - return apierrors.NewNotFound(schema.GroupResource{}, "no ruler config") - } - - _, isLokiStack := out.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(out, &stack) - return nil - } - if defaultSecret.Name == name.Name { - k.SetClientObject(out, &defaultSecret) - return nil - } - if defaultGatewaySecret.Name == name.Name { - k.SetClientObject(out, &defaultGatewaySecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") - } - - k.CreateStub = func(_ context.Context, o client.Object, _ ...client.CreateOption) error { - assert.Equal(t, r.Namespace, o.GetNamespace()) - return nil - } - - k.StatusStub = func() client.StatusWriter { return sw } - - k.DeleteStub = func(_ context.Context, o client.Object, _ ...client.DeleteOption) error { - assert.Equal(t, r.Namespace, o.GetNamespace()) - return nil - } - - k.ListStub = func(_ context.Context, list client.ObjectList, options ...client.ListOption) error { - switch list.(type) { - case *corev1.ConfigMapList: - k.SetClientObjectList(list, &corev1.ConfigMapList{ - Items: []corev1.ConfigMap{ - rulesCM, - }, - }) - } - return nil - } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - require.NoError(t, err) - - // make sure create was called - require.NotZero(t, k.CreateCallCount()) - - // make sure delete not called - require.Zero(t, k.DeleteCallCount()) - - // Disable the ruler - stack.Spec.Rules.Enabled = false - - // Get should return ruler resources - k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { - _, ok := out.(*lokiv1.RulerConfig) - if ok { - return apierrors.NewNotFound(schema.GroupResource{}, "no ruler config") - } - if rulesCM.Name == name.Name { - k.SetClientObject(out, &rulesCM) - return nil - } - if rulerSS.Name == name.Name { - k.SetClientObject(out, &rulerSS) - return nil - } - - _, isLokiStack := out.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(out, &stack) - return nil - } - if defaultSecret.Name == name.Name { - k.SetClientObject(out, &defaultSecret) - return nil - } - if defaultGatewaySecret.Name == name.Name { - k.SetClientObject(out, &defaultGatewaySecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") - } - err = CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - require.NoError(t, err) - - // make sure delete was called twice (delete rules configmap and ruler statefulset) - require.Equal(t, 2, k.DeleteCallCount()) -} diff --git a/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-reads.json b/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-reads.json index e1adb4dd6cc0a..df5ea66e6d2a5 100644 --- a/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-reads.json +++ 
b/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-reads.json @@ -217,9 +217,9 @@ "group": "A", "mode": "normal" } - } - }, - "unit": "s" + }, + "unit": "s" + } }, "fill": 1, "id": 3, @@ -493,9 +493,9 @@ "group": "A", "mode": "normal" } - } - }, - "unit": "s" + }, + "unit": "s" + } }, "fill": 1, "id": 6, @@ -769,9 +769,9 @@ "group": "A", "mode": "normal" } - } - }, - "unit": "s" + }, + "unit": "s" + } }, "fill": 1, "id": 9, @@ -1045,9 +1045,9 @@ "group": "A", "mode": "normal" } - } - }, - "unit": "s" + }, + "unit": "s" + } }, "fill": 1, "id": 15, diff --git a/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-writes.json b/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-writes.json index 58107485d370c..8053d353b1135 100644 --- a/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-writes.json +++ b/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-writes.json @@ -66,7 +66,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{namespace=\"$namespace\",job=~\".+-distributor-http\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{status}}", @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (namespace_job_route:loki_request_duration_seconds_bucket:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"loki_api_v1_push\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (namespace_job_route:loki_request_duration_seconds_bucket:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (namespace_job_route:loki_request_duration_seconds_bucket:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"loki_api_v1_push\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (namespace_job_route:loki_request_duration_seconds_bucket:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(namespace_job_route:loki_request_duration_seconds_sum:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"loki_api_v1_push\"}) / sum(namespace_job_route:loki_request_duration_seconds_count:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"loki_api_v1_push\"})", + "expr": "1e3 * 
sum(namespace_job_route:loki_request_duration_seconds_sum:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}) / sum(namespace_job_route:loki_request_duration_seconds_count:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", @@ -246,7 +246,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\",}[$__rate_interval])) / sum(rate(loki_distributor_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\",}[$__rate_interval]))", + "expr": "sum (rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",}[$__rate_interval])) / sum(rate(loki_distributor_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "bytes", @@ -322,7 +322,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (tenant) (rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\",}[$__rate_interval])) / ignoring(tenant) group_left sum(rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\",}[$__rate_interval]))", + "expr": "sum by (tenant) (rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",}[$__rate_interval])) / ignoring(tenant) group_left sum(rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", diff --git a/operator/internal/manifests/openshift/rbac.go b/operator/internal/manifests/openshift/rbac.go index ebd464274c437..46e5837a2c262 100644 --- a/operator/internal/manifests/openshift/rbac.go +++ b/operator/internal/manifests/openshift/rbac.go @@ -108,6 +108,17 @@ func BuildRulerClusterRole(opts Options) *rbacv1.ClusterRole { "create", }, }, + { + APIGroups: []string{ + "monitoring.coreos.com", + }, + Resources: []string{ + "alertmanagers/api", + }, + Verbs: []string{ + "create", + }, + }, }, } } diff --git a/operator/jsonnet/config.libsonnet b/operator/jsonnet/config.libsonnet index efdc1c6103d5c..82dc625da2a4f 100644 --- a/operator/jsonnet/config.libsonnet +++ b/operator/jsonnet/config.libsonnet @@ -238,7 +238,6 @@ local utils = (import 'github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonn distributor:: [ utils.selector.eq('namespace', '$namespace'), utils.selector.re('job', '.+-distributor-http'), - utils.selector.eq('route', 'loki_api_v1_push'), ], ingester:: [ utils.selector.eq('namespace', '$namespace'), diff --git a/operator/jsonnet/jsonnetfile.json b/operator/jsonnet/jsonnetfile.json index 4b25fb159b3d8..2bc2549a3c600 100644 --- a/operator/jsonnet/jsonnetfile.json +++ b/operator/jsonnet/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "production/loki-mixin" } }, - "version": "bd505f8e2d37172ff35a89f4ac42efec9566a263" + "version": "0694d797dec010393567704211638219c1971b46" } ], "legacyImports": true diff --git 
a/operator/jsonnet/jsonnetfile.lock.json b/operator/jsonnet/jsonnetfile.lock.json index 27d2e6e8756c6..3a0710db7565f 100644 --- a/operator/jsonnet/jsonnetfile.lock.json +++ b/operator/jsonnet/jsonnetfile.lock.json @@ -38,8 +38,8 @@ "subdir": "production/loki-mixin" } }, - "version": "bd505f8e2d37172ff35a89f4ac42efec9566a263", - "sum": "yiXXBAcWfMkYSJthU2OZSgHHmveWvmRT6aM1V0MaAjs=" + "version": "0694d797dec010393567704211638219c1971b46", + "sum": "Pw/9T/ZRjXLqTivU5xkJnrP5kFdET2FDUjjG1G96GmQ=" }, { "source": { diff --git a/pkg/bloomcompactor/bloomcompactor.go b/pkg/bloomcompactor/bloomcompactor.go index 40e4646c7044c..dbe307ff18822 100644 --- a/pkg/bloomcompactor/bloomcompactor.go +++ b/pkg/bloomcompactor/bloomcompactor.go @@ -51,6 +51,7 @@ import ( "github.com/grafana/loki/pkg/storage" v1 "github.com/grafana/loki/pkg/storage/bloom/v1" chunk_client "github.com/grafana/loki/pkg/storage/chunk/client" + "github.com/grafana/loki/pkg/storage/chunk/client/local" "github.com/grafana/loki/pkg/storage/config" "github.com/grafana/loki/pkg/storage/stores/shipper/bloomshipper" "github.com/grafana/loki/pkg/storage/stores/shipper/indexshipper" @@ -166,10 +167,18 @@ func New( return nil, errors.Wrap(err, "create index shipper") } + // The ObjectClient does not expose the key encoder it uses, + // so check the concrete type and set the FSEncoder if needed. + var keyEncoder chunk_client.KeyEncoder + switch objectClient.(type) { + case *local.FSObjectClient: + keyEncoder = chunk_client.FSEncoder + } + c.storeClients[periodicConfig.From] = storeClient{ object: objectClient, index: index_storage.NewIndexStorageClient(objectClient, periodicConfig.IndexTables.PathPrefix), - chunk: chunk_client.NewClient(objectClient, nil, schemaConfig), + chunk: chunk_client.NewClient(objectClient, keyEncoder, schemaConfig), indexShipper: indexShipper, } } @@ -275,7 +284,7 @@ func (c *Compactor) compactTable(ctx context.Context, logger log.Logger, tableNa return fmt.Errorf("index store client not found for period starting at %s", schemaCfg.From.String()) } - _, tenants, err := sc.index.ListFiles(ctx, tableName, false) + _, tenants, err := sc.index.ListFiles(ctx, tableName, true) if err != nil { return fmt.Errorf("failed to list files for table %s: %w", tableName, err) } @@ -497,13 +506,12 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, localDst := createLocalDirName(c.cfg.WorkingDirectory, job) blockOptions := v1.NewBlockOptions(bt.GetNGramLength(), bt.GetNGramSkip()) - // TODO(poyzannur) enable once debugging is over - //defer func() { - // //clean up the bloom directory - // if err := os.RemoveAll(localDst); err != nil { - // level.Error(logger).Log("msg", "failed to remove block directory", "dir", localDst, "err", err) - // } - //}() + defer func() { + //clean up the bloom directory + if err := os.RemoveAll(localDst); err != nil { + level.Error(logger).Log("msg", "failed to remove block directory", "dir", localDst, "err", err) + } + }() var resultingBlock bloomshipper.Block defer func() { @@ -527,8 +535,7 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, return err } - fpRate := c.limits.BloomFalsePositiveRate(job.tenantID) - resultingBlock, err = compactNewChunks(ctx, logger, job, fpRate, bt, storeClient.chunk, builder) + resultingBlock, err = compactNewChunks(ctx, logger, job, bt, storeClient.chunk, builder, c.limits) if err != nil { return level.Error(logger).Log("msg", "failed compacting new chunks", "err", err) } @@ -537,7 +544,7 @@ func (c *Compactor) 
runCompact(ctx context.Context, logger log.Logger, job Job, // When already compacted metas exists, we need to merge all blocks with amending blooms with new series level.Info(logger).Log("msg", "already compacted metas exists, use mergeBlockBuilder") - var populate = createPopulateFunc(ctx, logger, job, storeClient, bt) + var populate = createPopulateFunc(ctx, job, storeClient, bt, c.limits) seriesIter := makeSeriesIterFromSeriesMeta(job) @@ -551,6 +558,7 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, }() if err != nil { + level.Error(logger).Log("err", err) return err } @@ -565,6 +573,7 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, level.Error(logger).Log("msg", "failed merging existing blocks with new chunks", "err", err) return err } + } archivePath := filepath.Join(c.cfg.WorkingDirectory, uuid.New().String()) @@ -575,13 +584,12 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, return err } - // TODO(poyzannur) enable once debugging is over - //defer func() { - // err = os.Remove(archivePath) - // if err != nil { - // level.Error(logger).Log("msg", "failed removing archive file", "err", err, "file", archivePath) - // } - //}() + defer func() { + err = os.Remove(archivePath) + if err != nil { + level.Error(logger).Log("msg", "failed removing archive file", "err", err, "file", archivePath) + } + }() // Do not change the signature of PutBlocks yet. // Once block size is limited potentially, compactNewChunks will return multiple blocks, hence a list is appropriate. diff --git a/pkg/bloomcompactor/chunkcompactor.go b/pkg/bloomcompactor/chunkcompactor.go index f0f59882b31c4..c4993ccc62a59 100644 --- a/pkg/bloomcompactor/chunkcompactor.go +++ b/pkg/bloomcompactor/chunkcompactor.go @@ -22,7 +22,7 @@ import ( ) type compactorTokenizer interface { - PopulateSeriesWithBloom(bloom *v1.SeriesWithBloom, chunks []chunk.Chunk) error + PopulateSeriesWithBloom(bloom *v1.SeriesWithBloom, chunkBatchesIterator v1.Iterator[[]chunk.Chunk]) error } type chunkClient interface { @@ -86,7 +86,7 @@ func makeChunkRefs(chksMetas []tsdbindex.ChunkMeta, tenant string, fp model.Fing return chunkRefs } -func buildBloomFromSeries(seriesMeta seriesMeta, fpRate float64, tokenizer compactorTokenizer, chunks []chunk.Chunk) (v1.SeriesWithBloom, error) { +func buildBloomFromSeries(seriesMeta seriesMeta, fpRate float64, tokenizer compactorTokenizer, chunks v1.Iterator[[]chunk.Chunk]) (v1.SeriesWithBloom, error) { // Create a bloom for this series bloomForChks := v1.SeriesWithBloom{ Series: &v1.Series{ @@ -155,21 +155,20 @@ func createLocalDirName(workingDir string, job Job) string { } // Compacts given list of chunks, uploads them to storage and returns a list of bloomBlocks -func compactNewChunks( - ctx context.Context, +func compactNewChunks(ctx context.Context, logger log.Logger, job Job, - fpRate float64, bt compactorTokenizer, storeClient chunkClient, builder blockBuilder, + limits Limits, ) (bloomshipper.Block, error) { // Ensure the context has not been canceled (ie. compactor shutdown has been triggered). 
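// compactNewChunks now receives the per-tenant Limits instead of a pre-resolved fpRate,
// so the false-positive rate and the chunk download batch size are looked up per tenant
// where they are used. The sketch below shows that shape with a pared-down Limits
// interface; the two method names appear elsewhere in this change, but the struct and
// wiring here are illustrative, not Loki's actual configuration code.

package main

import "fmt"

// Limits is a pared-down sketch of the per-tenant limits lookup threaded
// through the compactor in this change; only two of its methods are shown.
type Limits interface {
	BloomFalsePositiveRate(tenantID string) float64
	BloomCompactorChunksBatchSize(tenantID string) int
}

// staticLimits is an illustrative implementation returning fixed values.
type staticLimits struct {
	fpRate    float64
	batchSize int
}

func (l staticLimits) BloomFalsePositiveRate(string) float64    { return l.fpRate }
func (l staticLimits) BloomCompactorChunksBatchSize(string) int { return l.batchSize }

// describeJob resolves tenant-scoped settings at the point of use instead of
// forcing every caller to pre-resolve and pass raw values around.
func describeJob(tenantID string, limits Limits) string {
	return fmt.Sprintf("tenant=%s fpRate=%g batch=%d",
		tenantID, limits.BloomFalsePositiveRate(tenantID), limits.BloomCompactorChunksBatchSize(tenantID))
}

func main() {
	fmt.Println(describeJob("tenant-a", staticLimits{fpRate: 0.01, batchSize: 100}))
}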
if err := ctx.Err(); err != nil { return bloomshipper.Block{}, err } - bloomIter := newLazyBloomBuilder(ctx, job, storeClient, bt, fpRate, logger) + bloomIter := newLazyBloomBuilder(ctx, job, storeClient, bt, logger, limits) // Build and upload bloomBlock to storage block, err := buildBlockFromBlooms(ctx, logger, builder, bloomIter, job) @@ -182,13 +181,14 @@ func compactNewChunks( } type lazyBloomBuilder struct { - ctx context.Context - metas v1.Iterator[seriesMeta] - tenant string - client chunkClient - bt compactorTokenizer - fpRate float64 - logger log.Logger + ctx context.Context + metas v1.Iterator[seriesMeta] + tenant string + client chunkClient + bt compactorTokenizer + fpRate float64 + logger log.Logger + chunksBatchSize int cur v1.SeriesWithBloom // retured by At() err error // returned by Err() @@ -198,15 +198,16 @@ type lazyBloomBuilder struct { // which are used by the blockBuilder to write a bloom block. // We use an interator to avoid loading all blooms into memory first, before // building the block. -func newLazyBloomBuilder(ctx context.Context, job Job, client chunkClient, bt compactorTokenizer, fpRate float64, logger log.Logger) *lazyBloomBuilder { +func newLazyBloomBuilder(ctx context.Context, job Job, client chunkClient, bt compactorTokenizer, logger log.Logger, limits Limits) *lazyBloomBuilder { return &lazyBloomBuilder{ - ctx: ctx, - metas: v1.NewSliceIter(job.seriesMetas), - client: client, - tenant: job.tenantID, - bt: bt, - fpRate: fpRate, - logger: logger, + ctx: ctx, + metas: v1.NewSliceIter(job.seriesMetas), + client: client, + tenant: job.tenantID, + bt: bt, + fpRate: limits.BloomFalsePositiveRate(job.tenantID), + logger: logger, + chunksBatchSize: limits.BloomCompactorChunksBatchSize(job.tenantID), } } @@ -218,20 +219,18 @@ func (it *lazyBloomBuilder) Next() bool { } meta := it.metas.At() - // Get chunks data from list of chunkRefs - chks, err := it.client.GetChunks(it.ctx, makeChunkRefs(meta.chunkRefs, it.tenant, meta.seriesFP)) + batchesIterator, err := newChunkBatchesIterator(it.ctx, it.client, makeChunkRefs(meta.chunkRefs, it.tenant, meta.seriesFP), it.chunksBatchSize) if err != nil { it.err = err it.cur = v1.SeriesWithBloom{} - level.Debug(it.logger).Log("err in getChunks", err) + level.Debug(it.logger).Log("msg", "err creating chunks batches iterator", "err", err) return false } - - it.cur, err = buildBloomFromSeries(meta, it.fpRate, it.bt, chks) + it.cur, err = buildBloomFromSeries(meta, it.fpRate, it.bt, batchesIterator) if err != nil { it.err = err it.cur = v1.SeriesWithBloom{} - level.Debug(it.logger).Log("err in buildBloomFromSeries", err) + level.Debug(it.logger).Log("msg", "err in buildBloomFromSeries", "err", err) return false } return true diff --git a/pkg/bloomcompactor/chunkcompactor_test.go b/pkg/bloomcompactor/chunkcompactor_test.go index 2d31e05f18f83..8bc94fd26537a 100644 --- a/pkg/bloomcompactor/chunkcompactor_test.go +++ b/pkg/bloomcompactor/chunkcompactor_test.go @@ -59,7 +59,7 @@ func TestChunkCompactor_BuildBloomFromSeries(t *testing.T) { chunks := []chunk.Chunk{createTestChunk(fp, label)} mbt := mockBloomTokenizer{} - bloom, err := buildBloomFromSeries(seriesMeta, fpRate, &mbt, chunks) + bloom, err := buildBloomFromSeries(seriesMeta, fpRate, &mbt, v1.NewSliceIter([][]chunk.Chunk{chunks})) require.NoError(t, err) require.Equal(t, seriesMeta.seriesFP, bloom.Series.Fingerprint) require.Equal(t, chunks, mbt.chunks) @@ -110,7 +110,7 @@ func TestChunkCompactor_CompactNewChunks(t *testing.T) { pbb := mockPersistentBlockBuilder{} // Run 
Compaction - compactedBlock, err := compactNewChunks(context.Background(), logger, job, fpRate, &mbt, &mcc, &pbb) + compactedBlock, err := compactNewChunks(context.Background(), logger, job, &mbt, &mcc, &pbb, mockLimits{fpRate: fpRate}) // Validate Compaction Succeeds require.NoError(t, err) @@ -169,7 +169,7 @@ func TestLazyBloomBuilder(t *testing.T) { mbt := &mockBloomTokenizer{} mcc := &mockChunkClient{} - it := newLazyBloomBuilder(context.Background(), job, mcc, mbt, fpRate, logger) + it := newLazyBloomBuilder(context.Background(), job, mcc, mbt, logger, mockLimits{chunksDownloadingBatchSize: 10, fpRate: fpRate}) // first seriesMeta has 1 chunks require.True(t, it.Next()) @@ -199,8 +199,10 @@ type mockBloomTokenizer struct { chunks []chunk.Chunk } -func (mbt *mockBloomTokenizer) PopulateSeriesWithBloom(_ *v1.SeriesWithBloom, c []chunk.Chunk) error { - mbt.chunks = append(mbt.chunks, c...) +func (mbt *mockBloomTokenizer) PopulateSeriesWithBloom(_ *v1.SeriesWithBloom, c v1.Iterator[[]chunk.Chunk]) error { + for c.Next() { + mbt.chunks = append(mbt.chunks, c.At()...) + } return nil } diff --git a/pkg/bloomcompactor/chunksbatchesiterator.go b/pkg/bloomcompactor/chunksbatchesiterator.go new file mode 100644 index 0000000000000..a4494b02b7e47 --- /dev/null +++ b/pkg/bloomcompactor/chunksbatchesiterator.go @@ -0,0 +1,48 @@ +package bloomcompactor + +import ( + "context" + "errors" + + "github.com/grafana/loki/pkg/storage/chunk" +) + +type chunksBatchesIterator struct { + context context.Context + client chunkClient + chunksToDownload []chunk.Chunk + batchSize int + + currentBatch []chunk.Chunk + err error +} + +func newChunkBatchesIterator(context context.Context, client chunkClient, chunksToDownload []chunk.Chunk, batchSize int) (*chunksBatchesIterator, error) { + if batchSize <= 0 { + return nil, errors.New("batchSize must be greater than 0") + } + return &chunksBatchesIterator{context: context, client: client, chunksToDownload: chunksToDownload, batchSize: batchSize}, nil +} + +func (c *chunksBatchesIterator) Next() bool { + if len(c.chunksToDownload) == 0 { + return false + } + batchSize := c.batchSize + chunksToDownloadCount := len(c.chunksToDownload) + if chunksToDownloadCount < batchSize { + batchSize = chunksToDownloadCount + } + chunksToDownload := c.chunksToDownload[:batchSize] + c.chunksToDownload = c.chunksToDownload[batchSize:] + c.currentBatch, c.err = c.client.GetChunks(c.context, chunksToDownload) + return c.err == nil +} + +func (c *chunksBatchesIterator) Err() error { + return c.err +} + +func (c *chunksBatchesIterator) At() []chunk.Chunk { + return c.currentBatch +} diff --git a/pkg/bloomcompactor/chunksbatchesiterator_test.go b/pkg/bloomcompactor/chunksbatchesiterator_test.go new file mode 100644 index 0000000000000..170f2662b508b --- /dev/null +++ b/pkg/bloomcompactor/chunksbatchesiterator_test.go @@ -0,0 +1,96 @@ +package bloomcompactor + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/grafana/loki/pkg/storage/chunk" + tsdbindex "github.com/grafana/loki/pkg/storage/stores/shipper/indexshipper/tsdb/index" +) + +func Test_chunksBatchesIterator(t *testing.T) { + tests := map[string]struct { + batchSize int + chunksToDownload []chunk.Chunk + constructorError error + + hadNextCount int + }{ + "expected error if batch size is set to 0": { + batchSize: 0, + constructorError: errors.New("batchSize must be greater than 0"), + }, + "expected no error if there are no chunks": { + hadNextCount: 0, + batchSize: 10, + }, + 
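// chunksBatchesIterator above walks the pending chunk refs in fixed-size batches: each
// Next() slices off up to batchSize refs, downloads them through the chunk client, and
// exposes the result via At(), with download failures surfaced through Err(). The
// self-contained sketch below mirrors that Next/At/Err shape using toy int "refs" and a
// stand-in fetch function instead of the Loki chunk client.

package main

import (
	"errors"
	"fmt"
)

// batchIterator fetches items in fixed-size batches; the fetch function is a
// stand-in for the chunk client call made per batch.
type batchIterator struct {
	pending   []int
	batchSize int
	fetch     func([]int) ([]int, error)

	current []int
	err     error
}

func newBatchIterator(pending []int, batchSize int, fetch func([]int) ([]int, error)) (*batchIterator, error) {
	if batchSize <= 0 {
		return nil, errors.New("batchSize must be greater than 0")
	}
	return &batchIterator{pending: pending, batchSize: batchSize, fetch: fetch}, nil
}

// Next downloads the next batch and reports whether one is available;
// a fetch error stops iteration and is reported through Err.
func (it *batchIterator) Next() bool {
	if len(it.pending) == 0 {
		return false
	}
	n := it.batchSize
	if len(it.pending) < n {
		n = len(it.pending)
	}
	batch := it.pending[:n]
	it.pending = it.pending[n:]
	it.current, it.err = it.fetch(batch)
	return it.err == nil
}

func (it *batchIterator) At() []int  { return it.current }
func (it *batchIterator) Err() error { return it.err }

func main() {
	refs := []int{1, 2, 3, 4, 5, 6, 7}
	it, err := newBatchIterator(refs, 3, func(b []int) ([]int, error) { return b, nil })
	if err != nil {
		panic(err)
	}
	for it.Next() {
		fmt.Println("downloaded batch:", it.At())
	}
	if it.Err() != nil {
		fmt.Println("fetch failed:", it.Err())
	}
}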
"expected 1 call to the client": { + chunksToDownload: createFakeChunks(10), + hadNextCount: 1, + batchSize: 20, + }, + "expected 1 call to the client(2)": { + chunksToDownload: createFakeChunks(10), + hadNextCount: 1, + batchSize: 10, + }, + "expected 2 calls to the client": { + chunksToDownload: createFakeChunks(10), + hadNextCount: 2, + batchSize: 6, + }, + "expected 10 calls to the client": { + chunksToDownload: createFakeChunks(10), + hadNextCount: 10, + batchSize: 1, + }, + } + for name, data := range tests { + t.Run(name, func(t *testing.T) { + client := &fakeClient{} + iterator, err := newChunkBatchesIterator(context.Background(), client, data.chunksToDownload, data.batchSize) + if data.constructorError != nil { + require.Equal(t, err, data.constructorError) + return + } + hadNextCount := 0 + var downloadedChunks []chunk.Chunk + for iterator.Next() { + hadNextCount++ + downloaded := iterator.At() + downloadedChunks = append(downloadedChunks, downloaded...) + require.LessOrEqual(t, len(downloaded), data.batchSize) + } + require.NoError(t, iterator.Err()) + require.Equal(t, data.chunksToDownload, downloadedChunks) + require.Equal(t, data.hadNextCount, client.callsCount) + require.Equal(t, data.hadNextCount, hadNextCount) + }) + } +} + +func createFakeChunks(count int) []chunk.Chunk { + metas := make([]tsdbindex.ChunkMeta, 0, count) + for i := 0; i < count; i++ { + metas = append(metas, tsdbindex.ChunkMeta{ + Checksum: uint32(i), + MinTime: int64(i), + MaxTime: int64(i + 100), + KB: uint32(i * 100), + Entries: uint32(i * 10), + }) + } + return makeChunkRefs(metas, "fake", 0xFFFF) +} + +type fakeClient struct { + callsCount int +} + +func (f *fakeClient) GetChunks(_ context.Context, chunks []chunk.Chunk) ([]chunk.Chunk, error) { + f.callsCount++ + return chunks, nil +} diff --git a/pkg/bloomcompactor/config.go b/pkg/bloomcompactor/config.go index c3969ac6af386..884034fdd043d 100644 --- a/pkg/bloomcompactor/config.go +++ b/pkg/bloomcompactor/config.go @@ -41,6 +41,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { type Limits interface { downloads.Limits BloomCompactorShardSize(tenantID string) int + BloomCompactorChunksBatchSize(userID string) int BloomCompactorMaxTableAge(tenantID string) time.Duration BloomCompactorEnabled(tenantID string) bool BloomNGramLength(tenantID string) int diff --git a/pkg/bloomcompactor/mergecompactor.go b/pkg/bloomcompactor/mergecompactor.go index 0cf55cef86a7c..6e2143f75135c 100644 --- a/pkg/bloomcompactor/mergecompactor.go +++ b/pkg/bloomcompactor/mergecompactor.go @@ -2,6 +2,7 @@ package bloomcompactor import ( "context" + "fmt" "github.com/grafana/dskit/concurrency" @@ -74,7 +75,7 @@ func makeBlockIterFromBlocks(ctx context.Context, logger log.Logger, return blockIters, blockPaths, nil } -func createPopulateFunc(ctx context.Context, logger log.Logger, job Job, storeClient storeClient, bt *v1.BloomTokenizer) func(series *v1.Series, bloom *v1.Bloom) error { +func createPopulateFunc(ctx context.Context, job Job, storeClient storeClient, bt *v1.BloomTokenizer, limits Limits) func(series *v1.Series, bloom *v1.Bloom) error { return func(series *v1.Series, bloom *v1.Bloom) error { bloomForChks := v1.SeriesWithBloom{ Series: series, @@ -95,12 +96,11 @@ func createPopulateFunc(ctx context.Context, logger log.Logger, job Job, storeCl } } - chks, err := storeClient.chunk.GetChunks(ctx, chunkRefs) + batchesIterator, err := newChunkBatchesIterator(ctx, storeClient.chunk, chunkRefs, limits.BloomCompactorChunksBatchSize(job.tenantID)) if err != nil { - 
level.Error(logger).Log("msg", "failed downloading chunks", "err", err) - return err + return fmt.Errorf("error creating chunks batches iterator: %w", err) } - err = bt.PopulateSeriesWithBloom(&bloomForChks, chks) + err = bt.PopulateSeriesWithBloom(&bloomForChks, batchesIterator) if err != nil { return err } diff --git a/pkg/bloomcompactor/sharding_test.go b/pkg/bloomcompactor/sharding_test.go index fc77536f6061f..4e79752279fb9 100644 --- a/pkg/bloomcompactor/sharding_test.go +++ b/pkg/bloomcompactor/sharding_test.go @@ -128,9 +128,22 @@ func TestShuffleSharding(t *testing.T) { type mockLimits struct { *validation.Overrides - bloomCompactorShardSize int + bloomCompactorShardSize int + chunksDownloadingBatchSize int + fpRate float64 +} + +func (m mockLimits) BloomFalsePositiveRate(_ string) float64 { + return m.fpRate } func (m mockLimits) BloomCompactorShardSize(_ string) int { return m.bloomCompactorShardSize } + +func (m mockLimits) BloomCompactorChunksBatchSize(_ string) int { + if m.chunksDownloadingBatchSize != 0 { + return m.chunksDownloadingBatchSize + } + return 1 +} diff --git a/pkg/bloomgateway/bloomgateway.go b/pkg/bloomgateway/bloomgateway.go index 403378e016a9f..b0c3251a0843d 100644 --- a/pkg/bloomgateway/bloomgateway.go +++ b/pkg/bloomgateway/bloomgateway.go @@ -180,9 +180,8 @@ func New(cfg Config, schemaCfg config.SchemaConfig, storageCfg storage.Config, o sharding: shardingStrategy, pendingTasks: makePendingTasks(pendingTasksInitialCap), workerConfig: workerConfig{ - maxWaitTime: 200 * time.Millisecond, - maxItems: 100, - processBlocksSequentially: false, + maxWaitTime: 200 * time.Millisecond, + maxItems: 100, }, workerMetrics: newWorkerMetrics(reg, constants.Loki, metricsSubsystem), queueMetrics: queue.NewMetrics(reg, constants.Loki, metricsSubsystem), @@ -323,7 +322,7 @@ func (g *Gateway) FilterChunkRefs(ctx context.Context, req *logproto.FilterChunk case res := <-resCh: responses = append(responses, res) // log line is helpful for debugging tests - // level.Debug(g.logger).Log("msg", "got partial result", "task", task.ID, "tenant", tenantID, "fp", uint64(res.Fp), "chunks", res.Removals.Len(), "progress", fmt.Sprintf("%d/%d", len(responses), requestCount)) + level.Debug(g.logger).Log("msg", "got partial result", "task", task.ID, "tenant", tenantID, "fp_int", uint64(res.Fp), "fp_hex", res.Fp, "chunks_to_remove", res.Removals.Len(), "progress", fmt.Sprintf("%d/%d", len(responses), requestCount)) // wait for all parts of the full response if len(responses) == requestCount { for _, o := range responses { diff --git a/pkg/bloomgateway/bloomgateway_test.go b/pkg/bloomgateway/bloomgateway_test.go index b34e3d55852a5..183a2aad2190e 100644 --- a/pkg/bloomgateway/bloomgateway_test.go +++ b/pkg/bloomgateway/bloomgateway_test.go @@ -269,89 +269,74 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) { }) t.Run("use fuse queriers to filter chunks", func(t *testing.T) { - for _, tc := range []struct { - name string - value bool - }{ - {"sequentially", true}, - {"callback", false}, - } { - t.Run(tc.name, func(t *testing.T) { - - reg := prometheus.NewRegistry() - gw, err := New(cfg, schemaCfg, storageCfg, limits, ss, cm, logger, reg) - require.NoError(t, err) - - now := mktime("2023-10-03 10:00") - - // replace store implementation and re-initialize workers and sub-services - bqs, data := createBlockQueriers(t, 5, now.Add(-8*time.Hour), now, 0, 1024) - gw.bloomStore = newMockBloomStore(bqs) - gw.workerConfig.processBlocksSequentially = tc.value - err = gw.initServices() - 
require.NoError(t, err) - - t.Log("process blocks in worker sequentially", gw.workerConfig.processBlocksSequentially) - - err = services.StartAndAwaitRunning(context.Background(), gw) - require.NoError(t, err) - t.Cleanup(func() { - err = services.StopAndAwaitTerminated(context.Background(), gw) - require.NoError(t, err) - }) + reg := prometheus.NewRegistry() + gw, err := New(cfg, schemaCfg, storageCfg, limits, ss, cm, logger, reg) + require.NoError(t, err) - chunkRefs := createQueryInputFromBlockData(t, tenantID, data, 100) - - t.Run("no match - return empty response", func(t *testing.T) { - inputChunkRefs := groupRefs(t, chunkRefs) - req := &logproto.FilterChunkRefRequest{ - From: now.Add(-8 * time.Hour), - Through: now, - Refs: inputChunkRefs, - Filters: []syntax.LineFilter{ - {Ty: labels.MatchEqual, Match: "does not match"}, - }, - } - ctx := user.InjectOrgID(context.Background(), tenantID) - res, err := gw.FilterChunkRefs(ctx, req) - require.NoError(t, err) - - expectedResponse := &logproto.FilterChunkRefResponse{ - ChunkRefs: []*logproto.GroupedChunkRefs{}, - } - require.Equal(t, expectedResponse, res) - }) + now := mktime("2023-10-03 10:00") - t.Run("match - return filtered", func(t *testing.T) { - inputChunkRefs := groupRefs(t, chunkRefs) - // hack to get indexed key for a specific series - // the indexed key range for a series is defined as - // i * keysPerSeries ... i * keysPerSeries + keysPerSeries - 1 - // where i is the nth series in a block - // fortunately, i is also used as Checksum for the single chunk of a series - // see mkBasicSeriesWithBlooms() in pkg/storage/bloom/v1/test_util.go - key := inputChunkRefs[0].Refs[0].Checksum*1000 + 500 - - req := &logproto.FilterChunkRefRequest{ - From: now.Add(-8 * time.Hour), - Through: now, - Refs: inputChunkRefs, - Filters: []syntax.LineFilter{ - {Ty: labels.MatchEqual, Match: fmt.Sprintf("series %d", key)}, - }, - } - ctx := user.InjectOrgID(context.Background(), tenantID) - res, err := gw.FilterChunkRefs(ctx, req) - require.NoError(t, err) - - expectedResponse := &logproto.FilterChunkRefResponse{ - ChunkRefs: inputChunkRefs[:1], - } - require.Equal(t, expectedResponse, res) - }) + // replace store implementation and re-initialize workers and sub-services + bqs, data := createBlockQueriers(t, 5, now.Add(-8*time.Hour), now, 0, 1024) + gw.bloomStore = newMockBloomStore(bqs) + err = gw.initServices() + require.NoError(t, err) - }) - } + err = services.StartAndAwaitRunning(context.Background(), gw) + require.NoError(t, err) + t.Cleanup(func() { + err = services.StopAndAwaitTerminated(context.Background(), gw) + require.NoError(t, err) + }) + + chunkRefs := createQueryInputFromBlockData(t, tenantID, data, 100) + + t.Run("no match - return empty response", func(t *testing.T) { + inputChunkRefs := groupRefs(t, chunkRefs) + req := &logproto.FilterChunkRefRequest{ + From: now.Add(-8 * time.Hour), + Through: now, + Refs: inputChunkRefs, + Filters: []syntax.LineFilter{ + {Ty: labels.MatchEqual, Match: "does not match"}, + }, + } + ctx := user.InjectOrgID(context.Background(), tenantID) + res, err := gw.FilterChunkRefs(ctx, req) + require.NoError(t, err) + + expectedResponse := &logproto.FilterChunkRefResponse{ + ChunkRefs: []*logproto.GroupedChunkRefs{}, + } + require.Equal(t, expectedResponse, res) + }) + + t.Run("match - return filtered", func(t *testing.T) { + inputChunkRefs := groupRefs(t, chunkRefs) + // hack to get indexed key for a specific series + // the indexed key range for a series is defined as + // i * keysPerSeries ... 
i * keysPerSeries + keysPerSeries - 1 + // where i is the nth series in a block + // fortunately, i is also used as Checksum for the single chunk of a series + // see mkBasicSeriesWithBlooms() in pkg/storage/bloom/v1/test_util.go + key := inputChunkRefs[0].Refs[0].Checksum*1000 + 500 + + req := &logproto.FilterChunkRefRequest{ + From: now.Add(-8 * time.Hour), + Through: now, + Refs: inputChunkRefs, + Filters: []syntax.LineFilter{ + {Ty: labels.MatchEqual, Match: fmt.Sprintf("series %d", key)}, + }, + } + ctx := user.InjectOrgID(context.Background(), tenantID) + res, err := gw.FilterChunkRefs(ctx, req) + require.NoError(t, err) + + expectedResponse := &logproto.FilterChunkRefResponse{ + ChunkRefs: inputChunkRefs[:1], + } + require.Equal(t, expectedResponse, res) + }) }) } diff --git a/pkg/bloomgateway/worker.go b/pkg/bloomgateway/worker.go index e82a0daea63c1..a8f9c56d50bab 100644 --- a/pkg/bloomgateway/worker.go +++ b/pkg/bloomgateway/worker.go @@ -20,8 +20,6 @@ import ( type workerConfig struct { maxWaitTime time.Duration maxItems int - - processBlocksSequentially bool } type workerMetrics struct { @@ -188,11 +186,7 @@ func (w *worker) running(ctx context.Context) error { blockRefs = append(blockRefs, b.blockRef) } - if w.cfg.processBlocksSequentially { - err = w.processBlocksSequentially(taskCtx, tasks[0].Tenant, day, blockRefs, boundedRefs) - } else { - err = w.processBlocksWithCallback(taskCtx, tasks[0].Tenant, day, blockRefs, boundedRefs) - } + err = w.processBlocksWithCallback(taskCtx, tasks[0].Tenant, day, blockRefs, boundedRefs) if err != nil { for _, t := range tasks { t.ErrCh <- err @@ -227,20 +221,6 @@ func (w *worker) processBlocksWithCallback(taskCtx context.Context, tenant strin }) } -func (w *worker) processBlocksSequentially(taskCtx context.Context, tenant string, day time.Time, blockRefs []bloomshipper.BlockRef, boundedRefs []boundedTasks) error { - storeFetchStart := time.Now() - blockQueriers, err := w.store.GetBlockQueriersForBlockRefs(taskCtx, tenant, blockRefs) - w.metrics.storeAccessLatency.WithLabelValues(w.id, "GetBlockQueriersForBlockRefs").Observe(time.Since(storeFetchStart).Seconds()) - if err != nil { - return err - } - - for i := range blockQueriers { - processBlock(blockQueriers[i].BlockQuerier, day, boundedRefs[i].tasks) - } - return nil -} - func processBlock(blockQuerier *v1.BlockQuerier, day time.Time, tasks []Task) { schema, err := blockQuerier.Schema() if err != nil { diff --git a/pkg/logql/log/parser.go b/pkg/logql/log/parser.go index be059a2831560..c03e7c91cb960 100644 --- a/pkg/logql/log/parser.go +++ b/pkg/logql/log/parser.go @@ -493,11 +493,13 @@ func (l *LogfmtExpressionParser) Process(_ int64, line []byte, lbs *LabelsBuilde return "", false } - if !lbs.ParserLabelHints().ShouldExtract(sanitized) { + _, alwaysExtract := keys[sanitized] + if !alwaysExtract && !lbs.ParserLabelHints().ShouldExtract(sanitized) { return "", false } return sanitized, true }) + if !ok { continue } @@ -530,6 +532,7 @@ func (l *LogfmtExpressionParser) Process(_ int64, line []byte, lbs *LabelsBuilde } } } + if l.strict && l.dec.Err() != nil { addErrLabel(errLogfmt, l.dec.Err(), lbs) return line, true diff --git a/pkg/logql/log/parser_hints.go b/pkg/logql/log/parser_hints.go index cdb61015dd4dd..a8b1f73f3109d 100644 --- a/pkg/logql/log/parser_hints.go +++ b/pkg/logql/log/parser_hints.go @@ -58,10 +58,6 @@ type Hints struct { } func (p *Hints) ShouldExtract(key string) bool { - if len(p.requiredLabels) == 0 { - return true - } - for _, l := range p.extracted { if l == key { 
return false @@ -74,7 +70,7 @@ func (p *Hints) ShouldExtract(key string) bool { } } - return false + return len(p.requiredLabels) == 0 } func (p *Hints) ShouldExtractPrefix(prefix string) bool { @@ -95,19 +91,25 @@ func (p *Hints) NoLabels() bool { } func (p *Hints) RecordExtracted(key string) { - for _, l := range p.requiredLabels { - if l == key { - p.extracted = append(p.extracted, key) - return - } - } + p.extracted = append(p.extracted, key) } func (p *Hints) AllRequiredExtracted() bool { - if len(p.requiredLabels) == 0 { + if len(p.requiredLabels) == 0 || len(p.extracted) < len(p.requiredLabels) { return false } - return len(p.extracted) == len(p.requiredLabels) + + found := 0 + for _, l := range p.requiredLabels { + for _, e := range p.extracted { + if l == e { + found++ + break + } + } + } + + return len(p.requiredLabels) == found } func (p *Hints) Reset() { @@ -172,9 +174,6 @@ func NewParserHint(requiredLabelNames, groups []string, without, noLabels bool, return ph } - ph.requiredLabels = hints - ph.shouldPreserveError = containsError(hints) - return &Hints{requiredLabels: hints, extracted: extracted, shouldPreserveError: containsError(hints)} } diff --git a/pkg/logql/log/parser_hints_test.go b/pkg/logql/log/parser_hints_test.go index ac232bfd871b4..42d0134bc1d8f 100644 --- a/pkg/logql/log/parser_hints_test.go +++ b/pkg/logql/log/parser_hints_test.go @@ -28,7 +28,10 @@ var ( "response": { "status": 204, "latency_seconds": "30.001" - } + }, + "message": { + "message": "foo", + } }`) packedLine = []byte(`{ @@ -58,14 +61,14 @@ func Test_ParserHints(t *testing.T) { jsonLine, true, 1.0, - `{app="nginx", cluster="us-central-west", cluster_extracted="us-east-west", protocol="HTTP/2.0", remote_user="foo", request_host="foo.grafana.net", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_latency_seconds="30.001", response_status="204", upstream_addr="10.0.0.1:80"}`, + `{app="nginx", cluster="us-central-west", cluster_extracted="us-east-west", message_message="foo", protocol="HTTP/2.0", remote_user="foo", request_host="foo.grafana.net", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_latency_seconds="30.001", response_status="204", upstream_addr="10.0.0.1:80"}`, }, { `sum without (request_host,app,cluster) (rate({app="nginx"} | json | __error__="" | response_status = 204 [1m]))`, jsonLine, true, 1.0, - `{cluster_extracted="us-east-west", protocol="HTTP/2.0", remote_user="foo", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_latency_seconds="30.001", response_status="204", upstream_addr="10.0.0.1:80"}`, + `{cluster_extracted="us-east-west", message_message="foo", protocol="HTTP/2.0", remote_user="foo", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_latency_seconds="30.001", response_status="204", upstream_addr="10.0.0.1:80"}`, }, { `sum by (request_host,app) (rate({app="nginx"} | json | __error__="" | response_status = 204 [1m]))`, @@ -114,14 +117,14 @@ func Test_ParserHints(t *testing.T) { jsonLine, true, 30.001, - `{app="nginx", cluster="us-central-west", cluster_extracted="us-east-west", protocol="HTTP/2.0", remote_user="foo", request_host="foo.grafana.net", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_status="204", upstream_addr="10.0.0.1:80"}`, + `{app="nginx", cluster="us-central-west", 
cluster_extracted="us-east-west", message_message="foo", protocol="HTTP/2.0", remote_user="foo", request_host="foo.grafana.net", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_status="204", upstream_addr="10.0.0.1:80"}`, }, { `sum without (request_host,app,cluster)(rate({app="nginx"} | json | response_status = 204 | unwrap response_latency_seconds [1m]))`, jsonLine, true, 30.001, - `{cluster_extracted="us-east-west", protocol="HTTP/2.0", remote_user="foo", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_status="204", upstream_addr="10.0.0.1:80"}`, + `{cluster_extracted="us-east-west", message_message="foo", protocol="HTTP/2.0", remote_user="foo", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_status="204", upstream_addr="10.0.0.1:80"}`, }, { `sum(rate({app="nginx"} | logfmt | org_id=3677 | unwrap Ingester_TotalReached[1m]))`, @@ -214,6 +217,13 @@ func Test_ParserHints(t *testing.T) { 0, ``, }, + { + `sum by (message_message,app)(count_over_time({app="nginx"} | json | response_status = 204 and remote_user = "foo"[1m]))`, + jsonLine, + true, + 1, + `{app="nginx", message_message="foo"}`, + }, } { tt := tt t.Run(tt.expr, func(t *testing.T) { diff --git a/pkg/logql/optimize.go b/pkg/logql/optimize.go index 2f9c80a64f918..9b885b0fd229c 100644 --- a/pkg/logql/optimize.go +++ b/pkg/logql/optimize.go @@ -8,7 +8,7 @@ func optimizeSampleExpr(expr syntax.SampleExpr) (syntax.SampleExpr, error) { // we skip sharding AST for now, it's not easy to clone them since they are not part of the language. expr.Walk(func(e syntax.Expr) { switch e.(type) { - case *ConcatSampleExpr, *DownstreamSampleExpr, *QuantileSketchEvalExpr, *QuantileSketchMergeExpr: + case *ConcatSampleExpr, DownstreamSampleExpr, *QuantileSketchEvalExpr, *QuantileSketchMergeExpr: skip = true return } diff --git a/pkg/logql/shardmapper.go b/pkg/logql/shardmapper.go index 4a06b5f804e84..e8d78a438c9bb 100644 --- a/pkg/logql/shardmapper.go +++ b/pkg/logql/shardmapper.go @@ -128,7 +128,7 @@ func (m ShardMapper) mapBinOpExpr(e *syntax.BinOpExpr, r *downstreamRecorder) (* if err != nil { return nil, 0, err } - if isNoOp(e.SampleExpr, rhsMapped) && !isLiteralOrVector(rhsMapped) { + if isNoOp(e.RHS, rhsMapped) && !isLiteralOrVector(rhsMapped) { // TODO: check if literal or vector rhsMapped = DownstreamSampleExpr{ shard: nil, diff --git a/pkg/logql/shardmapper_test.go b/pkg/logql/shardmapper_test.go index 7a02640c81491..96955109a9413 100644 --- a/pkg/logql/shardmapper_test.go +++ b/pkg/logql/shardmapper_test.go @@ -1446,16 +1446,84 @@ func TestMapping(t *testing.T) { }, }, }, + { + in: `quantile_over_time(0.99, {a="foo"} | unwrap bytes [1s]) by (a, b) > 1`, + expr: &syntax.BinOpExpr{ + SampleExpr: DownstreamSampleExpr{ + SampleExpr: &syntax.RangeAggregationExpr{ + Operation: syntax.OpRangeTypeQuantile, + Params: float64p(0.99), + Left: &syntax.LogRange{ + Left: &syntax.MatchersExpr{ + Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "a", "foo")}, + }, + Unwrap: &syntax.UnwrapExpr{ + Identifier: "bytes", + }, + Interval: 1 * time.Second, + }, + Grouping: &syntax.Grouping{ + Groups: []string{"a", "b"}, + }, + }, + }, + RHS: &syntax.LiteralExpr{ + Val: 1, + }, + Op: syntax.OpTypeGT, + Opts: &syntax.BinOpOptions{ + ReturnBool: false, + VectorMatching: &syntax.VectorMatching{}, + }, + }, + }, + { + in: `1 < quantile_over_time(0.99, {a="foo"} | unwrap bytes [1s]) by (a, b)`, + expr: 
&syntax.BinOpExpr{ + SampleExpr: &syntax.LiteralExpr{ + Val: 1, + }, + RHS: DownstreamSampleExpr{ + SampleExpr: &syntax.RangeAggregationExpr{ + Operation: syntax.OpRangeTypeQuantile, + Params: float64p(0.99), + Left: &syntax.LogRange{ + Left: &syntax.MatchersExpr{ + Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "a", "foo")}, + }, + Unwrap: &syntax.UnwrapExpr{ + Identifier: "bytes", + }, + Interval: 1 * time.Second, + }, + Grouping: &syntax.Grouping{ + Groups: []string{"a", "b"}, + }, + }, + }, + Op: syntax.OpTypeLT, + Opts: &syntax.BinOpOptions{ + ReturnBool: false, + VectorMatching: &syntax.VectorMatching{}, + }, + }, + }, } { t.Run(tc.in, func(t *testing.T) { ast, err := syntax.ParseExpr(tc.in) require.Equal(t, tc.err, err) mapped, _, err := m.Map(ast, nilShardMetrics.downstreamRecorder()) + switch e := mapped.(type) { + case syntax.SampleExpr: + optimized, err := optimizeSampleExpr(e) + require.NoError(t, err) + require.Equal(t, mapped.String(), optimized.String()) + } require.Equal(t, tc.err, err) - require.Equal(t, mapped.String(), tc.expr.String()) - require.Equal(t, mapped, tc.expr) + require.Equal(t, tc.expr.String(), mapped.String()) + require.Equal(t, tc.expr, mapped) }) } } diff --git a/pkg/loki/loki.go b/pkg/loki/loki.go index 0c4a50f813ed9..6ef0572d6bad0 100644 --- a/pkg/loki/loki.go +++ b/pkg/loki/loki.go @@ -101,7 +101,7 @@ type Config struct { Tracing tracing.Config `yaml:"tracing"` Analytics analytics.Config `yaml:"analytics"` - LegacyReadTarget bool `yaml:"legacy_read_target,omitempty" doc:"hidden"` + LegacyReadTarget bool `yaml:"legacy_read_target,omitempty" doc:"hidden|deprecated"` Common common.Config `yaml:"common,omitempty"` @@ -136,9 +136,8 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) { "It will, however, distort metrics, because it is counted as live memory. ", ) - //TODO(trevorwhitney): flip this to false with Loki 3.0 - f.BoolVar(&c.LegacyReadTarget, "legacy-read-mode", true, "Set to false to disable the legacy read mode and use new scalable mode with 3rd backend target. "+ - "The default will be flipped to false in the next Loki release.") + f.BoolVar(&c.LegacyReadTarget, "legacy-read-mode", false, "Deprecated. Set to true to enable the legacy read mode which includes the components from the backend target. "+ + "This setting is deprecated and will be removed in the next minor release.") f.DurationVar(&c.ShutdownDelay, "shutdown-delay", 0, "How long to wait between SIGTERM and shutdown. After receiving SIGTERM, Loki will report 503 Service Unavailable status via /ready endpoint.") @@ -647,7 +646,7 @@ func (t *Loki) setupModuleManager() error { Read: {QueryFrontend, Querier}, Write: {Ingester, Distributor}, - Backend: {QueryScheduler, Ruler, Compactor, IndexGateway}, + Backend: {QueryScheduler, Ruler, Compactor, IndexGateway, BloomGateway, BloomCompactor}, All: {QueryScheduler, QueryFrontend, Querier, Ingester, Distributor, Ruler, Compactor}, } @@ -694,13 +693,12 @@ func (t *Loki) setupModuleManager() error { } // Add bloom gateway ring in client mode to IndexGateway service dependencies if bloom filtering is enabled. - if t.Cfg.isModuleEnabled(IndexGateway) && t.Cfg.BloomGateway.Enabled { + if t.Cfg.BloomGateway.Enabled { deps[IndexGateway] = append(deps[IndexGateway], BloomGatewayRing) } - //TODO(poyzannur) not sure this is needed for BloomCompactor if t.Cfg.LegacyReadTarget { - deps[Read] = append(deps[Read], QueryScheduler, Ruler, Compactor, IndexGateway, BloomGateway, BloomCompactor) + deps[Read] = append(deps[Read], deps[Backend]...) 
} if t.Cfg.InternalServer.Enable { diff --git a/pkg/loki/modules.go b/pkg/loki/modules.go index 1342a105f34b6..8282098c85aec 100644 --- a/pkg/loki/modules.go +++ b/pkg/loki/modules.go @@ -800,12 +800,25 @@ func (disabledShuffleShardingLimits) MaxQueriersPerUser(_ string) uint { return func (disabledShuffleShardingLimits) MaxQueryCapacity(_ string) float64 { return 0 } +// ingesterQueryOptions exists simply to avoid dependency cycles when using querier.Config directly in queryrange.NewMiddleware +type ingesterQueryOptions struct { + querier.Config +} + +func (i ingesterQueryOptions) QueryStoreOnly() bool { + return i.Config.QueryStoreOnly +} +func (i ingesterQueryOptions) QueryIngestersWithin() time.Duration { + return i.Config.QueryIngestersWithin +} + func (t *Loki) initQueryFrontendMiddleware() (_ services.Service, err error) { level.Debug(util_log.Logger).Log("msg", "initializing query frontend tripperware") middleware, stopper, err := queryrange.NewMiddleware( t.Cfg.QueryRange, t.Cfg.Querier.Engine, + ingesterQueryOptions{t.Cfg.Querier}, util_log.Logger, t.Overrides, t.Cfg.SchemaConfig, diff --git a/pkg/loki/modules_test.go b/pkg/loki/modules_test.go index 19980e2944120..0d07242b75370 100644 --- a/pkg/loki/modules_test.go +++ b/pkg/loki/modules_test.go @@ -175,6 +175,9 @@ func TestIndexGatewayRingMode_when_TargetIsLegacyReadOrBackend(t *testing.T) { { name: "leagcy read", target: Read, + transformer: func(cfg *Config) { + cfg.LegacyReadTarget = true + }, }, { name: "backend", diff --git a/pkg/querier/queryrange/limits/definitions.go b/pkg/querier/queryrange/limits/definitions.go index bd84e144fa47d..57b2e03c6697b 100644 --- a/pkg/querier/queryrange/limits/definitions.go +++ b/pkg/querier/queryrange/limits/definitions.go @@ -15,6 +15,7 @@ type Limits interface { logql.Limits QuerySplitDuration(string) time.Duration MetadataQuerySplitDuration(string) time.Duration + IngesterQuerySplitDuration(string) time.Duration MaxQuerySeries(context.Context, string) int MaxEntriesLimitPerQuery(context.Context, string) int MinShardingLookback(string) time.Duration diff --git a/pkg/querier/queryrange/limits_test.go b/pkg/querier/queryrange/limits_test.go index 3b82c1dc9eabb..0de342e42644f 100644 --- a/pkg/querier/queryrange/limits_test.go +++ b/pkg/querier/queryrange/limits_test.go @@ -58,7 +58,7 @@ func Test_seriesLimiter(t *testing.T) { cfg.CacheIndexStatsResults = false // split in 7 with 2 in // max. 
l := WithSplitByLimits(fakeLimits{maxSeries: 1, maxQueryParallelism: 2}, time.Hour) - tpw, stopper, err := NewMiddleware(cfg, testEngineOpts, util_log.Logger, l, config.SchemaConfig{ + tpw, stopper, err := NewMiddleware(cfg, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{ Configs: testSchemas, }, nil, false, nil, constants.Loki) if stopper != nil { @@ -228,7 +228,7 @@ func Test_MaxQueryParallelismDisable(t *testing.T) { } func Test_MaxQueryLookBack(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{ + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{ maxQueryLookback: 1 * time.Hour, maxQueryParallelism: 1, }, config.SchemaConfig{ diff --git a/pkg/querier/queryrange/log_result_cache.go b/pkg/querier/queryrange/log_result_cache.go index c15568d9075ac..fd26b67412a6b 100644 --- a/pkg/querier/queryrange/log_result_cache.go +++ b/pkg/querier/queryrange/log_result_cache.go @@ -106,7 +106,8 @@ func (l *logResultCache) Do(ctx context.Context, req queryrangebase.Request) (qu interval := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, l.limits.QuerySplitDuration) // skip caching by if interval is unset - if interval == 0 { + // skip caching when limit is 0 as it would get registered as an empty result in the cache even if that time range contains log lines. + if interval == 0 || lokiReq.Limit == 0 { return l.next.Do(ctx, req) } // The first subquery might not be aligned. @@ -181,7 +182,7 @@ func (l *logResultCache) handleMiss(ctx context.Context, cacheKey string, req *L func (l *logResultCache) handleHit(ctx context.Context, cacheKey string, cachedRequest *LokiRequest, lokiReq *LokiRequest) (queryrangebase.Response, error) { l.metrics.CacheHit.Inc() // we start with an empty response - result := emptyResponse(cachedRequest) + result := emptyResponse(lokiReq) // if the request is the same and cover the whole time range,
if cachedRequest.StartTs.UnixNano() <= lokiReq.StartTs.UnixNano() && cachedRequest.EndTs.UnixNano() >= lokiReq.EndTs.UnixNano() { diff --git a/pkg/querier/queryrange/log_result_cache_test.go b/pkg/querier/queryrange/log_result_cache_test.go index 5d67be33b84fd..5da4aee7c4be3 100644 --- a/pkg/querier/queryrange/log_result_cache_test.go +++ b/pkg/querier/queryrange/log_result_cache_test.go @@ -580,6 +580,54 @@ func Test_LogResultNonOverlappingCache(t *testing.T) { fake.AssertExpectations(t) } +func Test_LogResultCacheDifferentLimit(t *testing.T) { + var ( + ctx = user.InjectOrgID(context.Background(), "foo") + lrc = NewLogResultCache( + log.NewNopLogger(), + fakeLimits{ + splitDuration: map[string]time.Duration{"foo": time.Minute}, + }, + cache.NewMockCache(), + nil, + nil, + nil, + ) + ) + + req1 := &LokiRequest{ + StartTs: time.Unix(0, time.Minute.Nanoseconds()), + EndTs: time.Unix(0, 2*time.Minute.Nanoseconds()), + Limit: entriesLimit, + } + + req2 := &LokiRequest{ + StartTs: time.Unix(0, time.Minute.Nanoseconds()), + EndTs: time.Unix(0, 2*time.Minute.Nanoseconds()), + Limit: 10, + } + + fake := newFakeResponse([]mockResponse{ + { + RequestResponse: queryrangebase.RequestResponse{ + Request: req1, + Response: emptyResponse(req1), + }, + }, + }) + + h := lrc.Wrap(fake) + + resp, err := h.Do(ctx, req1) + require.NoError(t, err) + require.Equal(t, emptyResponse(req1), resp) + resp, err = h.Do(ctx, req2) + require.NoError(t, err) + require.Equal(t, emptyResponse(req2), resp) + + fake.AssertExpectations(t) +} + func TestExtractLokiResponse(t *testing.T) { for _, tc := range []struct { name string @@ -677,6 +725,7 @@ func newFakeResponse(responses []mockResponse) fakeResponse { for _, r := range responses { m.On("Do", mock.Anything, r.Request).Return(r.Response, r.err).Once() } + return fakeResponse{ Mock: m, } diff --git a/pkg/querier/queryrange/roundtrip.go b/pkg/querier/queryrange/roundtrip.go index 5f0aef4a1ab49..6d0d62af7a88a 100644 --- a/pkg/querier/queryrange/roundtrip.go +++ b/pkg/querier/queryrange/roundtrip.go @@ -120,6 +120,7 @@ func newResultsCacheFromConfig(cfg base.ResultsCacheConfig, registerer prometheu func NewMiddleware( cfg Config, engineOpts logql.EngineOpts, + iqo util.IngesterQueryOptions, log log.Logger, limits Limits, schema config.SchemaConfig, @@ -176,36 +177,38 @@ func NewMiddleware( var codec base.Codec = DefaultCodec - indexStatsTripperware, err := NewIndexStatsTripperware(cfg, log, limits, schema, codec, statsCache, + split := newDefaultSplitter(limits, iqo) + + indexStatsTripperware, err := NewIndexStatsTripperware(cfg, log, limits, schema, codec, split, statsCache, cacheGenNumLoader, retentionEnabled, metrics, metricsNamespace) if err != nil { return nil, nil, err } - metricsTripperware, err := NewMetricTripperware(cfg, engineOpts, log, limits, schema, codec, resultsCache, + metricsTripperware, err := NewMetricTripperware(cfg, engineOpts, log, limits, schema, codec, newMetricQuerySplitter(limits, iqo), resultsCache, cacheGenNumLoader, retentionEnabled, PrometheusExtractor{}, metrics, indexStatsTripperware, metricsNamespace) if err != nil { return nil, nil, err } - limitedTripperware, err := NewLimitedTripperware(cfg, engineOpts, log, limits, schema, metrics, indexStatsTripperware, codec) + limitedTripperware, err := NewLimitedTripperware(cfg, engineOpts, log, limits, schema, metrics, indexStatsTripperware, codec, split) if err != nil { return nil, nil, err } // NOTE: When we would start caching response from non-metric queries we would have to consider cache gen 
headers as well in // MergeResponse implementation for Loki codecs same as it is done in Cortex at https://github.com/cortexproject/cortex/blob/21bad57b346c730d684d6d0205efef133422ab28/pkg/querier/queryrange/query_range.go#L170 - logFilterTripperware, err := NewLogFilterTripperware(cfg, engineOpts, log, limits, schema, codec, resultsCache, metrics, indexStatsTripperware, metricsNamespace) + logFilterTripperware, err := NewLogFilterTripperware(cfg, engineOpts, log, limits, schema, codec, split, resultsCache, metrics, indexStatsTripperware, metricsNamespace) if err != nil { return nil, nil, err } - seriesTripperware, err := NewSeriesTripperware(cfg, log, limits, metrics, schema, codec, seriesCache, cacheGenNumLoader, retentionEnabled, metricsNamespace) + seriesTripperware, err := NewSeriesTripperware(cfg, log, limits, metrics, schema, codec, split, seriesCache, cacheGenNumLoader, retentionEnabled, metricsNamespace) if err != nil { return nil, nil, err } - labelsTripperware, err := NewLabelsTripperware(cfg, log, limits, codec, labelsCache, cacheGenNumLoader, retentionEnabled, metrics, schema, metricsNamespace) + labelsTripperware, err := NewLabelsTripperware(cfg, log, limits, codec, split, labelsCache, cacheGenNumLoader, retentionEnabled, metrics, schema, metricsNamespace) if err != nil { return nil, nil, err } @@ -215,7 +218,7 @@ func NewMiddleware( return nil, nil, err } - seriesVolumeTripperware, err := NewVolumeTripperware(cfg, log, limits, schema, codec, volumeCache, cacheGenNumLoader, retentionEnabled, metrics, metricsNamespace) + seriesVolumeTripperware, err := NewVolumeTripperware(cfg, log, limits, schema, codec, split, volumeCache, cacheGenNumLoader, retentionEnabled, metrics, metricsNamespace) if err != nil { return nil, nil, err } @@ -406,18 +409,7 @@ func getOperation(path string) string { } // NewLogFilterTripperware creates a new frontend tripperware responsible for handling log requests. -func NewLogFilterTripperware( - cfg Config, - engineOpts logql.EngineOpts, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - merger base.Merger, - c cache.Cache, - metrics *Metrics, - indexStatsTripperware base.Middleware, - metricsNamespace string, -) (base.Middleware, error) { +func NewLogFilterTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, split splitter, c cache.Cache, metrics *Metrics, indexStatsTripperware base.Middleware, metricsNamespace string) (base.Middleware, error) { return base.MiddlewareFunc(func(next base.Handler) base.Handler { statsHandler := indexStatsTripperware.Wrap(next) @@ -426,7 +418,7 @@ func NewLogFilterTripperware( NewLimitsMiddleware(limits), NewQuerySizeLimiterMiddleware(schema.Configs, engineOpts, log, limits, statsHandler), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), } if cfg.CacheResults { @@ -481,16 +473,7 @@ func NewLogFilterTripperware( } // NewLimitedTripperware creates a new frontend tripperware responsible for handling log requests which are label matcher only, no filter expression. 
-func NewLimitedTripperware( - _ Config, - engineOpts logql.EngineOpts, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - metrics *Metrics, - indexStatsTripperware base.Middleware, - merger base.Merger, -) (base.Middleware, error) { +func NewLimitedTripperware(_ Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, metrics *Metrics, indexStatsTripperware base.Middleware, merger base.Merger, split splitter) (base.Middleware, error) { return base.MiddlewareFunc(func(next base.Handler) base.Handler { statsHandler := indexStatsTripperware.Wrap(next) @@ -499,7 +482,7 @@ func NewLimitedTripperware( NewLimitsMiddleware(limits), NewQuerySizeLimiterMiddleware(schema.Configs, engineOpts, log, limits, statsHandler), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, WithMaxParallelism(limits, limitedQuerySplits), merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, WithMaxParallelism(limits, limitedQuerySplits), merger, split, metrics.SplitByMetrics), NewQuerierSizeLimiterMiddleware(schema.Configs, engineOpts, log, limits, statsHandler), } @@ -518,6 +501,7 @@ func NewSeriesTripperware( metrics *Metrics, schema config.SchemaConfig, merger base.Merger, + split splitter, c cache.Cache, cacheGenNumLoader base.CacheGenNumberLoader, retentionEnabled bool, @@ -558,7 +542,7 @@ func NewSeriesTripperware( StatsCollectorMiddleware(), NewLimitsMiddleware(limits), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), } if cfg.CacheSeriesResults { @@ -567,7 +551,6 @@ func NewSeriesTripperware( base.InstrumentMiddleware("series_results_cache", metrics.InstrumentMiddlewareMetrics), cacheMiddleware, ) - } if cfg.MaxRetries > 0 { @@ -601,6 +584,7 @@ func NewLabelsTripperware( log log.Logger, limits Limits, merger base.Merger, + split splitter, c cache.Cache, cacheGenNumLoader base.CacheGenNumberLoader, retentionEnabled bool, @@ -643,7 +627,7 @@ func NewLabelsTripperware( StatsCollectorMiddleware(), NewLimitsMiddleware(limits), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), } if cfg.CacheLabelResults { @@ -652,7 +636,6 @@ func NewLabelsTripperware( base.InstrumentMiddleware("label_results_cache", metrics.InstrumentMiddlewareMetrics), cacheMiddleware, ) - } if cfg.MaxRetries > 0 { @@ -669,21 +652,7 @@ func NewLabelsTripperware( } // NewMetricTripperware creates a new frontend tripperware responsible for handling metric queries -func NewMetricTripperware( - cfg Config, - engineOpts logql.EngineOpts, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - merger base.Merger, - c cache.Cache, - cacheGenNumLoader base.CacheGenNumberLoader, - retentionEnabled bool, - extractor base.Extractor, - metrics *Metrics, - indexStatsTripperware base.Middleware, - metricsNamespace string, -) (base.Middleware, error) { +func NewMetricTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, split splitter, c cache.Cache, cacheGenNumLoader 
base.CacheGenNumberLoader, retentionEnabled bool, extractor base.Extractor, metrics *Metrics, indexStatsTripperware base.Middleware, metricsNamespace string) (base.Middleware, error) { cacheKey := cacheKeyLimits{limits, cfg.Transformer} var queryCacheMiddleware base.Middleware if cfg.CacheResults { @@ -737,7 +706,7 @@ func NewMetricTripperware( queryRangeMiddleware, NewQuerySizeLimiterMiddleware(schema.Configs, engineOpts, log, limits, statsHandler), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitMetricByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), ) if cfg.CacheResults { @@ -793,16 +762,7 @@ func NewMetricTripperware( } // NewInstantMetricTripperware creates a new frontend tripperware responsible for handling metric queries -func NewInstantMetricTripperware( - cfg Config, - engineOpts logql.EngineOpts, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - metrics *Metrics, - indexStatsTripperware base.Middleware, - metricsNamespace string, -) (base.Middleware, error) { +func NewInstantMetricTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, metrics *Metrics, indexStatsTripperware base.Middleware, metricsNamespace string) (base.Middleware, error) { return base.MiddlewareFunc(func(next base.Handler) base.Handler { statsHandler := indexStatsTripperware.Wrap(next) @@ -844,21 +804,10 @@ func NewInstantMetricTripperware( }), nil } -func NewVolumeTripperware( - cfg Config, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - merger base.Merger, - c cache.Cache, - cacheGenNumLoader base.CacheGenNumberLoader, - retentionEnabled bool, - metrics *Metrics, - metricsNamespace string, -) (base.Middleware, error) { +func NewVolumeTripperware(cfg Config, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, split splitter, c cache.Cache, cacheGenNumLoader base.CacheGenNumberLoader, retentionEnabled bool, metrics *Metrics, metricsNamespace string) (base.Middleware, error) { // Parallelize the volume requests, so it doesn't send a huge request to a single index-gw (i.e. {app=~".+"} for 30d). // Indices are sharded by 24 hours, so we split the volume request in 24h intervals. 
- limits = WithSplitByLimits(limits, 24*time.Hour) + limits = WithSplitByLimits(limits, indexStatsQuerySplitInterval) var cacheMiddleware base.Middleware if cfg.CacheVolumeResults { var err error @@ -894,6 +843,7 @@ func NewVolumeTripperware( cacheMiddleware, cfg, merger, + split, limits, log, metrics, @@ -962,18 +912,7 @@ func volumeFeatureFlagRoundTripper(nextTW base.Middleware, limits Limits) base.M }) } -func NewIndexStatsTripperware( - cfg Config, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - merger base.Merger, - c cache.Cache, - cacheGenNumLoader base.CacheGenNumberLoader, - retentionEnabled bool, - metrics *Metrics, - metricsNamespace string, -) (base.Middleware, error) { +func NewIndexStatsTripperware(cfg Config, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, split splitter, c cache.Cache, cacheGenNumLoader base.CacheGenNumberLoader, retentionEnabled bool, metrics *Metrics, metricsNamespace string) (base.Middleware, error) { limits = WithSplitByLimits(limits, indexStatsQuerySplitInterval) var cacheMiddleware base.Middleware @@ -1011,6 +950,7 @@ func NewIndexStatsTripperware( cacheMiddleware, cfg, merger, + split, limits, log, metrics, @@ -1028,6 +968,7 @@ func sharedIndexTripperware( cacheMiddleware base.Middleware, cfg Config, merger base.Merger, + split splitter, limits Limits, log log.Logger, metrics *Metrics, @@ -1038,7 +979,7 @@ func sharedIndexTripperware( middlewares := []base.Middleware{ NewLimitsMiddleware(limits), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), } if cacheMiddleware != nil { diff --git a/pkg/querier/queryrange/roundtrip_test.go b/pkg/querier/queryrange/roundtrip_test.go index 883f9b14226bc..fe8799fffe799 100644 --- a/pkg/querier/queryrange/roundtrip_test.go +++ b/pkg/querier/queryrange/roundtrip_test.go @@ -189,7 +189,7 @@ func TestMetricsTripperware(t *testing.T) { noCacheTestCfg := testConfig noCacheTestCfg.CacheResults = false noCacheTestCfg.CacheIndexStatsResults = false - tpw, stopper, err := NewMiddleware(noCacheTestCfg, testEngineOpts, util_log.Logger, l, config.SchemaConfig{ + tpw, stopper, err := NewMiddleware(noCacheTestCfg, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{ Configs: testSchemasTSDB, }, nil, false, nil, constants.Loki) if stopper != nil { @@ -240,7 +240,7 @@ func TestMetricsTripperware(t *testing.T) { require.Error(t, err) // Configure with cache - tpw, stopper, err = NewMiddleware(testConfig, testEngineOpts, util_log.Logger, l, config.SchemaConfig{ + tpw, stopper, err = NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{ Configs: testSchemasTSDB, }, nil, false, nil, constants.Loki) if stopper != nil { @@ -278,7 +278,7 @@ func TestLogFilterTripperware(t *testing.T) { noCacheTestCfg := testConfig noCacheTestCfg.CacheResults = false noCacheTestCfg.CacheIndexStatsResults = false - tpw, stopper, err := NewMiddleware(noCacheTestCfg, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: testSchemasTSDB}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(noCacheTestCfg, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: testSchemasTSDB}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -347,7 +347,7 @@ func TestInstantQueryTripperware(t 
*testing.T) { queryTimeout: 1 * time.Minute, maxSeries: 1, } - tpw, stopper, err := NewMiddleware(testShardingConfigNoCache, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: testSchemasTSDB}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testShardingConfigNoCache, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: testSchemasTSDB}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -403,7 +403,7 @@ func TestSeriesTripperware(t *testing.T) { "1": 24 * time.Hour, }, } - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -441,7 +441,7 @@ func TestLabelsTripperware(t *testing.T) { "1": 24 * time.Hour, }, } - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -487,7 +487,7 @@ func TestLabelsTripperware(t *testing.T) { } func TestIndexStatsTripperware(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -537,7 +537,7 @@ func TestVolumeTripperware(t *testing.T) { volumeEnabled: true, maxSeries: 42, } - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -593,7 +593,7 @@ func TestVolumeTripperware(t *testing.T) { }) t.Run("range queries return a prometheus style metrics response, putting volumes in buckets based on the step", func(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, volumeEnabled: true}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, volumeEnabled: true}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -784,7 +784,7 @@ func TestNewTripperware_Caches(t *testing.T) { }, } { t.Run(tc.name, func(t *testing.T) { - _, stopper, err := NewMiddleware(tc.config, testEngineOpts, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + _, stopper, err := NewMiddleware(tc.config, 
testEngineOpts, nil, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -814,7 +814,7 @@ func TestNewTripperware_Caches(t *testing.T) { } func TestLogNoFilter(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -871,7 +871,7 @@ func TestPostQueries(t *testing.T) { } func TestTripperware_EntriesLimit(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{maxEntriesLimitPerQuery: 5000, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{maxEntriesLimitPerQuery: 5000, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -920,7 +920,7 @@ func TestTripperware_RequiredLabels(t *testing.T) { } { t.Run(test.qs, func(t *testing.T) { limits := fakeLimits{maxEntriesLimitPerQuery: 5000, maxQueryParallelism: 1, requiredLabels: []string{"app"}} - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -1027,7 +1027,7 @@ func TestTripperware_RequiredNumberLabels(t *testing.T) { maxQueryParallelism: 1, requiredNumberLabels: tc.requiredNumberLabels, } - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -1218,7 +1218,7 @@ func TestMetricsTripperware_SplitShardStats(t *testing.T) { }, } { t.Run(tc.name, func(t *testing.T) { - tpw, stopper, err := NewMiddleware(statsTestCfg, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: statsSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(statsTestCfg, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: statsSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -1245,6 +1245,7 @@ type fakeLimits struct { maxSeries int splitDuration map[string]time.Duration metadataSplitDuration map[string]time.Duration + ingesterSplitDuration map[string]time.Duration minShardingLookback time.Duration queryTimeout time.Duration requiredLabels []string @@ -1269,6 +1270,13 @@ func (f fakeLimits) MetadataQuerySplitDuration(key string) time.Duration { return f.metadataSplitDuration[key] } +func (f fakeLimits) IngesterQuerySplitDuration(key string) time.Duration { + if f.ingesterSplitDuration == nil { + return 0 + } 
+ return f.ingesterSplitDuration[key] +} + func (f fakeLimits) MaxQueryLength(context.Context, string) time.Duration { if f.maxQueryLength == 0 { return time.Hour * 7 @@ -1344,6 +1352,19 @@ func (f fakeLimits) TSDBMaxBytesPerShard(_ string) int { return valid.DefaultTSDBMaxBytesPerShard } +type ingesterQueryOpts struct { + queryStoreOnly bool + queryIngestersWithin time.Duration +} + +func (i ingesterQueryOpts) QueryStoreOnly() bool { + return i.queryStoreOnly +} + +func (i ingesterQueryOpts) QueryIngestersWithin() time.Duration { + return i.queryIngestersWithin +} + func counter() (*int, base.Handler) { count := 0 var lock sync.Mutex diff --git a/pkg/querier/queryrange/split_by_interval.go b/pkg/querier/queryrange/split_by_interval.go index 9e2eda4b19423..b332fe5e612e7 100644 --- a/pkg/querier/queryrange/split_by_interval.go +++ b/pkg/querier/queryrange/split_by_interval.go @@ -21,7 +21,6 @@ import ( "github.com/grafana/loki/pkg/logql/syntax" "github.com/grafana/loki/pkg/querier/queryrange/queryrangebase" "github.com/grafana/loki/pkg/storage/config" - "github.com/grafana/loki/pkg/util" "github.com/grafana/loki/pkg/util/validation" ) @@ -56,13 +55,11 @@ type splitByInterval struct { limits Limits merger queryrangebase.Merger metrics *SplitByMetrics - splitter Splitter + splitter splitter } -type Splitter func(req queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) - // SplitByIntervalMiddleware creates a new Middleware that splits log requests by a given interval. -func SplitByIntervalMiddleware(configs []config.PeriodConfig, limits Limits, merger queryrangebase.Merger, splitter Splitter, metrics *SplitByMetrics) queryrangebase.Middleware { +func SplitByIntervalMiddleware(configs []config.PeriodConfig, limits Limits, merger queryrangebase.Merger, splitter splitter, metrics *SplitByMetrics) queryrangebase.Middleware { if metrics == nil { metrics = NewSplitByMetrics(nil) } @@ -197,7 +194,7 @@ func (h *splitByInterval) Do(ctx context.Context, r queryrangebase.Request) (que return h.next.Do(ctx, r) } - intervals, err := h.splitter(r, interval) + intervals, err := h.splitter.split(time.Now().UTC(), tenantIDs, r, interval) if err != nil { return nil, err } @@ -251,73 +248,6 @@ func (h *splitByInterval) Do(ctx context.Context, r queryrangebase.Request) (que return h.merger.MergeResponse(resps...) } -func splitByTime(req queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) { - var reqs []queryrangebase.Request - - switch r := req.(type) { - case *LokiRequest: - util.ForInterval(interval, r.StartTs, r.EndTs, false, func(start, end time.Time) { - reqs = append(reqs, &LokiRequest{ - Query: r.Query, - Limit: r.Limit, - Step: r.Step, - Interval: r.Interval, - Direction: r.Direction, - Path: r.Path, - StartTs: start, - EndTs: end, - Plan: r.Plan, - }) - }) - case *LokiSeriesRequest: - // metadata queries have end time inclusive. - // Set endTimeInclusive to true so that ForInterval keeps a gap of 1ms between splits to - // avoid querying duplicate data in adjacent queries. - util.ForInterval(interval, r.StartTs, r.EndTs, true, func(start, end time.Time) { - reqs = append(reqs, &LokiSeriesRequest{ - Match: r.Match, - Path: r.Path, - StartTs: start, - EndTs: end, - Shards: r.Shards, - }) - }) - case *LabelRequest: - // metadata queries have end time inclusive. - // Set endTimeInclusive to true so that ForInterval keeps a gap of 1ms between splits to - // avoid querying duplicate data in adjacent queries. 
- util.ForInterval(interval, *r.Start, *r.End, true, func(start, end time.Time) { - reqs = append(reqs, NewLabelRequest(start, end, r.Query, r.Name, r.Path())) - }) - case *logproto.IndexStatsRequest: - startTS := r.GetStart() - endTS := r.GetEnd() - util.ForInterval(interval, startTS, endTS, true, func(start, end time.Time) { - reqs = append(reqs, &logproto.IndexStatsRequest{ - From: model.TimeFromUnix(start.Unix()), - Through: model.TimeFromUnix(end.Unix()), - Matchers: r.GetMatchers(), - }) - }) - case *logproto.VolumeRequest: - startTS := r.GetStart() - endTS := r.GetEnd() - util.ForInterval(interval, startTS, endTS, true, func(start, end time.Time) { - reqs = append(reqs, &logproto.VolumeRequest{ - From: model.TimeFromUnix(start.Unix()), - Through: model.TimeFromUnix(end.Unix()), - Matchers: r.GetMatchers(), - Limit: r.Limit, - TargetLabels: r.TargetLabels, - AggregateBy: r.AggregateBy, - }) - }) - default: - return nil, nil - } - return reqs, nil -} - // maxRangeVectorAndOffsetDurationFromQueryString func maxRangeVectorAndOffsetDurationFromQueryString(q string) (time.Duration, time.Duration, error) { parsed, err := syntax.ParseExpr(q) @@ -346,92 +276,3 @@ func maxRangeVectorAndOffsetDuration(expr syntax.Expr) (time.Duration, time.Dura }) return maxRVDuration, maxOffset, nil } - -// reduceSplitIntervalForRangeVector reduces the split interval for a range query based on the duration of the range vector. -// Large range vector durations will not be split into smaller intervals because it can cause the queries to be slow by over-processing data. -func reduceSplitIntervalForRangeVector(r *LokiRequest, interval time.Duration) (time.Duration, error) { - maxRange, _, err := maxRangeVectorAndOffsetDuration(r.Plan.AST) - if err != nil { - return 0, err - } - if maxRange > interval { - return maxRange, nil - } - return interval, nil -} - -func splitMetricByTime(r queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) { - var reqs []queryrangebase.Request - - lokiReq := r.(*LokiRequest) - - interval, err := reduceSplitIntervalForRangeVector(lokiReq, interval) - if err != nil { - return nil, err - } - - // step align start and end time of the query. Start time is rounded down and end time is rounded up. 
- stepNs := r.GetStep() * 1e6 - startNs := lokiReq.StartTs.UnixNano() - start := time.Unix(0, startNs-startNs%stepNs) - - endNs := lokiReq.EndTs.UnixNano() - if mod := endNs % stepNs; mod != 0 { - endNs += stepNs - mod - } - end := time.Unix(0, endNs) - - lokiReq = lokiReq.WithStartEnd(start, end).(*LokiRequest) - - // step is >= configured split interval, let us just split the query interval by step - if lokiReq.Step >= interval.Milliseconds() { - util.ForInterval(time.Duration(lokiReq.Step*1e6), lokiReq.StartTs, lokiReq.EndTs, false, func(start, end time.Time) { - reqs = append(reqs, &LokiRequest{ - Query: lokiReq.Query, - Limit: lokiReq.Limit, - Step: lokiReq.Step, - Interval: lokiReq.Interval, - Direction: lokiReq.Direction, - Path: lokiReq.Path, - StartTs: start, - EndTs: end, - Plan: lokiReq.Plan, - }) - }) - - return reqs, nil - } - - for start := lokiReq.StartTs; start.Before(lokiReq.EndTs); start = nextIntervalBoundary(start, r.GetStep(), interval).Add(time.Duration(r.GetStep()) * time.Millisecond) { - end := nextIntervalBoundary(start, r.GetStep(), interval) - if end.Add(time.Duration(r.GetStep())*time.Millisecond).After(lokiReq.EndTs) || end.Add(time.Duration(r.GetStep())*time.Millisecond) == lokiReq.EndTs { - end = lokiReq.EndTs - } - reqs = append(reqs, &LokiRequest{ - Query: lokiReq.Query, - Limit: lokiReq.Limit, - Step: lokiReq.Step, - Interval: lokiReq.Interval, - Direction: lokiReq.Direction, - Path: lokiReq.Path, - StartTs: start, - EndTs: end, - Plan: lokiReq.Plan, - }) - } - - return reqs, nil -} - -// Round up to the step before the next interval boundary. -func nextIntervalBoundary(t time.Time, step int64, interval time.Duration) time.Time { - stepNs := step * 1e6 - nsPerInterval := interval.Nanoseconds() - startOfNextInterval := ((t.UnixNano() / nsPerInterval) + 1) * nsPerInterval - // ensure that target is a multiple of steps away from the start time - target := startOfNextInterval - ((startOfNextInterval - t.UnixNano()) % stepNs) - if target == startOfNextInterval { - target -= stepNs - } - return time.Unix(0, target) -} diff --git a/pkg/querier/queryrange/split_by_interval_test.go b/pkg/querier/queryrange/split_by_interval_test.go index b236b88fb4d53..acf8c495becce 100644 --- a/pkg/querier/queryrange/split_by_interval_test.go +++ b/pkg/querier/queryrange/split_by_interval_test.go @@ -9,9 +9,9 @@ import ( "testing" "time" - "github.com/prometheus/common/model" - "github.com/grafana/dskit/user" + "github.com/prometheus/common/model" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gopkg.in/yaml.v2" @@ -22,6 +22,8 @@ import ( "github.com/grafana/loki/pkg/querier/plan" "github.com/grafana/loki/pkg/querier/queryrange/queryrangebase" "github.com/grafana/loki/pkg/storage/config" + "github.com/grafana/loki/pkg/storage/stores/index/seriesvolume" + "github.com/grafana/loki/pkg/util" ) var nilMetrics = NewSplitByMetrics(nil) @@ -56,181 +58,393 @@ var testSchemasTSDB = func() []config.PeriodConfig { return confs }() -func Test_splitQuery(t *testing.T) { - buildLokiRequest := func(start, end time.Time) queryrangebase.Request { - return &LokiRequest{ - Query: `{app="foo"}`, - Limit: 1, - Step: 2, - StartTs: start, - EndTs: end, - Direction: logproto.BACKWARD, - Path: "/path", - Plan: &plan.QueryPlan{ - AST: syntax.MustParseExpr(`{app="foo"}`), - }, - } - } - - buildLokiRequestWithInterval := func(start, end time.Time) queryrangebase.Request { - return &LokiRequest{ - Query: `{app="foo"}`, - Limit: 1, - Interval: 2, - StartTs: start, - EndTs: end, - 
Direction: logproto.BACKWARD, - Path: "/path", - Plan: &plan.QueryPlan{ - AST: syntax.MustParseExpr(`{app="foo"}`), - }, - } - } - - buildLokiSeriesRequest := func(start, end time.Time) queryrangebase.Request { - return &LokiSeriesRequest{ - Match: []string{"match1"}, - StartTs: start, - EndTs: end, - Path: "/series", - Shards: []string{"shard1"}, - } - } - - buildLokiLabelNamesRequest := func(start, end time.Time) queryrangebase.Request { - return NewLabelRequest(start, end, "", "", "/lables") - } +var ( + // 62697274686461792063616b65 + refTime = time.Date(2023, 1, 15, 8, 5, 30, 123456789, time.UTC) + tenantID = "1" +) +func Test_splitQuery(t *testing.T) { type interval struct { start, end time.Time } + for requestType, tc := range map[string]struct { requestBuilderFunc func(start, end time.Time) queryrangebase.Request endTimeInclusive bool }{ - "LokiRequest": { - buildLokiRequest, - false, + "logs request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &LokiRequest{ + Query: `{app="foo"}`, + Limit: 1, + Step: 2, + StartTs: start, + EndTs: end, + Direction: logproto.BACKWARD, + Path: "/query", + Plan: &plan.QueryPlan{ + AST: syntax.MustParseExpr(`{app="foo"}`), + }, + } + }, + }, + "logs request with interval": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &LokiRequest{ + Query: `{app="foo"}`, + Limit: 1, + Interval: 2, + StartTs: start, + EndTs: end, + Direction: logproto.BACKWARD, + Path: "/query", + Plan: &plan.QueryPlan{ + AST: syntax.MustParseExpr(`{app="foo"}`), + }, + } + }, + }, + "series request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &LokiSeriesRequest{ + Match: []string{"match1"}, + StartTs: start, + EndTs: end, + Path: "/series", + Shards: []string{"shard1"}, + } + }, + endTimeInclusive: true, + }, + "label names request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return NewLabelRequest(start, end, `{foo="bar"}`, "", "/labels") + }, + endTimeInclusive: true, }, - "LokiRequestWithInterval": { - buildLokiRequestWithInterval, - false, + "label values request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return NewLabelRequest(start, end, `{foo="bar"}`, "test", "/label/test/values") + }, + endTimeInclusive: true, }, - "LokiSeriesRequest": { - buildLokiSeriesRequest, - true, + "index stats request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &logproto.IndexStatsRequest{ + From: model.TimeFromUnix(start.Unix()), + Through: model.TimeFromUnix(end.Unix()), + Matchers: `{host="agent"}`, + } + }, + endTimeInclusive: true, }, - "LokiLabelNamesRequest": { - buildLokiLabelNamesRequest, - true, + "volume request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &logproto.VolumeRequest{ + From: model.TimeFromUnix(start.Unix()), + Through: model.TimeFromUnix(end.Unix()), + Matchers: `{host="agent"}`, + Limit: 5, + AggregateBy: seriesvolume.Series, + } + }, + endTimeInclusive: true, }, } { expectedSplitGap := time.Duration(0) if tc.endTimeInclusive { - expectedSplitGap = time.Millisecond + expectedSplitGap = util.SplitGap } - for name, intervals := range map[string]struct { - inp interval - expected []interval - }{ - "no_change": { - inp: interval{ - start: time.Unix(0, 0), - end: time.Unix(0, (1 * time.Hour).Nanoseconds()), - }, - expected: []interval{ - { + + t.Run(requestType, func(t *testing.T) { + for name, intervals 
:= range map[string]struct { + input interval + expected []interval + splitInterval time.Duration + splitter splitter + }{ + "no change": { + input: interval{ start: time.Unix(0, 0), end: time.Unix(0, (1 * time.Hour).Nanoseconds()), }, + expected: []interval{ + { + start: time.Unix(0, 0), + end: time.Unix(0, (1 * time.Hour).Nanoseconds()), + }, + }, }, - }, - "align_start": { - inp: interval{ - start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (2 * time.Hour).Nanoseconds()), - }, - expected: []interval{ - { + "align start": { + input: interval{ start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), - }, - { - start: time.Unix(0, (1 * time.Hour).Nanoseconds()), end: time.Unix(0, (2 * time.Hour).Nanoseconds()), }, + expected: []interval{ + { + start: time.Unix(0, (5 * time.Minute).Nanoseconds()), + end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + }, + { + start: time.Unix(0, (1 * time.Hour).Nanoseconds()), + end: time.Unix(0, (2 * time.Hour).Nanoseconds()), + }, + }, }, - }, - "align_end": { - inp: interval{ - start: time.Unix(0, 0), - end: time.Unix(0, (115 * time.Minute).Nanoseconds()), - }, - expected: []interval{ - { + "align end": { + input: interval{ start: time.Unix(0, 0), - end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), - }, - { - start: time.Unix(0, (1 * time.Hour).Nanoseconds()), end: time.Unix(0, (115 * time.Minute).Nanoseconds()), }, + expected: []interval{ + { + start: time.Unix(0, 0), + end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + }, + { + start: time.Unix(0, (1 * time.Hour).Nanoseconds()), + end: time.Unix(0, (115 * time.Minute).Nanoseconds()), + }, + }, }, - }, - "align_both": { - inp: interval{ - start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (175 * time.Minute).Nanoseconds()), + "align both": { + input: interval{ + start: time.Unix(0, (5 * time.Minute).Nanoseconds()), + end: time.Unix(0, (175 * time.Minute).Nanoseconds()), + }, + expected: []interval{ + { + start: time.Unix(0, (5 * time.Minute).Nanoseconds()), + end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + }, + { + start: time.Unix(0, (1 * time.Hour).Nanoseconds()), + end: time.Unix(0, (2 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + }, + { + start: time.Unix(0, (2 * time.Hour).Nanoseconds()), + end: time.Unix(0, (175 * time.Minute).Nanoseconds()), + }, + }, }, - expected: []interval{ - { + "no align": { + input: interval{ start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + end: time.Unix(0, (55 * time.Minute).Nanoseconds()), }, - { - start: time.Unix(0, (1 * time.Hour).Nanoseconds()), - end: time.Unix(0, (2 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + expected: []interval{ + { + start: time.Unix(0, (5 * time.Minute).Nanoseconds()), + end: time.Unix(0, (55 * time.Minute).Nanoseconds()), + }, }, - { - start: time.Unix(0, (2 * time.Hour).Nanoseconds()), - end: time.Unix(0, (175 * time.Minute).Nanoseconds()), + }, + "wholly within ingester query window": { + input: interval{ + start: refTime.Add(-time.Hour).Truncate(time.Second), + end: refTime, + }, + expected: []interval{ + { + start: refTime.Add(-time.Hour).Truncate(time.Second), + end: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC), + end: refTime, + }, }, + 
splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), }, - }, - "no_align": { - inp: interval{ - start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (55 * time.Minute).Nanoseconds()), + "partially within ingester query window": { + input: interval{ + // overlapping `query_ingesters_within` window of 3h + start: refTime.Add(-4 * time.Hour).Add(-30 * time.Minute).Truncate(time.Second), + end: refTime, + }, + expected: []interval{ + // regular intervals until `query_ingesters_within` window + { + start: refTime.Add(-4 * time.Hour).Add(-30 * time.Minute).Truncate(time.Second), + end: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 5, 5, 30, 123456789, time.UTC).Add(-expectedSplitGap), + }, + // and then different intervals for queries to ingesters + { + start: time.Date(2023, 1, 15, 5, 5, 30, 123456789, time.UTC), + end: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC), + end: refTime, + }, + }, + splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), }, - expected: []interval{ - { - start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (55 * time.Minute).Nanoseconds()), + "not within ingester query window": { + input: interval{ + // outside `query_ingesters_within` range of 3h + start: refTime.Add(-5 * time.Hour).Truncate(time.Second), + end: refTime.Add(-4 * time.Hour).Truncate(time.Second), + }, + expected: []interval{ + // regular intervals outside `query_ingesters_within` window + { + start: refTime.Add(-5 * time.Hour).Truncate(time.Second), + end: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC), + end: refTime.Add(-4 * time.Hour).Truncate(time.Second), + }, }, + splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), }, - }, - } { - t.Run(fmt.Sprintf("%s - %s", name, requestType), func(t *testing.T) { - inp := tc.requestBuilderFunc(intervals.inp.start, intervals.inp.end) - var want []queryrangebase.Request - for _, interval := range intervals.expected { - want = append(want, tc.requestBuilderFunc(interval.start, interval.end)) - } - splits, err := splitByTime(inp, time.Hour) - require.NoError(t, err) - require.Equal(t, want, splits) - }) - } + "ingester query split by disabled": { + input: interval{ + // overlapping `query_ingesters_within` range of 3h + start: refTime.Add(-4 * time.Hour).Truncate(time.Second), + end: refTime, + }, + expected: []interval{ + // regular intervals only, since ingester split duration is 0 + { + start: refTime.Add(-4 * time.Hour).Truncate(time.Second), + end: time.Date(2023, 1, 15, 5, 0, 0, 0, 
time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC), + end: refTime, + }, + }, + splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 0}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + "ingester query split enabled but query_store_only enabled too": { + input: interval{ + // overlapping `query_ingesters_within` range of 3h + start: refTime.Add(-4 * time.Hour).Truncate(time.Second), + end: refTime, + }, + expected: []interval{ + // regular intervals only, since ingester split duration is 0 + { + start: refTime.Add(-4 * time.Hour).Truncate(time.Second), + end: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC), + end: refTime, + }, + }, + splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour, queryStoreOnly: true}, + ), + }, + } { + t.Run(name, func(t *testing.T) { + req := tc.requestBuilderFunc(intervals.input.start, intervals.input.end) + var want []queryrangebase.Request + for _, exp := range intervals.expected { + want = append(want, tc.requestBuilderFunc(exp.start, exp.end)) + } + + if intervals.splitInterval == 0 { + intervals.splitInterval = time.Hour + } + + if intervals.splitter == nil { + intervals.splitter = newDefaultSplitter(fakeLimits{}, nil) + } + + splits, err := intervals.splitter.split(refTime, []string{tenantID}, req, intervals.splitInterval) + require.NoError(t, err) + if !assert.Equal(t, want, splits) { + t.Logf("expected and actual do not match\n") + defer t.Fail() + + if len(want) != len(splits) { + t.Logf("expected %d splits, got %d\n", len(want), len(splits)) + return + } + + for j := 0; j < len(want); j++ { + exp := want[j] + act := splits[j] + equal := assert.Equal(t, exp, act) + t.Logf("\t#%d [matches: %v]: expected %q/%q got %q/%q\n", j, equal, exp.GetStart(), exp.GetEnd(), act.GetStart(), act.GetEnd()) + } + } + }) + } + }) } } func Test_splitMetricQuery(t *testing.T) { const seconds = 1e3 // 1e3 milliseconds per second. + const shortRange = `rate({app="foo"}[1m])` + const longRange = `rate({app="foo"}[7d])` + for i, tc := range []struct { - input *LokiRequest - expected []queryrangebase.Request - interval time.Duration + input *LokiRequest + expected []queryrangebase.Request + splitInterval time.Duration + splitter splitter }{ // the step is lower than the interval therefore we should split only once. 
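+	// e.g. a 15s step over a 1h range with a 24h split interval yields a single split, as in the first case below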
{ @@ -238,172 +452,172 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(0, 0), EndTs: time.Unix(0, 60*time.Minute.Nanoseconds()), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(0, 60*time.Minute.Nanoseconds()), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 24 * time.Hour, + splitInterval: 24 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(60*60, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(60*60, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 24 * time.Hour, + splitInterval: 24 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(2*24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix((24*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((24 * 3600), 0), EndTs: time.Unix((2 * 24 * 3600), 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 24 * time.Hour, + splitInterval: 24 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(2*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix((3*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((3 * 3600), 0), EndTs: time.Unix((2 * 3 * 3600), 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(3*3600, 0), EndTs: time.Unix(3*24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(3*3600, 0), EndTs: time.Unix((24*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(24*3600, 0), EndTs: time.Unix((2*24*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*24*3600, 0), EndTs: time.Unix(3*24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - 
interval: 24 * time.Hour, + splitInterval: 24 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600, 0), EndTs: time.Unix((3*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(3*3600, 0), EndTs: time.Unix((2*3*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*3*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, // step not a multiple of interval @@ -413,29 +627,29 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(2*3600-9, 0), // 2h mod 17s = 9s EndTs: time.Unix(3*3*3600, 0), Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600-9, 0), EndTs: time.Unix((3*3600)-5, 0), // 3h mod 17s = 5s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((3*3600)+12, 0), EndTs: time.Unix((2*3*3600)-10, 0), // 6h mod 17s = 10s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*3*3600+7, 0), EndTs: time.Unix(3*3*3600+2, 0), // 9h mod 17s = 2s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, // end time already step aligned { @@ -443,29 +657,29 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600+2, 0), // 9h mod 17s = 2s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600-9, 0), // 2h mod 17s = 9s EndTs: time.Unix((3*3600)-5, 0), // 3h mod 17s = 5s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((3*3600)+12, 0), EndTs: time.Unix((2*3*3600)-10, 0), // 6h mod 17s = 10s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*3*3600+7, 0), EndTs: time.Unix(3*3*3600+2, 0), Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, // start & end time not aligned with step { @@ -473,29 +687,29 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600-9, 0), // 2h mod 17s = 9s EndTs: time.Unix((3*3600)-5, 0), // 3h mod 17s = 5s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((3*3600)+12, 0), EndTs: time.Unix((2*3*3600)-10, 0), // 6h mod 17s = 10s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*3*3600+7, 0), EndTs: time.Unix(3*3*3600+2, 0), // 9h mod 17s = 2s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, // step larger than split interval @@ 
-504,58 +718,58 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(0, 0), EndTs: time.Unix(25*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(6*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(6*3600, 0), EndTs: time.Unix(12*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(12*3600, 0), EndTs: time.Unix(18*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(18*3600, 0), EndTs: time.Unix(24*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(24*3600, 0), EndTs: time.Unix(30*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 15 * time.Minute, + splitInterval: 15 * time.Minute, }, { input: &LokiRequest{ StartTs: time.Unix(1*3600, 0), EndTs: time.Unix(3*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(6*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 15 * time.Minute, + splitInterval: 15 * time.Minute, }, // reduce split by to 6h instead of 1h { @@ -579,7 +793,7 @@ func Test_splitMetricQuery(t *testing.T) { Query: `rate({app="foo"}[6h])`, }, }, - interval: 1 * time.Hour, + splitInterval: 1 * time.Hour, }, // range vector too large we don't want to split it { @@ -587,17 +801,222 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[7d])`, + Query: longRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[7d])`, + Query: longRange, + }, + }, + splitInterval: 15 * time.Minute, + }, + // query is wholly within ingester query window + { + input: &LokiRequest{ + StartTs: refTime.Add(-time.Hour), + EndTs: refTime, + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 05, 30, 0, time.UTC), // start time is aligned down to step of 15s + EndTs: time.Date(2023, 1, 15, 7, 29, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 8, 5, 45, 0, time.UTC), // end time is aligned up to step of 15s + Step: 15 * seconds, + Query: shortRange, }, }, - interval: 15 * time.Minute, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + // query is partially within ingester query window + { + input: &LokiRequest{ + StartTs: refTime.Add(-4 * time.Hour).Add(-30 * time.Minute), + EndTs: refTime, + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + // regular intervals until `query_ingesters_within` window + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 3, 35, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 
15, 3, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 4, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 5, 5, 15, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + // and then different intervals for queries to ingesters + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 5, 5, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 5, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 7, 29, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 8, 5, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + }, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + // not within ingester query window + { + input: &LokiRequest{ + StartTs: refTime.Add(-5 * time.Hour), + EndTs: refTime.Add(-4 * time.Hour), + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + // regular intervals until `query_ingesters_within` window + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 3, 5, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 3, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 4, 5, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + }, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + // ingester query split by disabled + { + input: &LokiRequest{ + StartTs: refTime.Add(-4 * time.Hour), + EndTs: refTime, + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + // regular intervals only, since ingester split duration is 0 + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 4, 5, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 4, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 5, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 6, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 7, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 8, 5, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + }, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 0}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + // ingester query split by enabled, but 
query_store_only is enabled too + { + input: &LokiRequest{ + StartTs: refTime.Add(-4 * time.Hour), + EndTs: refTime, + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + // regular intervals only, since ingester split duration is 0 + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 4, 5, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 4, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 5, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 6, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 7, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 8, 5, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + }, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour, queryStoreOnly: true}, + ), }, } { // Set query plans @@ -612,13 +1031,29 @@ func Test_splitMetricQuery(t *testing.T) { } t.Run(strconv.Itoa(i), func(t *testing.T) { - splits, err := splitMetricByTime(tc.input, tc.interval) + ms := newMetricQuerySplitter(fakeLimits{}, nil) + if tc.splitter != nil { + ms = tc.splitter.(*metricQuerySplitter) + } + + splits, err := ms.split(refTime, []string{tenantID}, tc.input, tc.splitInterval) require.NoError(t, err) - for i, s := range splits { - s := s.(*LokiRequest) - t.Logf(" want: %d start:%s end:%s \n", i, s.StartTs, s.EndTs) + if !assert.Equal(t, tc.expected, splits) { + t.Logf("expected and actual do not match\n") + defer t.Fail() + + if len(tc.expected) != len(splits) { + t.Logf("expected %d splits, got %d\n", len(tc.expected), len(splits)) + return + } + + for j := 0; j < len(tc.expected); j++ { + exp := tc.expected[j] + act := splits[j] + equal := assert.Equal(t, exp, act) + t.Logf("\t#%d [matches: %v]: expected %q/%q got %q/%q\n", j, equal, exp.GetStart(), exp.GetEnd(), act.GetStart(), act.GetEnd()) + } } - require.Equal(t, tc.expected, splits) }) } @@ -646,12 +1081,13 @@ func Test_splitByInterval_Do(t *testing.T) { }, nil }) + defSplitter := newDefaultSplitter(fakeLimits{}, nil) l := WithSplitByLimits(fakeLimits{maxQueryParallelism: 1}, time.Hour) split := SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) @@ -834,11 +1270,12 @@ func Test_series_splitByInterval_Do(t *testing.T) { "1": time.Hour, }, } + defSplitter := newDefaultSplitter(fakeLimits{}, nil) split := SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) @@ -888,12 +1325,13 @@ func Test_series_splitByInterval_Do(t *testing.T) { func Test_seriesvolume_splitByInterval_Do(t *testing.T) { ctx := user.InjectOrgID(context.Background(), "1") + defSplitter := newDefaultSplitter(fakeLimits{}, nil) setup := func(next queryrangebase.Handler, l Limits) queryrangebase.Handler { return SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) } @@ -1050,11 +1488,12 @@ func 
Test_ExitEarly(t *testing.T) { }) l := WithSplitByLimits(fakeLimits{maxQueryParallelism: 1}, time.Hour) + defSplitter := newDefaultSplitter(fakeLimits{}, nil) split := SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) @@ -1132,11 +1571,12 @@ func Test_DoesntDeadlock(t *testing.T) { }) l := WithSplitByLimits(fakeLimits{maxQueryParallelism: n}, time.Hour) + defSplitter := newDefaultSplitter(fakeLimits{}, nil) split := SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) diff --git a/pkg/querier/queryrange/splitters.go b/pkg/querier/queryrange/splitters.go new file mode 100644 index 0000000000000..79e3d5352e06f --- /dev/null +++ b/pkg/querier/queryrange/splitters.go @@ -0,0 +1,297 @@ +package queryrange + +import ( + "time" + + "github.com/prometheus/common/model" + + "github.com/grafana/loki/pkg/logproto" + "github.com/grafana/loki/pkg/querier/queryrange/queryrangebase" + "github.com/grafana/loki/pkg/util" + "github.com/grafana/loki/pkg/util/validation" +) + +type splitter interface { + split(execTime time.Time, tenantIDs []string, request queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) +} + +type defaultSplitter struct { + limits Limits + iqo util.IngesterQueryOptions +} + +func newDefaultSplitter(limits Limits, iqo util.IngesterQueryOptions) *defaultSplitter { + return &defaultSplitter{limits, iqo} +} + +func (s *defaultSplitter) split(execTime time.Time, tenantIDs []string, req queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) { + var ( + reqs []queryrangebase.Request + factory func(start, end time.Time) + endTimeInclusive = true + ) + + switch r := req.(type) { + case *LokiRequest: + endTimeInclusive = false + factory = func(start, end time.Time) { + reqs = append(reqs, &LokiRequest{ + Query: r.Query, + Limit: r.Limit, + Step: r.Step, + Interval: r.Interval, + Direction: r.Direction, + Path: r.Path, + StartTs: start, + EndTs: end, + Plan: r.Plan, + }) + } + case *LokiSeriesRequest: + // metadata queries have end time inclusive. + // Set endTimeInclusive to true so that ForInterval keeps a gap of 1ms between splits to + // avoid querying duplicate data in adjacent queries. + factory = func(start, end time.Time) { + reqs = append(reqs, &LokiSeriesRequest{ + Match: r.Match, + Path: r.Path, + StartTs: start, + EndTs: end, + Shards: r.Shards, + }) + } + case *LabelRequest: + // metadata queries have end time inclusive. + // Set endTimeInclusive to true so that ForInterval keeps a gap of 1ms between splits to + // avoid querying duplicate data in adjacent queries. 
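+		// For example (illustrative): splitting a label query over [00:00, 02:00] with a 1h
+		// interval and an inclusive end time yields [00:00, 00:59:59.999] and [01:00, 02:00],
+		// so adjacent splits never read the same millisecond twice.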
+ factory = func(start, end time.Time) { + reqs = append(reqs, NewLabelRequest(start, end, r.Query, r.Name, r.Path())) + } + case *logproto.IndexStatsRequest: + factory = func(start, end time.Time) { + reqs = append(reqs, &logproto.IndexStatsRequest{ + From: model.TimeFromUnix(start.Unix()), + Through: model.TimeFromUnix(end.Unix()), + Matchers: r.GetMatchers(), + }) + } + case *logproto.VolumeRequest: + factory = func(start, end time.Time) { + reqs = append(reqs, &logproto.VolumeRequest{ + From: model.TimeFromUnix(start.Unix()), + Through: model.TimeFromUnix(end.Unix()), + Matchers: r.GetMatchers(), + Limit: r.Limit, + TargetLabels: r.TargetLabels, + AggregateBy: r.AggregateBy, + }) + } + default: + return nil, nil + } + + var ( + ingesterSplits []queryrangebase.Request + origStart = req.GetStart().UTC() + origEnd = req.GetEnd().UTC() + ) + + start, end, needsIngesterSplits := ingesterQueryBounds(execTime, s.iqo, req) + + if ingesterQueryInterval := validation.MaxDurationPerTenant(tenantIDs, s.limits.IngesterQuerySplitDuration); ingesterQueryInterval != 0 && needsIngesterSplits { + // perform splitting using special interval (`split_ingester_queries_by_interval`) + util.ForInterval(ingesterQueryInterval, start, end, endTimeInclusive, factory) + + // rebound after ingester queries have been split out + end = start + start = req.GetStart().UTC() + if endTimeInclusive { + end = end.Add(-util.SplitGap) + } + + // query only overlaps ingester query window, nothing more to do + if start.After(end) || start.Equal(end) { + return reqs, nil + } + + // copy the splits, reset the results + ingesterSplits = reqs + reqs = nil + } else { + start = origStart + end = origEnd + } + + // perform splitting over the rest of the time range + util.ForInterval(interval, origStart, end, endTimeInclusive, factory) + + // move the ingester splits to the end to maintain correct order + reqs = append(reqs, ingesterSplits...) + return reqs, nil +} + +type metricQuerySplitter struct { + limits Limits + iqo util.IngesterQueryOptions +} + +func newMetricQuerySplitter(limits Limits, iqo util.IngesterQueryOptions) *metricQuerySplitter { + return &metricQuerySplitter{limits, iqo} +} + +// reduceSplitIntervalForRangeVector reduces the split interval for a range query based on the duration of the range vector. +// Large range vector durations will not be split into smaller intervals because it can cause the queries to be slow by over-processing data. +func (s *metricQuerySplitter) reduceSplitIntervalForRangeVector(r *LokiRequest, interval time.Duration) (time.Duration, error) { + maxRange, _, err := maxRangeVectorAndOffsetDuration(r.Plan.AST) + if err != nil { + return 0, err + } + if maxRange > interval { + return maxRange, nil + } + return interval, nil +} + +// Round up to the step before the next interval boundary. 
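+// For example, with a 15s step and a 90m split interval, a split starting at 07:05:30 is
+// bounded at 07:29:45, one full step below the next interval boundary of 07:30:00, so the
+// following split can start exactly on that boundary (see the ingester-window test cases
+// earlier in this diff).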
+func (s *metricQuerySplitter) nextIntervalBoundary(t time.Time, step int64, interval time.Duration) time.Time { + stepNs := step * 1e6 + nsPerInterval := interval.Nanoseconds() + startOfNextInterval := ((t.UnixNano() / nsPerInterval) + 1) * nsPerInterval + // ensure that target is a multiple of steps away from the start time + target := startOfNextInterval - ((startOfNextInterval - t.UnixNano()) % stepNs) + if target == startOfNextInterval { + target -= stepNs + } + return time.Unix(0, target) +} + +func (s *metricQuerySplitter) split(execTime time.Time, tenantIDs []string, r queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) { + var reqs []queryrangebase.Request + + lokiReq := r.(*LokiRequest) + + interval, err := s.reduceSplitIntervalForRangeVector(lokiReq, interval) + if err != nil { + return nil, err + } + + start, end := s.alignStartEnd(r.GetStep(), lokiReq.StartTs, lokiReq.EndTs) + + lokiReq = lokiReq.WithStartEnd(start, end).(*LokiRequest) + + factory := func(start, end time.Time) { + reqs = append(reqs, &LokiRequest{ + Query: lokiReq.Query, + Limit: lokiReq.Limit, + Step: lokiReq.Step, + Interval: lokiReq.Interval, + Direction: lokiReq.Direction, + Path: lokiReq.Path, + StartTs: start, + EndTs: end, + Plan: lokiReq.Plan, + }) + } + + // step is >= configured split interval, let us just split the query interval by step + // TODO this is likely buggy when step >= query range, how should we handle this? + if lokiReq.Step >= interval.Milliseconds() { + util.ForInterval(time.Duration(lokiReq.Step*1e6), lokiReq.StartTs, lokiReq.EndTs, false, factory) + + return reqs, nil + } + + var ( + ingesterSplits []queryrangebase.Request + needsIngesterSplits bool + ) + + origStart := start + origEnd := end + + start, end, needsIngesterSplits = ingesterQueryBounds(execTime, s.iqo, lokiReq) + start, end = s.alignStartEnd(r.GetStep(), start, end) + + if ingesterQueryInterval := validation.MaxDurationPerTenant(tenantIDs, s.limits.IngesterQuerySplitDuration); ingesterQueryInterval != 0 && needsIngesterSplits { + // perform splitting using special interval (`split_ingester_queries_by_interval`) + s.buildMetricSplits(lokiReq.GetStep(), ingesterQueryInterval, start, end, factory) + + // rebound after ingester queries have been split out + // + // the end time should now be the boundary of the `query_ingester_within` window, which is "start" currently; + // but since start is already step-aligned we need to subtract 1ns to align it down by 1 more step so that we + // get a consistent step between splits + end, _ = s.alignStartEnd(r.GetStep(), start.Add(-time.Nanosecond), end) + // we restore the previous start time (the start time of the query) + start = origStart + + // query only overlaps ingester query window, nothing more to do + if start.After(end) || start.Equal(end) { + return reqs, nil + } + + // copy the splits, reset the results + ingesterSplits = reqs + reqs = nil + } else { + start = origStart + end = origEnd + } + + // perform splitting over the rest of the time range + s.buildMetricSplits(lokiReq.GetStep(), interval, start, end, factory) + + // move the ingester splits to the end to maintain correct order + reqs = append(reqs, ingesterSplits...) + + return reqs, nil +} + +func (s *metricQuerySplitter) alignStartEnd(step int64, start, end time.Time) (time.Time, time.Time) { + // step align start and end time of the query. Start time is rounded down and end time is rounded up. 
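+	// For example, with a 15s step a start of 08:05:30.123 rounds down to 08:05:30 and an
+	// end of 08:05:30.123 rounds up to 08:05:45, matching the aligned bounds used in the
+	// metric splitter tests earlier in this diff.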
+ stepNs := step * 1e6 + startNs := start.UnixNano() + + endNs := end.UnixNano() + if mod := endNs % stepNs; mod != 0 { + endNs += stepNs - mod + } + + return time.Unix(0, startNs-startNs%stepNs), time.Unix(0, endNs) +} + +func (s *metricQuerySplitter) buildMetricSplits(step int64, interval time.Duration, start, end time.Time, factory func(start, end time.Time)) { + for splStart := start; splStart.Before(end); splStart = s.nextIntervalBoundary(splStart, step, interval).Add(time.Duration(step) * time.Millisecond) { + splEnd := s.nextIntervalBoundary(splStart, step, interval) + if splEnd.Add(time.Duration(step)*time.Millisecond).After(end) || splEnd.Add(time.Duration(step)*time.Millisecond) == end { + splEnd = end + } + factory(splStart, splEnd) + } +} + +// ingesterQueryBounds determines if we need to split time ranges overlapping the ingester query window (`query_ingesters_within`) +// and retrieve the bounds for those specific splits +func ingesterQueryBounds(execTime time.Time, iqo util.IngesterQueryOptions, req queryrangebase.Request) (time.Time, time.Time, bool) { + start, end := req.GetStart().UTC(), req.GetEnd().UTC() + + // ingesters are not queried, nothing to do + if iqo == nil || iqo.QueryStoreOnly() { + return start, end, false + } + + windowSize := iqo.QueryIngestersWithin() + ingesterWindow := execTime.UTC().Add(-windowSize) + + // clamp to the start time + if ingesterWindow.Before(start) { + ingesterWindow = start + } + + // query range does not overlap with ingester query window, nothing to do + if end.Before(ingesterWindow) { + return start, end, false + } + + return ingesterWindow, end, true +} diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go index 2eaeb576b318d..946aeaf54495c 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer.go @@ -2,6 +2,7 @@ package v1 import ( "context" + "fmt" "math" "time" @@ -82,75 +83,82 @@ func prefixedToken(ngram int, chk logproto.ChunkRef) ([]byte, int) { } // PopulateSeriesWithBloom is intended to be called on the write path, and is used to populate the bloom filter for a given series. -func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBloom, chunks []chunk.Chunk) error { +func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBloom, chunks Iterator[[]chunk.Chunk]) error { startTime := time.Now().UnixMilli() level.Debug(util_log.Logger).Log("msg", "PopulateSeriesWithBloom") clearCache(bt.cache) chunkTotalUncompressedSize := 0 - for idx := range chunks { - lc := chunks[idx].Data.(*chunkenc.Facade).LokiChunk() - tokenBuf, prefixLn := prefixedToken(bt.lineTokenizer.N, chunks[idx].ChunkRef) - chunkTotalUncompressedSize += lc.UncompressedSize() - - itr, err := lc.Iterator( - context.Background(), - time.Unix(0, 0), // TODO: Parameterize/better handle the timestamps? 
- time.Unix(0, math.MaxInt64), - logproto.FORWARD, - log.NewNoopPipeline().ForStream(chunks[idx].Metric), - ) - if err != nil { - level.Error(util_log.Logger).Log("msg", "chunk iterator cannot be created", "err", err) - return err - } - - defer itr.Close() - - for itr.Next() && itr.Error() == nil { - chunkTokenizer := NewPrefixedTokenIter(tokenBuf, prefixLn, bt.lineTokenizer.Tokens(itr.Entry().Line)) - for chunkTokenizer.Next() { - tok := chunkTokenizer.At() - if tok != nil { - str := string(tok) - _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters - if !found { - bt.cache[str] = nil - - seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) - - if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other - clearCache(bt.cache) + for chunks.Next() { + chunksBatch := chunks.At() + for idx := range chunksBatch { + lc := chunksBatch[idx].Data.(*chunkenc.Facade).LokiChunk() + tokenBuf, prefixLn := prefixedToken(bt.lineTokenizer.N, chunksBatch[idx].ChunkRef) + chunkTotalUncompressedSize += lc.UncompressedSize() + + itr, err := lc.Iterator( + context.Background(), + time.Unix(0, 0), // TODO: Parameterize/better handle the timestamps? + time.Unix(0, math.MaxInt64), + logproto.FORWARD, + log.NewNoopPipeline().ForStream(chunksBatch[idx].Metric), + ) + if err != nil { + level.Error(util_log.Logger).Log("msg", "chunk iterator cannot be created", "err", err) + return err + } + + defer itr.Close() + + for itr.Next() && itr.Error() == nil { + chunkTokenizer := NewPrefixedTokenIter(tokenBuf, prefixLn, bt.lineTokenizer.Tokens(itr.Entry().Line)) + for chunkTokenizer.Next() { + tok := chunkTokenizer.At() + if tok != nil { + str := string(tok) + _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters + if !found { + bt.cache[str] = nil + + seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) + + if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other + clearCache(bt.cache) + } } } } - } - lineTokenizer := bt.lineTokenizer.Tokens(itr.Entry().Line) - for lineTokenizer.Next() { - tok := lineTokenizer.At() - if tok != nil { - str := string(tok) - _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters - if !found { - bt.cache[str] = nil - - seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) - - if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other - clearCache(bt.cache) + lineTokenizer := bt.lineTokenizer.Tokens(itr.Entry().Line) + for lineTokenizer.Next() { + tok := lineTokenizer.At() + if tok != nil { + str := string(tok) + _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters + if !found { + bt.cache[str] = nil + + seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) + + if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. 
This speaks to the similarity in log lines near each other + clearCache(bt.cache) + } } } } - } - } - seriesWithBloom.Series.Chunks = append(seriesWithBloom.Series.Chunks, ChunkRef{ - Start: chunks[idx].From, - End: chunks[idx].Through, - Checksum: chunks[idx].Checksum, - }) - } // for each chunk + } + seriesWithBloom.Series.Chunks = append(seriesWithBloom.Series.Chunks, ChunkRef{ + Start: chunksBatch[idx].From, + End: chunksBatch[idx].Through, + Checksum: chunksBatch[idx].Checksum, + }) + } // for each chunk + } + if err := chunks.Err(); err != nil { + level.Error(util_log.Logger).Log("msg", "error downloading chunks batch", "err", err) + return fmt.Errorf("error downloading chunks batch: %w", err) + } endTime := time.Now().UnixMilli() diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go index f22c741651246..0fad08e78f080 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go @@ -123,7 +123,7 @@ func TestPopulateSeriesWithBloom(t *testing.T) { Series: &series, } - err := bt.PopulateSeriesWithBloom(&swb, chunks) + err := bt.PopulateSeriesWithBloom(&swb, NewSliceIter([][]chunk.Chunk{chunks})) require.NoError(t, err) tokenizer := NewNGramTokenizer(DefaultNGramLength, DefaultNGramSkip) itr := tokenizer.Tokens(testLine) @@ -171,7 +171,7 @@ func BenchmarkPopulateSeriesWithBloom(b *testing.B) { Series: &series, } - err := bt.PopulateSeriesWithBloom(&swb, chunks) + err := bt.PopulateSeriesWithBloom(&swb, NewSliceIter([][]chunk.Chunk{chunks})) require.NoError(b, err) } } diff --git a/pkg/storage/chunk/cache/background.go b/pkg/storage/chunk/cache/background.go index 16feb62551f5b..299444c6a54e0 100644 --- a/pkg/storage/chunk/cache/background.go +++ b/pkg/storage/chunk/cache/background.go @@ -148,8 +148,11 @@ func (c *backgroundCache) Store(ctx context.Context, keys []string, bufs [][]byt } size := bgWrite.size() - newSize := c.size.Load() + int64(size) + // prospectively add new size + newSize := c.size.Add(int64(size)) if newSize > int64(c.sizeLimit) { + // subtract it since we've exceeded the limit + c.size.Sub(int64(size)) c.failStore(ctx, size, num, "queue at byte size limit") return nil } diff --git a/pkg/storage/stores/shipper/bloomshipper/client.go b/pkg/storage/stores/shipper/bloomshipper/client.go index d5d981cddb9e5..b189cba390b82 100644 --- a/pkg/storage/stores/shipper/bloomshipper/client.go +++ b/pkg/storage/stores/shipper/bloomshipper/client.go @@ -133,17 +133,18 @@ func (b *BloomClient) GetMetas(ctx context.Context, params MetaSearchParams) ([] periodClient := b.periodicObjectClients[periodFrom] for _, table := range tables { prefix := filepath.Join(rootFolder, table, params.TenantID, metasFolder) - list, _, err := periodClient.List(ctx, prefix, delimiter) + list, _, err := periodClient.List(ctx, prefix, "") if err != nil { return nil, fmt.Errorf("error listing metas under prefix [%s]: %w", prefix, err) } for _, object := range list { metaRef, err := createMetaRef(object.Key, params.TenantID, table) + if err != nil { return nil, err } if metaRef.MaxFingerprint < uint64(params.MinFingerprint) || uint64(params.MaxFingerprint) < metaRef.MinFingerprint || - metaRef.StartTimestamp.Before(params.StartTimestamp) || metaRef.EndTimestamp.After(params.EndTimestamp) { + metaRef.EndTimestamp.Before(params.StartTimestamp) || metaRef.StartTimestamp.After(params.EndTimestamp) { continue } meta, err := b.downloadMeta(ctx, metaRef, periodClient) diff --git 
a/pkg/storage/stores/shipper/bloomshipper/compress_utils.go b/pkg/storage/stores/shipper/bloomshipper/compress_utils.go index aa30ec4901f00..96af5e987c3d4 100644 --- a/pkg/storage/stores/shipper/bloomshipper/compress_utils.go +++ b/pkg/storage/stores/shipper/bloomshipper/compress_utils.go @@ -5,10 +5,10 @@ import ( "io" "os" "path/filepath" - "strings" "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/google/uuid" v1 "github.com/grafana/loki/pkg/storage/bloom/v1" ) @@ -42,8 +42,9 @@ func UncompressBloomBlock(block *LazyBlock, workingDirectory string, logger log. if err != nil { return "", fmt.Errorf("error writing data to temp file: %w", err) } + level.Info(logger).Log("msg", "extracting archive", "archive", archivePath, "workingDirectory", workingDirectoryPath, "blockPath", block.BlockPath) defer func() { - os.Remove(archivePath) + err = os.Remove(archivePath) if err != nil { level.Error(logger).Log("msg", "removing archive file", "err", err, "file", archivePath) } @@ -57,7 +58,7 @@ func writeDataToTempFile(workingDirectoryPath string, block *LazyBlock) (string, error) { defer block.Data.Close() - archivePath := filepath.Join(workingDirectoryPath, block.BlockPath[strings.LastIndex(block.BlockPath, "/")+1:]) + archivePath := filepath.Join(workingDirectoryPath, uuid.New().String()) archiveFile, err := os.Create(archivePath) if err != nil { @@ -74,7 +75,7 @@ func extractArchive(archivePath string, workingDirectoryPath string) error { file, err := os.Open(archivePath) if err != nil { - return fmt.Errorf("error opening archive file %s: %w", file.Name(), err) + return fmt.Errorf("error opening archive file %s: %w", archivePath, err) } return v1.UnTarGz(workingDirectoryPath, file) } diff --git a/pkg/storage/stores/shipper/bloomshipper/shipper.go b/pkg/storage/stores/shipper/bloomshipper/shipper.go index d7038fc13761c..d9d96fcc7783c 100644 --- a/pkg/storage/stores/shipper/bloomshipper/shipper.go +++ b/pkg/storage/stores/shipper/bloomshipper/shipper.go @@ -1,7 +1,6 @@ package bloomshipper import ( - "cmp" "context" "fmt" "math" @@ -15,6 +14,16 @@ import ( "github.com/grafana/loki/pkg/storage/stores/shipper/bloomshipper/config" ) +type fpRange [2]uint64 + +func (r fpRange) minFp() uint64 { + return r[0] +} + +func (r fpRange) maxFp() uint64 { + return r[1] +} + type Shipper struct { client Client config config.Config @@ -43,7 +52,7 @@ func NewShipper(client Client, config config.Config, limits Limits, logger log.L func (s *Shipper) GetBlockRefs(ctx context.Context, tenantID string, from, through model.Time) ([]BlockRef, error) { level.Debug(s.logger).Log("msg", "GetBlockRefs", "tenant", tenantID, "from", from, "through", through) - blockRefs, err := s.getActiveBlockRefs(ctx, tenantID, from, through, []uint64{0, math.MaxUint64}) + blockRefs, err := s.getActiveBlockRefs(ctx, tenantID, from, through, []fpRange{{0, math.MaxUint64}}) if err != nil { return nil, fmt.Errorf("error fetching active block references : %w", err) } @@ -55,30 +64,36 @@ func (s *Shipper) Fetch(ctx context.Context, tenantID string, blocks []BlockRef, defer cancelFunc() blocksChannel, errorsChannel := s.blockDownloader.downloadBlocks(cancelContext, tenantID, blocks) + // track how many blocks are still remaining to be downloaded + remaining := len(blocks) + for { select { case <-ctx.Done(): return fmt.Errorf("failed to fetch blocks: %w", ctx.Err()) case
result, ok := <-blocksChannel: - if !ok { + case result, sentBeforeClosed := <-blocksChannel: + if !sentBeforeClosed { return nil } err := runCallback(callback, result) if err != nil { return err } - case err := <-errorsChannel: - if err != nil { - return fmt.Errorf("error downloading blocks : %w", err) + remaining-- + if remaining == 0 { + return nil } + case err := <-errorsChannel: + return fmt.Errorf("error downloading blocks : %w", err) } } } func runCallback(callback ForEachBlockCallback, block blockWithQuerier) error { - defer func(result blockWithQuerier) { - _ = result.Close() + defer func(b blockWithQuerier) { + _ = b.Close() }(block) + err := callback(block.closableBlockQuerier.BlockQuerier, block.MinFingerprint, block.MaxFingerprint) if err != nil { return fmt.Errorf("error running callback function for block %s err: %w", block.BlockPath, err) @@ -86,17 +101,6 @@ func runCallback(callback ForEachBlockCallback, block blockWithQuerier) error { return nil } -func (s *Shipper) ForEachBlock(ctx context.Context, tenantID string, from, through model.Time, fingerprints []uint64, callback ForEachBlockCallback) error { - level.Debug(s.logger).Log("msg", "ForEachBlock", "tenant", tenantID, "from", from, "through", through, "fingerprints", len(fingerprints)) - - blockRefs, err := s.getActiveBlockRefs(ctx, tenantID, from, through, fingerprints) - if err != nil { - return fmt.Errorf("error fetching active block references : %w", err) - } - - return s.Fetch(ctx, tenantID, blockRefs, callback) -} - func (s *Shipper) Stop() { s.client.Stop() s.blockDownloader.stop() @@ -112,18 +116,19 @@ func getFirstLast[T any](s []T) (T, T) { return s[0], s[len(s)-1] } -func (s *Shipper) getActiveBlockRefs(ctx context.Context, tenantID string, from, through model.Time, fingerprints []uint64) ([]BlockRef, error) { - minFingerprint, maxFingerprint := getFirstLast(fingerprints) +func (s *Shipper) getActiveBlockRefs(ctx context.Context, tenantID string, from, through model.Time, fingerprints []fpRange) ([]BlockRef, error) { + minFpRange, maxFpRange := getFirstLast(fingerprints) metas, err := s.client.GetMetas(ctx, MetaSearchParams{ TenantID: tenantID, - MinFingerprint: model.Fingerprint(minFingerprint), - MaxFingerprint: model.Fingerprint(maxFingerprint), + MinFingerprint: model.Fingerprint(minFpRange.minFp()), + MaxFingerprint: model.Fingerprint(maxFpRange.maxFp()), StartTimestamp: from, EndTimestamp: through, }) if err != nil { return []BlockRef{}, fmt.Errorf("error fetching meta.json files: %w", err) } + level.Debug(s.logger).Log("msg", "downloaded metas", "count", len(metas)) activeBlocks := s.findBlocks(metas, from, through, fingerprints) slices.SortStableFunc(activeBlocks, func(a, b BlockRef) int { if a.MinFingerprint < b.MinFingerprint { @@ -138,7 +143,7 @@ func (s *Shipper) getActiveBlockRefs(ctx context.Context, tenantID string, from, return activeBlocks, nil } -func (s *Shipper) findBlocks(metas []Meta, startTimestamp, endTimestamp model.Time, fingerprints []uint64) []BlockRef { +func (s *Shipper) findBlocks(metas []Meta, startTimestamp, endTimestamp model.Time, fingerprints []fpRange) []BlockRef { outdatedBlocks := make(map[string]interface{}) for _, meta := range metas { for _, tombstone := range meta.Tombstones { @@ -164,39 +169,29 @@ func (s *Shipper) findBlocks(metas []Meta, startTimestamp, endTimestamp model.Ti return blockRefs } -// getPosition returns the smallest index of element v in slice s where v > s[i] -// TODO(chaudum): Use binary search to find index instead of iteration.
-func getPosition[S ~[]E, E cmp.Ordered](s S, v E) int { - for i := range s { - if v > s[i] { - continue - } - return i - } - return len(s) -} - -func isOutsideRange(b *BlockRef, startTimestamp, endTimestamp model.Time, fingerprints []uint64) bool { +// isOutsideRange tests if a given BlockRef b is outside of search boundaries +// defined by min/max timestamp and min/max fingerprint. +// Fingerprint ranges must be sorted in ascending order. +func isOutsideRange(b *BlockRef, startTimestamp, endTimestamp model.Time, fingerprints []fpRange) bool { // First, check time range if b.EndTimestamp < startTimestamp || b.StartTimestamp > endTimestamp { return true } // Then, check if outside of min/max of fingerprint slice - minFp, maxFp := getFirstLast(fingerprints) - if b.MaxFingerprint < minFp || b.MinFingerprint > maxFp { + minFpRange, maxFpRange := getFirstLast(fingerprints) + if b.MaxFingerprint < minFpRange.minFp() || b.MinFingerprint > maxFpRange.maxFp() { return true } - // Check if the block range is inside a "gap" in the fingerprint slice - // e.g. - // fingerprints = [1, 2, 6, 7, 8] - // block = [3, 4, 5] - idx := getPosition[[]uint64](fingerprints, b.MinFingerprint) - // in case b.MinFingerprint is outside of the fingerprints range, return true - // this is already covered in the range check above, but I keep it as a second gate - if idx > len(fingerprints)-1 { - return true + prev := fpRange{0, 0} + for i := 0; i < len(fingerprints); i++ { + fpr := fingerprints[i] + if b.MinFingerprint > prev.maxFp() && b.MaxFingerprint < fpr.minFp() { + return true + } + prev = fpr } - return b.MaxFingerprint < fingerprints[idx] + + return false } diff --git a/pkg/storage/stores/shipper/bloomshipper/shipper_test.go b/pkg/storage/stores/shipper/bloomshipper/shipper_test.go index 83c9379cd44c6..859aa38c82a61 100644 --- a/pkg/storage/stores/shipper/bloomshipper/shipper_test.go +++ b/pkg/storage/stores/shipper/bloomshipper/shipper_test.go @@ -4,6 +4,7 @@ import ( "fmt" "math" "testing" + "time" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" @@ -40,7 +41,7 @@ func Test_Shipper_findBlocks(t *testing.T) { } shipper := &Shipper{} - blocks := shipper.findBlocks(metas, 300, 400, []uint64{100, 200}) + blocks := shipper.findBlocks(metas, model.Now().Add(-2*time.Hour), model.Now().Add(-1*time.Hour), []fpRange{{100, 200}}) expectedBlockRefs := []BlockRef{ createMatchingBlockRef("block2"), @@ -53,8 +54,8 @@ func Test_Shipper_findBlocks(t *testing.T) { tests := map[string]struct { minFingerprint uint64 maxFingerprint uint64 - startTimestamp int64 - endTimestamp int64 + startTimestamp model.Time + endTimestamp model.Time filtered bool }{ "expected block not to be filtered out if minFingerprint and startTimestamp are within range": { @@ -94,7 +95,7 @@ func Test_Shipper_findBlocks(t *testing.T) { t.Run(name, func(t *testing.T) { shipper := &Shipper{} ref := createBlockRef("fake-block", data.minFingerprint, data.maxFingerprint, data.startTimestamp, data.endTimestamp) - blocks := shipper.findBlocks([]Meta{{Blocks: []BlockRef{ref}}}, 300, 400, []uint64{100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200}) + blocks := shipper.findBlocks([]Meta{{Blocks: []BlockRef{ref}}}, 300, 400, []fpRange{{100, 200}}) if data.filtered { require.Empty(t, blocks) return @@ -105,94 +106,83 @@ func Test_Shipper_findBlocks(t *testing.T) { } } -func TestGetPosition(t *testing.T) { - for i, tc := range []struct { - s []int - v int - exp int - }{ - {s: []int{}, v: 1, exp: 0}, - {s: []int{1, 2, 3}, v: 0, exp: 0}, - {s: 
[]int{1, 2, 3}, v: 2, exp: 1}, - {s: []int{1, 2, 3}, v: 4, exp: 3}, - {s: []int{1, 2, 4, 5}, v: 3, exp: 2}, - } { - tc := tc - name := fmt.Sprintf("case-%d", i) - t.Run(name, func(t *testing.T) { - got := getPosition[[]int](tc.s, tc.v) - require.Equal(t, tc.exp, got) - }) - } -} - func TestIsOutsideRange(t *testing.T) { + startTs := model.Time(1000) + endTs := model.Time(2000) + t.Run("is outside if startTs > through", func(t *testing.T) { - b := createBlockRef("block", 0, math.MaxUint64, 100, 200) - isOutside := isOutsideRange(&b, 0, 90, []uint64{}) + b := createBlockRef("block", 0, math.MaxUint64, startTs, endTs) + isOutside := isOutsideRange(&b, 0, 900, []fpRange{}) require.True(t, isOutside) }) t.Run("is outside if endTs < from", func(t *testing.T) { - b := createBlockRef("block", 0, math.MaxUint64, 100, 200) - isOutside := isOutsideRange(&b, 210, 300, []uint64{}) + b := createBlockRef("block", 0, math.MaxUint64, startTs, endTs) + isOutside := isOutsideRange(&b, 2100, 3000, []fpRange{}) require.True(t, isOutside) }) t.Run("is outside if endFp < first fingerprint", func(t *testing.T) { - b := createBlockRef("block", 0, 90, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{100, 200}) + b := createBlockRef("block", 0, 90, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{100, 199}}) require.True(t, isOutside) }) t.Run("is outside if startFp > last fingerprint", func(t *testing.T) { - b := createBlockRef("block", 210, math.MaxUint64, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{100, 200}) + b := createBlockRef("block", 200, math.MaxUint64, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 49}, {100, 149}}) require.True(t, isOutside) }) t.Run("is outside if within gaps in fingerprints", func(t *testing.T) { - b := createBlockRef("block", 100, 200, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{0, 99, 201, 300}) + b := createBlockRef("block", 100, 199, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 99}, {200, 299}}) require.True(t, isOutside) }) t.Run("is not outside if within fingerprints 1", func(t *testing.T) { - b := createBlockRef("block", 100, 200, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{0, 100, 200, 300}) + b := createBlockRef("block", 10, 90, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 99}, {200, 299}}) require.False(t, isOutside) }) t.Run("is not outside if within fingerprints 2", func(t *testing.T) { - b := createBlockRef("block", 100, 150, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{0, 100, 200, 300}) + b := createBlockRef("block", 210, 290, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 99}, {200, 299}}) + require.False(t, isOutside) + }) + + t.Run("is not outside if spans across multiple fingerprint ranges", func(t *testing.T) { + b := createBlockRef("block", 50, 250, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 99}, {200, 299}}) require.False(t, isOutside) }) - t.Run("is not outside if within fingerprints 3", func(t *testing.T) { - b := createBlockRef("block", 150, 200, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{0, 100, 200, 300}) + t.Run("is not outside if fingerprint range and time range are larger than block", func(t *testing.T) { + b := createBlockRef("block", math.MaxUint64/3, math.MaxUint64/3*2, startTs, endTs) + isOutside := isOutsideRange(&b, 0, 3000, []fpRange{{0, 
math.MaxUint64}}) require.False(t, isOutside) }) } func createMatchingBlockRef(blockPath string) BlockRef { - return createBlockRef(blockPath, 0, uint64(math.MaxUint64), 0, math.MaxInt) + return createBlockRef(blockPath, 0, math.MaxUint64, model.Time(0), model.Now()) } func createBlockRef( blockPath string, minFingerprint, maxFingerprint uint64, - startTimestamp, endTimestamp int64, + startTimestamp, endTimestamp model.Time, ) BlockRef { + day := startTimestamp.Unix() / int64(24*time.Hour/time.Second) return BlockRef{ Ref: Ref{ TenantID: "fake", - TableName: "16600", + TableName: fmt.Sprintf("%d", day), MinFingerprint: minFingerprint, MaxFingerprint: maxFingerprint, - StartTimestamp: model.Time(startTimestamp), - EndTimestamp: model.Time(endTimestamp), + StartTimestamp: startTimestamp, + EndTimestamp: endTimestamp, Checksum: 0, }, // block path is unique, and it's used to distinguish the blocks so the rest of the fields might be skipped in this test diff --git a/pkg/storage/stores/shipper/bloomshipper/store.go b/pkg/storage/stores/shipper/bloomshipper/store.go index 06e1d7a4675bf..40c23658e9a1a 100644 --- a/pkg/storage/stores/shipper/bloomshipper/store.go +++ b/pkg/storage/stores/shipper/bloomshipper/store.go @@ -2,7 +2,6 @@ package bloomshipper import ( "context" - "sort" "time" "github.com/prometheus/common/model" @@ -14,7 +13,6 @@ type ForEachBlockCallback func(bq *v1.BlockQuerier, minFp, maxFp uint64) error type ReadShipper interface { GetBlockRefs(ctx context.Context, tenant string, from, through model.Time) ([]BlockRef, error) - ForEachBlock(ctx context.Context, tenant string, from, through model.Time, fingerprints []uint64, callback ForEachBlockCallback) error Fetch(ctx context.Context, tenant string, blocks []BlockRef, callback ForEachBlockCallback) error } @@ -30,8 +28,6 @@ type BlockQuerierWithFingerprintRange struct { type Store interface { GetBlockRefs(ctx context.Context, tenant string, from, through time.Time) ([]BlockRef, error) - GetBlockQueriers(ctx context.Context, tenant string, from, through time.Time, fingerprints []uint64) ([]BlockQuerierWithFingerprintRange, error) - GetBlockQueriersForBlockRefs(ctx context.Context, tenant string, blocks []BlockRef) ([]BlockQuerierWithFingerprintRange, error) ForEach(ctx context.Context, tenant string, blocks []BlockRef, callback ForEachBlockCallback) error Stop() } @@ -60,40 +56,6 @@ func (bs *BloomStore) ForEach(ctx context.Context, tenant string, blocks []Block return bs.shipper.Fetch(ctx, tenant, blocks, callback) } -// GetQueriersForBlocks implements Store -func (bs *BloomStore) GetBlockQueriersForBlockRefs(ctx context.Context, tenant string, blocks []BlockRef) ([]BlockQuerierWithFingerprintRange, error) { - bqs := make([]BlockQuerierWithFingerprintRange, 0, 32) - err := bs.shipper.Fetch(ctx, tenant, blocks, func(bq *v1.BlockQuerier, minFp uint64, maxFp uint64) error { - bqs = append(bqs, BlockQuerierWithFingerprintRange{ - BlockQuerier: bq, - MinFp: model.Fingerprint(minFp), - MaxFp: model.Fingerprint(maxFp), - }) - return nil - }) - sort.Slice(bqs, func(i, j int) bool { - return bqs[i].MinFp < bqs[j].MinFp - }) - return bqs, err -} - -// BlockQueriers implements Store -func (bs *BloomStore) GetBlockQueriers(ctx context.Context, tenant string, from, through time.Time, fingerprints []uint64) ([]BlockQuerierWithFingerprintRange, error) { - bqs := make([]BlockQuerierWithFingerprintRange, 0, 32) - err := bs.shipper.ForEachBlock(ctx, tenant, toModelTime(from), toModelTime(through), fingerprints, func(bq *v1.BlockQuerier, minFp 
uint64, maxFp uint64) error { - bqs = append(bqs, BlockQuerierWithFingerprintRange{ - BlockQuerier: bq, - MinFp: model.Fingerprint(minFp), - MaxFp: model.Fingerprint(maxFp), - }) - return nil - }) - sort.Slice(bqs, func(i, j int) bool { - return bqs[i].MinFp < bqs[j].MinFp - }) - return bqs, err -} - func toModelTime(t time.Time) model.Time { return model.TimeFromUnixNano(t.UnixNano()) } diff --git a/pkg/storage/stores/shipper/indexshipper/indexgateway/gateway.go b/pkg/storage/stores/shipper/indexshipper/indexgateway/gateway.go index 1040bd6c1b565..8b0f186386bdf 100644 --- a/pkg/storage/stores/shipper/indexshipper/indexgateway/gateway.go +++ b/pkg/storage/stores/shipper/indexshipper/indexgateway/gateway.go @@ -204,7 +204,7 @@ func (g *Gateway) GetChunkRef(ctx context.Context, req *logproto.GetChunkRefRequ return nil, err } - predicate := chunk.NewPredicate(matchers, *(&req.Filters)) + predicate := chunk.NewPredicate(matchers, req.Filters) chunks, _, err := g.indexQuerier.GetChunks(ctx, instanceID, req.From, req.Through, predicate) if err != nil { return nil, err @@ -219,8 +219,11 @@ func (g *Gateway) GetChunkRef(ctx context.Context, req *logproto.GetChunkRefRequ } } + initialChunkCount := len(result.Refs) + // Return unfiltered results if there is no bloom querier (Bloom Gateway disabled) or if there are not filters. if g.bloomQuerier == nil || len(req.Filters) == 0 { + level.Info(g.log).Log("msg", "chunk filtering is not enabled or there is no line filter", "filters", len(req.Filters)) return result, nil } @@ -234,6 +237,7 @@ func (g *Gateway) GetChunkRef(ctx context.Context, req *logproto.GetChunkRefRequ } result.Refs = chunkRefs + level.Info(g.log).Log("msg", "return filtered chunk refs", "unfiltered", initialChunkCount, "filtered", len(result.Refs)) return result, nil } diff --git a/pkg/tracing/config.go b/pkg/tracing/config.go index 1c97d88a845df..f9faefa6a7303 100644 --- a/pkg/tracing/config.go +++ b/pkg/tracing/config.go @@ -5,7 +5,8 @@ import ( ) type Config struct { - Enabled bool `yaml:"enabled"` + Enabled bool `yaml:"enabled"` + ProfilingEnabled bool `yaml:"profiling_enabled" category:"experimental" doc:"hidden"` } func (cfg *Config) RegisterFlags(f *flag.FlagSet) { @@ -14,4 +15,5 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { f.BoolVar(&cfg.Enabled, prefix+"tracing.enabled", true, "Set to false to disable tracing.") + f.BoolVar(&cfg.ProfilingEnabled, prefix+"tracing.profiling-enabled", true, "Set to true to enable profiling integration.") } diff --git a/pkg/util/config.go b/pkg/util/config.go index 6989931fb618e..f54d469690c98 100644 --- a/pkg/util/config.go +++ b/pkg/util/config.go @@ -4,6 +4,7 @@ import ( "fmt" "io" "strings" + "time" "github.com/go-kit/log/level" "github.com/prometheus/common/version" @@ -38,3 +39,10 @@ func PrintConfig(w io.Writer, config interface{}) error { fmt.Fprintf(w, "---\n# Loki Config\n# %s\n%s\n\n", version.Info(), string(lc)) return nil } + +// IngesterQueryOptions exists because querier.Config cannot be passed directly to the queryrange package +// due to an import cycle. 
+type IngesterQueryOptions interface { + QueryStoreOnly() bool + QueryIngestersWithin() time.Duration +} diff --git a/pkg/util/time.go b/pkg/util/time.go index 8f9e0c01b0a91..b943fea92aad8 100644 --- a/pkg/util/time.go +++ b/pkg/util/time.go @@ -87,6 +87,8 @@ func NewDisableableTicker(interval time.Duration) (func(), <-chan time.Time) { return func() { tick.Stop() }, tick.C } +const SplitGap = time.Millisecond + // ForInterval splits the given start and end time into given interval. // The start and end time in splits would be aligned to the interval // except for the start time of first split and end time of last split which would be kept same as original start/end @@ -107,7 +109,7 @@ func ForInterval(interval time.Duration, start, end time.Time, endTimeInclusive if !newEnd.Before(end) { newEnd = end } else if endTimeInclusive { - newEnd = newEnd.Add(-time.Millisecond) + newEnd = newEnd.Add(-SplitGap) } if firstInterval { callback(ogStart, newEnd) diff --git a/pkg/validation/limits.go b/pkg/validation/limits.go index 7f1f6ea0d7342..45dd34f201e8d 100644 --- a/pkg/validation/limits.go +++ b/pkg/validation/limits.go @@ -106,6 +106,7 @@ type Limits struct { // Query frontend enforced limits. The default is actually parameterized by the queryrange config. QuerySplitDuration model.Duration `yaml:"split_queries_by_interval" json:"split_queries_by_interval"` MetadataQuerySplitDuration model.Duration `yaml:"split_metadata_queries_by_interval" json:"split_metadata_queries_by_interval"` + IngesterQuerySplitDuration model.Duration `yaml:"split_ingester_queries_by_interval" json:"split_ingester_queries_by_interval"` MinShardingLookback model.Duration `yaml:"min_sharding_lookback" json:"min_sharding_lookback"` MaxQueryBytesRead flagext.ByteSize `yaml:"max_query_bytes_read" json:"max_query_bytes_read"` MaxQuerierBytesRead flagext.ByteSize `yaml:"max_querier_bytes_read" json:"max_querier_bytes_read"` @@ -187,6 +188,7 @@ type Limits struct { BloomCompactorShardSize int `yaml:"bloom_compactor_shard_size" json:"bloom_compactor_shard_size"` BloomCompactorMaxTableAge time.Duration `yaml:"bloom_compactor_max_table_age" json:"bloom_compactor_max_table_age"` BloomCompactorEnabled bool `yaml:"bloom_compactor_enable_compaction" json:"bloom_compactor_enable_compaction"` + BloomCompactorChunksBatchSize int `yaml:"bloom_compactor_chunks_batch_size" json:"bloom_compactor_chunks_batch_size"` BloomNGramLength int `yaml:"bloom_ngram_length" json:"bloom_ngram_length"` BloomNGramSkip int `yaml:"bloom_ngram_skip" json:"bloom_ngram_skip"` BloomFalsePositiveRate float64 `yaml:"bloom_false_positive_rate" json:"bloom_false_positive_rate"` @@ -299,6 +301,9 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { _ = l.MetadataQuerySplitDuration.Set("24h") f.Var(&l.MetadataQuerySplitDuration, "querier.split-metadata-queries-by-interval", "Split metadata queries by a time interval and execute in parallel. The value 0 disables splitting metadata queries by time. This also determines how cache keys are chosen when label/series result caching is enabled.") + _ = l.IngesterQuerySplitDuration.Set("0s") + f.Var(&l.IngesterQuerySplitDuration, "querier.split-ingester-queries-by-interval", "Interval to use for time-based splitting when a request is within the `query_ingesters_within` window; defaults to `split-queries-by-interval` by setting to 0.") + f.StringVar(&l.DeletionMode, "compactor.deletion-mode", "filter-and-delete", "Deletion mode. Can be one of 'disabled', 'filter-only', or 'filter-and-delete'. 
When set to 'filter-only' or 'filter-and-delete', and if retention_enabled is true, then the log entry deletion API endpoints are available.") // Deprecated @@ -312,6 +317,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.BloomCompactorShardSize, "bloom-compactor.shard-size", 1, "The shard size defines how many bloom compactors should be used by a tenant when computing blooms. If it's set to 0, shuffle sharding is disabled.") f.DurationVar(&l.BloomCompactorMaxTableAge, "bloom-compactor.max-table-age", 7*24*time.Hour, "The maximum age of a table before it is compacted. Do not compact tables older than the the configured time. Default to 7 days. 0s means no limit.") f.BoolVar(&l.BloomCompactorEnabled, "bloom-compactor.enable-compaction", false, "Whether to compact chunks into bloom filters.") + f.IntVar(&l.BloomCompactorChunksBatchSize, "bloom-compactor.chunks-batch-size", 100, "The batch size of the chunks the bloom-compactor downloads at once.") f.IntVar(&l.BloomNGramLength, "bloom-compactor.ngram-length", 4, "Length of the n-grams created when computing blooms from log lines.") f.IntVar(&l.BloomNGramSkip, "bloom-compactor.ngram-skip", 0, "Skip factor for the n-grams created when computing blooms from log lines.") f.Float64Var(&l.BloomFalsePositiveRate, "bloom-compactor.false-positive-rate", 0.01, "Scalable Bloom Filter desired false-positive rate.") @@ -574,6 +580,12 @@ func (o *Overrides) MetadataQuerySplitDuration(userID string) time.Duration { return time.Duration(o.getOverridesForUser(userID).MetadataQuerySplitDuration) } +// IngesterQuerySplitDuration returns the tenant specific splitby interval applied in the query frontend when querying +// during the `query_ingesters_within` window. +func (o *Overrides) IngesterQuerySplitDuration(userID string) time.Duration { + return time.Duration(o.getOverridesForUser(userID).IngesterQuerySplitDuration) +} + // MaxQueryBytesRead returns the maximum bytes a query can read. 
func (o *Overrides) MaxQueryBytesRead(_ context.Context, userID string) int { return o.getOverridesForUser(userID).MaxQueryBytesRead.Val() @@ -828,6 +840,10 @@ func (o *Overrides) BloomGatewayEnabled(userID string) bool { return o.getOverridesForUser(userID).BloomGatewayEnabled } +func (o *Overrides) BloomCompactorChunksBatchSize(userID string) int { + return o.getOverridesForUser(userID).BloomCompactorChunksBatchSize +} + func (o *Overrides) BloomCompactorShardSize(userID string) int { return o.getOverridesForUser(userID).BloomCompactorShardSize } diff --git a/pkg/validation/limits_test.go b/pkg/validation/limits_test.go index 4e449e421c5a7..908531f9858f6 100644 --- a/pkg/validation/limits_test.go +++ b/pkg/validation/limits_test.go @@ -6,7 +6,6 @@ import ( "testing" "time" - "github.com/pkg/errors" "github.com/prometheus/common/model" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -290,7 +289,7 @@ query_timeout: 5m } } -func TestLimitsValidation(t *testing.T) { +func TestLimitsValidation_deletionMode(t *testing.T) { for _, tc := range []struct { mode string expected error @@ -300,7 +299,9 @@ func TestLimitsValidation(t *testing.T) { {mode: "filter-and-delete", expected: nil}, {mode: "something-else", expected: deletionmode.ErrUnknownMode}, } { - limits := Limits{DeletionMode: tc.mode} - require.True(t, errors.Is(limits.Validate(), tc.expected)) + t.Run(tc.mode, func(t *testing.T) { + limits := Limits{DeletionMode: tc.mode} + require.ErrorIs(t, limits.Validate(), tc.expected) + }) } } diff --git a/production/docker/config/loki.yaml b/production/docker/config/loki.yaml index 6e4541164a235..0a124e5ccfaae 100644 --- a/production/docker/config/loki.yaml +++ b/production/docker/config/loki.yaml @@ -9,19 +9,20 @@ server: common: path_prefix: /loki - storage: - s3: - endpoint: minio:9000 - insecure: true - bucketnames: loki-data - access_key_id: loki - secret_access_key: supersecret - s3forcepathstyle: true - compactor_address: http://loki-write:3100 + compactor_address: http://loki-backend:3100 replication_factor: 3 +storage_config: + aws: + endpoint: minio:9000 + insecure: true + bucketnames: loki-data + access_key_id: loki + secret_access_key: supersecret + s3forcepathstyle: true + memberlist: - join_members: ["loki-read", "loki-write"] + join_members: ["loki-read", "loki-write", "loki-backend"] dead_node_reclaim_time: 30s gossip_to_dead_nodes_time: 15s left_ingesters_timeout: 30s @@ -54,6 +55,10 @@ ruler: enable_sharding: true wal: dir: /loki/ruler-wal + evaluation: + mode: remote + query_frontend: + address: dns:///loki-read:9095 storage: type: local local: @@ -85,6 +90,13 @@ schema_config: index: prefix: index_ period: 24h + - from: 2024-01-10 + store: tsdb + object_store: s3 + schema: v12 + index: + prefix: index_ + period: 24h limits_config: diff --git a/production/docker/config/prometheus.yaml b/production/docker/config/prometheus.yaml index 9bb03bb209047..3369106f94001 100644 --- a/production/docker/config/prometheus.yaml +++ b/production/docker/config/prometheus.yaml @@ -11,6 +11,7 @@ scrape_configs: - names: - loki-read - loki-write + - loki-backend type: A port: 3100 - job_name: 'promtail' diff --git a/production/docker/docker-compose.yaml b/production/docker/docker-compose.yaml index 5c1b93f829173..a4f74c7bb1182 100644 --- a/production/docker/docker-compose.yaml +++ b/production/docker/docker-compose.yaml @@ -89,7 +89,7 @@ services: - | mkdir -p /data/loki-data && \ mkdir -p /data/loki-ruler && - minio server /data + minio server --address 
"0.0.0.0:9000" --console-address "0.0.0.0:9001" /data environment: - MINIO_ROOT_USER=loki - MINIO_ROOT_PASSWORD=supersecret @@ -97,6 +97,7 @@ services: - MINIO_UPDATE=off ports: - "9000:9000" + - "9001:9001" volumes: - ./.data/minio:/data networks: @@ -116,7 +117,6 @@ services: image: *lokiImage volumes: - ./config:/etc/loki/ - - ./rules:/loki/rules:ro # only needed for interactive debugging with dlv # cap_add: # - SYS_PTRACE @@ -127,7 +127,7 @@ services: - "7946" # uncomment to use interactive debugging # - "40000-40002:40000" # makes the replicas available on ports 40000, 40001, 40002 - command: "-config.file=/etc/loki/loki.yaml -target=read" + command: "-config.file=/etc/loki/loki.yaml -target=read -legacy-read-mode=false" networks: - loki restart: always @@ -161,6 +161,7 @@ services: image: *lokiImage volumes: - ./config:/etc/loki/ + - ./rules:/loki/rules:ro # only needed for interactive debugging with dlv # cap_add: # - SYS_PTRACE diff --git a/production/helm/loki/CHANGELOG.md b/production/helm/loki/CHANGELOG.md index ca04f5d18ce5d..272aa69428852 100644 --- a/production/helm/loki/CHANGELOG.md +++ b/production/helm/loki/CHANGELOG.md @@ -13,6 +13,10 @@ Entries should include a reference to the pull request that introduced the chang [//]: # ( : do not remove this line. This locator is used by the CI pipeline to automatically create a changelog entry for each new Loki release. Add other chart versions and respective changelog entries bellow this line.) +## 5.41.6 + +- [BUGFIX] Added missing namespace to query-scheduler-discovery service when deploying loki in a specific namespace. + ## 5.41.5 - [BUGFIX] Added "swift" type object storage to resolve Loki HELM Chart error. diff --git a/production/helm/loki/Chart.yaml b/production/helm/loki/Chart.yaml index 1e08c0c8f0d1e..cb43a70c965b7 100644 --- a/production/helm/loki/Chart.yaml +++ b/production/helm/loki/Chart.yaml @@ -3,7 +3,7 @@ name: loki description: Helm chart for Grafana Loki in simple, scalable mode type: application appVersion: 2.9.3 -version: 5.41.5 +version: 5.41.6 home: https://grafana.github.io/helm-charts sources: - https://github.com/grafana/loki diff --git a/production/helm/loki/README.md b/production/helm/loki/README.md index ec3360d378d76..6b4ec081e9bba 100644 --- a/production/helm/loki/README.md +++ b/production/helm/loki/README.md @@ -1,6 +1,6 @@ # loki -![Version: 5.41.5](https://img.shields.io/badge/Version-5.41.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.9.3](https://img.shields.io/badge/AppVersion-2.9.3-informational?style=flat-square) +![Version: 5.41.6](https://img.shields.io/badge/Version-5.41.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.9.3](https://img.shields.io/badge/AppVersion-2.9.3-informational?style=flat-square) Helm chart for Grafana Loki in simple, scalable mode diff --git a/production/helm/loki/templates/backend/query-scheduler-discovery.yaml b/production/helm/loki/templates/backend/query-scheduler-discovery.yaml index 01865863e57ef..a9dedbb545649 100644 --- a/production/helm/loki/templates/backend/query-scheduler-discovery.yaml +++ b/production/helm/loki/templates/backend/query-scheduler-discovery.yaml @@ -5,6 +5,7 @@ apiVersion: v1 kind: Service metadata: name: query-scheduler-discovery + namespace: {{ $.Release.Namespace }} labels: {{- include "loki.backendSelectorLabels" . 
| nindent 4 }} prometheus.io/service-monitor: "false" diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json b/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json index bcd620e69e4a9..9d2544082d158 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(cluster_job:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}) / sum(cluster_job:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", diff --git a/production/loki-mixin-compiled/dashboards/loki-writes.json b/production/loki-mixin-compiled/dashboards/loki-writes.json index fdb347f56055f..b7cf83f95f44b 100644 --- a/production/loki-mixin-compiled/dashboards/loki-writes.json +++ b/production/loki-mixin-compiled/dashboards/loki-writes.json @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) 
(cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(cluster_job:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}) / sum(cluster_job:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", diff --git a/production/loki-mixin/dashboards/loki-writes.libsonnet b/production/loki-mixin/dashboards/loki-writes.libsonnet index a12f4f7cea6e0..d5c85337a29db 100644 --- a/production/loki-mixin/dashboards/loki-writes.libsonnet +++ b/production/loki-mixin/dashboards/loki-writes.libsonnet @@ -65,7 +65,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.distributor, + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.distributor + [utils.selector.eq('route', 'api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle')], ) ) ) diff --git a/tools/deprecated-config-checker/checker/checker_test.go b/tools/deprecated-config-checker/checker/checker_test.go index efecefb1700f0..d9fdf4dc607b0 100644 --- a/tools/deprecated-config-checker/checker/checker_test.go +++ b/tools/deprecated-config-checker/checker/checker_test.go @@ -39,6 +39,7 @@ var ( } expectedConfigDeprecates = []string{ + "legacy-read-mode", "ruler.remote_write.client", "index_gateway.ring.replication_factor", "storage_config.bigtable", diff --git a/tools/deprecated-config-checker/deprecated-config.yaml b/tools/deprecated-config-checker/deprecated-config.yaml index ab4c3c073d738..46b89971bdd2e 100644 --- a/tools/deprecated-config-checker/deprecated-config.yaml +++ b/tools/deprecated-config-checker/deprecated-config.yaml @@ -13,6 +13,7 @@ # _msg: "Use tsdb (preferred) or boltdb-shipper instead." # # Note that even though the configs in schema_config takes a list, here we specify the deprecated fields for each item in the list. +legacy-read-mode: "Legacy read SSD mode is deprecated and will be eventually removed. Use the new read and backend targets." 
ruler: remote_write: diff --git a/tools/deprecated-config-checker/test-fixtures/config.yaml b/tools/deprecated-config-checker/test-fixtures/config.yaml index d5a326c8647f3..be875f3ac10f8 100644 --- a/tools/deprecated-config-checker/test-fixtures/config.yaml +++ b/tools/deprecated-config-checker/test-fixtures/config.yaml @@ -1,4 +1,5 @@ auth_enabled: false +legacy-read-mode: true server: http_listen_port: 3100 diff --git a/vendor/github.com/grafana/dskit/spanprofiler/README.md b/vendor/github.com/grafana/dskit/spanprofiler/README.md new file mode 100644 index 0000000000000..a415985f6649e --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/README.md @@ -0,0 +1,104 @@ +# Span Profiler for OpenTracing-Go + +## Overview + +The Span Profiler for OpenTracing-Go is a package that seamlessly integrates `opentracing-go` instrumentation with +profiling through the use of pprof labels. + +Accessing trace span profiles is made convenient through the Grafana Explore view. You can find a complete example setup +with Grafana Tempo in the [Pyroscope repository](https://github.com/grafana/pyroscope/tree/main/examples/tracing/tempo): + +![image](https://github.com/grafana/otel-profiling-go/assets/12090599/31e33cd1-818b-4116-b952-c9ec7b1fb593) + +## Usage + +There are two primary ways to use the Span Profiler: + +### 1. Wrap the Global Tracer. + +You can wrap the global tracer using `spanprofiler.NewTracer`: + +```go +import ( + "github.com/opentracing/opentracing-go" + "github.com/grafana/dskit/spanprofiler" ) + +func main() { + // Initialize your OpenTracing tracer + tracer := opentracing.GlobalTracer() + // Wrap it with the tracer-profiler + wrappedTracer := spanprofiler.NewTracer(tracer) + // Use the wrapped tracer in your application + opentracing.SetGlobalTracer(wrappedTracer) + + // Or, as a one-liner: + // opentracing.SetGlobalTracer(spanprofiler.NewTracer(opentracing.GlobalTracer())) + + // Your application logic here +} +``` + +For efficiency, the tracer selectively records profiles for _root_ spans — the initial _local_ span in a process — since +a trace may encompass thousands of spans. All stack trace samples accumulated during the execution of their child spans +contribute to the root span's profile. In practical terms, this signifies that, for instance, an HTTP request results +in a singular profile, irrespective of the numerous spans within the trace. It's important to note that these profiles +don't extend beyond the boundaries of a single process. + +The limitation of this approach is that only spans created within the same goroutine, or its children, as the parent are +taken into account. Consequently, in scenarios involving asynchronous execution, where the parent span context is passed +to another goroutine, explicit profiling becomes necessary using `spanprofiler.StartSpanFromContext`. + +### 2. Profile individual spans. + +The `spanprofiler.StartSpanFromContext` function allows you to granularly control which spans to profile: + +```go +func YourOperationName(ctx context.Context) { + // Start a span and enable profiling for it + span, ctx := spanprofiler.StartSpanFromContext(ctx, "YourOperationName") + defer span.Finish() // Finish the span when done + + // Use the span in your application logic +} +``` + +The function guarantees that the span will be profiled. + +Both methods can be employed either in conjunction or independently. 
Our recommendation is to utilize the tracer for +seamless integration, reserving explicit span profiling only for cases where spans are spawned in detached goroutines. + +## Implementation details + +When a new trace span is created and is eligible for profiling, the tracer sets `span_id` and `span_name` [pprof labels](https://github.com/google/pprof/blob/master/doc/README.md#tag-filtering) +that point to the respective span. These labels are stored in the goroutine's local storage and inherited by any +subsequent child goroutines. + +`span_name` is available as a regular label and can be used in query expressions. For example, the following query +will show you the profile for code that is not covered by traces: +``` +{service_name="my-service",span_name=""} +``` + +Additionally, trace spans are identified by the `pyroscope.profile.id` attribute, indicating the associated profile. +This allows you to find such spans in the trace view (in the screenshot) and fetch profiles for specific spans. + +It's important to note that the presence of this attribute does not guarantee profile availability; stack trace samples +might not be collected if the CPU time utilized falls below the sample interval (10ms). + +It is crucial to understand that this module doesn't directly control the pprof profiler; its initialization is still +necessary for profile collection. This initialization can be achieved through the `runtime/pprof` package, or using the +[Pyroscope client](https://github.com/grafana/pyroscope-go). + +Limitations: + - Only CPU profiling is fully supported at the moment. + - Only the [Jaeger tracer](https://github.com/jaegertracing/jaeger-client-go) implementation is supported. + +## Performance implications + +The typical performance impact is generally imperceptible and primarily arises from the cost of pprof labeling. However, +intensive use of pprof labels may have a negative impact on the profiled application. + +In the case of the tracer provided by this package, the `StartSpan` method wrapper introduces an approximate 20% increase +in CPU time compared to the original call. In the vast majority of cases, the overhead constitutes less than 0.01% of the total +CPU time and is considered safe for deployment in production systems. diff --git a/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go b/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go new file mode 100644 index 0000000000000..8481d04498d5a --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go @@ -0,0 +1,107 @@ +package spanprofiler + +import ( + "context" + "runtime/pprof" + + "github.com/opentracing/opentracing-go" + "github.com/uber/jaeger-client-go" +) + +// StartSpanFromContext starts and returns a Span with `operationName`, using +// any Span found within `ctx` as a ChildOfRef. If no such parent could be +// found, StartSpanFromContext creates a root (parentless) Span. +// +// The call sets `operationName` as `span_name` pprof label, and the new span +// identifier as `span_id` pprof label, if the trace is sampled. +// +// The second return value is a context.Context object built around the +// returned Span. +// +// Example usage: +// +// SomeFunction(ctx context.Context, ...) { +// sp, ctx := opentracing.StartSpanFromContext(ctx, "SomeFunction") +// defer sp.Finish() +// ... 
+// } +func StartSpanFromContext(ctx context.Context, operationName string, opts ...opentracing.StartSpanOption) (opentracing.Span, context.Context) { + return StartSpanFromContextWithTracer(ctx, opentracing.GlobalTracer(), operationName, opts...) +} + +// StartSpanFromContextWithTracer starts and returns a span with `operationName` +// using a span found within the context as a ChildOfRef. If that doesn't exist +// it creates a root span. It also returns a context.Context object built +// around the returned span. +// +// The call sets `operationName` as `span_name` pprof label, and the new span +// identifier as `span_id` pprof label, if the trace is sampled. +// +// Its behavior is identical to StartSpanFromContext except that it takes an explicit +// tracer as opposed to using the global tracer. +func StartSpanFromContextWithTracer(ctx context.Context, tracer opentracing.Tracer, operationName string, opts ...opentracing.StartSpanOption) (opentracing.Span, context.Context) { + span, ctx := opentracing.StartSpanFromContextWithTracer(ctx, tracer, operationName, opts...) + spanCtx, ok := span.Context().(jaeger.SpanContext) + if ok { + span = wrapJaegerSpanWithGoroutineLabels(ctx, span, operationName, sampledSpanID(spanCtx)) + } + return span, ctx +} + +func wrapJaegerSpanWithGoroutineLabels( + parentCtx context.Context, + span opentracing.Span, + operationName string, + spanID string, +) *spanWrapper { + // Note that pprof labels are propagated through the goroutine's local + // storage and are always copied to child goroutines. This way, stack + // trace samples collected during execution of child spans will be taken + // into account at the root. + var ctx context.Context + if spanID != "" { + ctx = pprof.WithLabels(parentCtx, pprof.Labels( + spanNameLabelName, operationName, + spanIDLabelName, spanID)) + } else { + // Even if the trace has not been sampled, we still need to keep track + // of samples that belong to the span (all spans with the given name). + ctx = pprof.WithLabels(parentCtx, pprof.Labels( + spanNameLabelName, operationName)) + } + // Goroutine labels should be set as early as possible, + // in order to capture the overhead of the function call. + pprof.SetGoroutineLabels(ctx) + // We create a span wrapper to ensure we remove the newly attached pprof + // labels when the span finishes. The need for this wrapper is debatable: + // as we do not have the original context, we could leave the goroutine + // labels – normally, a span is finished at the very end of the goroutine's + // lifetime, so no significant side effects should take place. + w := spanWrapper{ + parentPprofCtx: parentCtx, + currentPprofCtx: ctx, + } + w.Span = span.SetTag(profileIDTagKey, spanID) + return &w +} + +type spanWrapper struct { + parentPprofCtx context.Context + currentPprofCtx context.Context + opentracing.Span +} + +func (s *spanWrapper) Finish() { + s.Span.Finish() + pprof.SetGoroutineLabels(s.parentPprofCtx) + s.currentPprofCtx = s.parentPprofCtx +} + +// sampledSpanID returns the span ID, if the span is sampled, +// otherwise an empty string is returned. 
+func sampledSpanID(spanCtx jaeger.SpanContext) string { + if spanCtx.IsSampled() { + return spanCtx.SpanID().String() + } + return "" +} diff --git a/vendor/github.com/grafana/dskit/spanprofiler/tracer.go b/vendor/github.com/grafana/dskit/spanprofiler/tracer.go new file mode 100644 index 0000000000000..c28b52b11d444 --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/tracer.go @@ -0,0 +1,109 @@ +package spanprofiler + +import ( + "context" + "unsafe" + + "github.com/opentracing/opentracing-go" + "github.com/uber/jaeger-client-go" +) + +const ( + profileIDTagKey = "pyroscope.profile.id" + + spanIDLabelName = "span_id" + spanNameLabelName = "span_name" +) + +type tracer struct{ opentracing.Tracer } + +// NewTracer creates a new opentracing.Tracer with the span profiler integrated. +// +// For efficiency, the tracer selectively records profiles for _root_ spans +// — the initial _local_ span in a process — since a trace may encompass +// thousands of spans. All stack trace samples accumulated during the execution +// of their child spans contribute to the root span's profile. In practical +// terms, this signifies that, for instance, an HTTP request results in a +// singular profile, irrespective of the numerous spans within the trace. It's +// important to note that these profiles don't extend beyond the boundaries of +// a single process. +// +// The limitation of this approach is that only spans created within the same +// goroutine, or its children, as the parent are taken into account. +// Consequently, in scenarios involving asynchronous execution, where the parent +// span context is passed to another goroutine, explicit profiling becomes +// necessary using `spanprofiler.StartSpanFromContext`. +func NewTracer(tr opentracing.Tracer) opentracing.Tracer { return &tracer{tr} } + +func (t *tracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span { + span := t.Tracer.StartSpan(operationName, opts...) + spanCtx, ok := span.Context().(jaeger.SpanContext) + if !ok { + return span + } + // pprof labels are attached only once, at the span root level. + if !isRootSpan(opts...) { + return span + } + // The pprof label API assumes that pairs of labels are passed through the + // context. Unfortunately, the opentracing Tracer API doesn't match this + // concept: this makes it impossible to save an existing pprof context and + // all the original pprof labels associated with the goroutine. + ctx := context.Background() + return wrapJaegerSpanWithGoroutineLabels(ctx, span, operationName, sampledSpanID(spanCtx)) +} + +// isRootSpan reports whether the span is a root span. +// +// There are only two valid cases: if the span is the first span in the trace, +// or is the first _local_ span in the trace. +// +// An exception is made for FollowsFrom reference: spans without an explicit +// parent are considered as root ones. +func isRootSpan(opts ...opentracing.StartSpanOption) bool { + parent, ok := parentSpanContextFromRef(opts...) + return !ok || isRemoteSpan(parent) +} + +// parentSpanContextFromRef returns the first parent reference. 
+func parentSpanContextFromRef(options ...opentracing.StartSpanOption) (sc jaeger.SpanContext, ok bool) { + var sso opentracing.StartSpanOptions + for _, option := range options { + option.Apply(&sso) + } + for _, ref := range sso.References { + if ref.Type == opentracing.ChildOfRef && ref.ReferencedContext != nil { + sc, ok = ref.ReferencedContext.(jaeger.SpanContext) + return sc, ok + } + } + return sc, ok +} + +// isRemoteSpan reports whether the span context represents a remote parent. +// +// NOTE(kolesnikovae): this is ugly, but the only reliable method I found. +// The opentracing-go package and Jaeger client are not meant to change as +// both are deprecated. +func isRemoteSpan(c jaeger.SpanContext) bool { + jaegerCtx := *(*jaegerSpanCtx)(unsafe.Pointer(&c)) + return jaegerCtx.remote +} + +// jaegerSpanCtx represents memory layout of the jaeger.SpanContext type. +type jaegerSpanCtx struct { + traceID [16]byte // TraceID + spanID [8]byte // SpanID + parentID [8]byte // SpanID + baggage uintptr // map[string]string + debugID [2]uintptr // string + + // samplingState is a pointer to a struct that has "localRootSpan" member, + // which we could probably use: that would allow omitting quite expensive + // parentSpanContextFromRef call. However, interpreting the pointer and + // the complex struct memory layout is more complicated and dangerous. + samplingState uintptr + + // remote indicates that span context represents a remote parent + remote bool +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 8ce4557f461ba..b69a2f1e5315d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -894,6 +894,7 @@ github.com/grafana/dskit/server github.com/grafana/dskit/services github.com/grafana/dskit/signals github.com/grafana/dskit/spanlogger +github.com/grafana/dskit/spanprofiler github.com/grafana/dskit/tenant github.com/grafana/dskit/test github.com/grafana/dskit/tracing
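The vendored spanprofiler package above is built entirely on standard pprof goroutine labels, as its README describes. The sketch below is a minimal, hypothetical illustration of that mechanism and is not part of this change; the label values and function names are invented, and only the `runtime/pprof` API is assumed. It shows why labeling only the root operation is enough: labels applied around the root are inherited by child goroutines, so their CPU samples are attributed to the root span's `span_name`.

```go
package main

import (
	"context"
	"log"
	"os"
	"runtime/pprof"
	"sync"
)

// handleRequest stands in for the work done under a root span. Labels applied
// here live in goroutine-local storage and are copied to every goroutine
// started from this one, which is why the span profiler only labels root
// spans yet still captures the work of all child spans in the same process.
func handleRequest(ctx context.Context) {
	labels := pprof.Labels("span_name", "HandleRequest", "span_id", "abcdef0123456789")
	// pprof.Do scopes the labels to the callback and restores the previous
	// label set afterwards, much like spanWrapper.Finish does.
	pprof.Do(ctx, labels, func(ctx context.Context) {
		var wg sync.WaitGroup
		wg.Add(1)
		go func() { // child goroutine inherits the span_name/span_id labels
			defer wg.Done()
			busyWork()
		}()
		busyWork()
		wg.Wait()
	})
}

func busyWork() {
	sum := 0
	for i := 0; i < 50_000_000; i++ {
		sum += i % 7
	}
	_ = sum
}

func main() {
	// The labels only matter if a CPU profile is actually being collected,
	// e.g. via runtime/pprof as here, or via the Pyroscope client mentioned
	// in the README.
	f, err := os.Create("cpu.pprof")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	if err := pprof.StartCPUProfile(f); err != nil {
		log.Fatal(err)
	}
	defer pprof.StopCPUProfile()

	handleRequest(context.Background())
}
```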
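Returning to the bloomshipper change at the top of this section: the rewritten `isOutsideRange` no longer bisects a flat fingerprint slice but walks the sorted `fpRange` boundaries, reporting a block as out of range when it lies below the first range, above the last range, or entirely inside a gap between two consecutive ranges. The following self-contained sketch uses simplified stand-in types rather than the actual Loki code, and mirrors only the fingerprint part of the check against the cases exercised in `TestIsOutsideRange`.

```go
package main

import "fmt"

// fpRange is a stand-in for the bloomshipper fingerprint range: [Min, Max], inclusive.
type fpRange struct{ Min, Max uint64 }

// block is a stand-in for the fingerprint bounds carried by a BlockRef.
type block struct{ MinFp, MaxFp uint64 }

// isOutsideFingerprints reports whether the block falls entirely outside the
// sorted fingerprint ranges: below the first range, above the last range, or
// wholly inside a gap between two consecutive ranges.
func isOutsideFingerprints(b block, ranges []fpRange) bool {
	if len(ranges) == 0 {
		return false // no ranges to test against in this simplified sketch
	}
	first, last := ranges[0], ranges[len(ranges)-1]
	if b.MaxFp < first.Min || b.MinFp > last.Max {
		return true
	}
	prev := fpRange{0, 0}
	for _, r := range ranges {
		if b.MinFp > prev.Max && b.MaxFp < r.Min {
			return true // block sits entirely in the gap between prev and r
		}
		prev = r
	}
	return false
}

func main() {
	ranges := []fpRange{{0, 99}, {200, 299}}
	fmt.Println(isOutsideFingerprints(block{100, 199}, ranges)) // true: inside the gap
	fmt.Println(isOutsideFingerprints(block{50, 250}, ranges))  // false: spans across the gap
}
```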