From 5559b26856178e5d0d1c2c590017530423666c8e Mon Sep 17 00:00:00 2001 From: Joao Marcal Date: Thu, 11 Jan 2024 09:14:22 +0100 Subject: [PATCH 01/21] operator: Fixes Ruler RBAC to allow it to send alerts to UWM AM (#11620) Updated RBAC now match the new updated requirements by UWM Alertmanager introduced on: https://github.com/openshift/cluster-monitoring-operator/pull/2099 --- .../loki-operator.clusterserviceversion.yaml | 8 +++++++- .../loki-operator.clusterserviceversion.yaml | 8 +++++++- .../loki-operator.clusterserviceversion.yaml | 8 +++++++- operator/config/rbac/role.yaml | 6 ++++++ operator/controllers/loki/lokistack_controller.go | 1 + operator/internal/manifests/openshift/rbac.go | 11 +++++++++++ 6 files changed, 39 insertions(+), 3 deletions(-) diff --git a/operator/bundle/community-openshift/manifests/loki-operator.clusterserviceversion.yaml b/operator/bundle/community-openshift/manifests/loki-operator.clusterserviceversion.yaml index 36151790a2099..2915af504fd39 100644 --- a/operator/bundle/community-openshift/manifests/loki-operator.clusterserviceversion.yaml +++ b/operator/bundle/community-openshift/manifests/loki-operator.clusterserviceversion.yaml @@ -150,7 +150,7 @@ metadata: categories: OpenShift Optional, Logging & Tracing certified: "false" containerImage: docker.io/grafana/loki-operator:0.5.0 - createdAt: "2023-12-12T09:22:19Z" + createdAt: "2024-01-10T18:25:00Z" description: The Community Loki Operator provides Kubernetes native deployment and management of Loki and related logging components. features.operators.openshift.io/disconnected: "true" @@ -1591,6 +1591,12 @@ spec: - alertmanagers verbs: - patch + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers/api + verbs: + - create - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/bundle/community/manifests/loki-operator.clusterserviceversion.yaml b/operator/bundle/community/manifests/loki-operator.clusterserviceversion.yaml index 322bc606611f3..b78b8f6d30b98 100644 --- a/operator/bundle/community/manifests/loki-operator.clusterserviceversion.yaml +++ b/operator/bundle/community/manifests/loki-operator.clusterserviceversion.yaml @@ -150,7 +150,7 @@ metadata: categories: OpenShift Optional, Logging & Tracing certified: "false" containerImage: docker.io/grafana/loki-operator:0.5.0 - createdAt: "2023-12-12T09:22:17Z" + createdAt: "2024-01-10T18:24:59Z" description: The Community Loki Operator provides Kubernetes native deployment and management of Loki and related logging components. operators.operatorframework.io/builder: operator-sdk-unknown @@ -1571,6 +1571,12 @@ spec: - alertmanagers verbs: - patch + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers/api + verbs: + - create - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/bundle/openshift/manifests/loki-operator.clusterserviceversion.yaml b/operator/bundle/openshift/manifests/loki-operator.clusterserviceversion.yaml index f4a951400e946..b0fca996ce78f 100644 --- a/operator/bundle/openshift/manifests/loki-operator.clusterserviceversion.yaml +++ b/operator/bundle/openshift/manifests/loki-operator.clusterserviceversion.yaml @@ -150,7 +150,7 @@ metadata: categories: OpenShift Optional, Logging & Tracing certified: "false" containerImage: quay.io/openshift-logging/loki-operator:0.1.0 - createdAt: "2023-12-12T09:22:21Z" + createdAt: "2024-01-10T18:25:02Z" description: | The Loki Operator for OCP provides a means for configuring and managing a Loki stack for cluster logging. 
## Prerequisites and Requirements @@ -1576,6 +1576,12 @@ spec: - alertmanagers verbs: - patch + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers/api + verbs: + - create - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/config/rbac/role.yaml b/operator/config/rbac/role.yaml index d7b881ef8e33d..09dc60b8c33b9 100644 --- a/operator/config/rbac/role.yaml +++ b/operator/config/rbac/role.yaml @@ -175,6 +175,12 @@ rules: - alertmanagers verbs: - patch +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers/api + verbs: + - create - apiGroups: - monitoring.coreos.com resources: diff --git a/operator/controllers/loki/lokistack_controller.go b/operator/controllers/loki/lokistack_controller.go index 49b5bdab069e0..487390d7287bd 100644 --- a/operator/controllers/loki/lokistack_controller.go +++ b/operator/controllers/loki/lokistack_controller.go @@ -123,6 +123,7 @@ type LokiStackReconciler struct { // +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings;clusterroles;roles;rolebindings,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=get;list;watch;create;update;delete // +kubebuilder:rbac:groups=monitoring.coreos.com,resources=alertmanagers,verbs=patch +// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=alertmanagers/api,verbs=create // +kubebuilder:rbac:urls=/api/v2/alerts,verbs=create // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;create;update // +kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses,verbs=get;list;watch;create;update diff --git a/operator/internal/manifests/openshift/rbac.go b/operator/internal/manifests/openshift/rbac.go index ebd464274c437..46e5837a2c262 100644 --- a/operator/internal/manifests/openshift/rbac.go +++ b/operator/internal/manifests/openshift/rbac.go @@ -108,6 +108,17 @@ func BuildRulerClusterRole(opts Options) *rbacv1.ClusterRole { "create", }, }, + { + APIGroups: []string{ + "monitoring.coreos.com", + }, + Resources: []string{ + "alertmanagers/api", + }, + Verbs: []string{ + "create", + }, + }, }, } } From 88e957af58ff71d80e1722a0140a1da3d582958c Mon Sep 17 00:00:00 2001 From: Karsten Jeschkies Date: Thu, 11 Jan 2024 11:56:52 +0100 Subject: [PATCH 02/21] Skip optimization for DownstreamSampleExpr (#11651) This fixes a four year old bug. The `optimizeSampleExpr` was switching on the wrong `DownstreamSampleExpr` type. This bug only surfaced because of the recent fix how binop expressions are executed. --- pkg/logql/optimize.go | 2 +- pkg/logql/shardmapper.go | 2 +- pkg/logql/shardmapper_test.go | 72 ++++++++++++++++++++++++++++++++++- 3 files changed, 72 insertions(+), 4 deletions(-) diff --git a/pkg/logql/optimize.go b/pkg/logql/optimize.go index 2f9c80a64f918..9b885b0fd229c 100644 --- a/pkg/logql/optimize.go +++ b/pkg/logql/optimize.go @@ -8,7 +8,7 @@ func optimizeSampleExpr(expr syntax.SampleExpr) (syntax.SampleExpr, error) { // we skip sharding AST for now, it's not easy to clone them since they are not part of the language. 
expr.Walk(func(e syntax.Expr) { switch e.(type) { - case *ConcatSampleExpr, *DownstreamSampleExpr, *QuantileSketchEvalExpr, *QuantileSketchMergeExpr: + case *ConcatSampleExpr, DownstreamSampleExpr, *QuantileSketchEvalExpr, *QuantileSketchMergeExpr: skip = true return } diff --git a/pkg/logql/shardmapper.go b/pkg/logql/shardmapper.go index 4a06b5f804e84..e8d78a438c9bb 100644 --- a/pkg/logql/shardmapper.go +++ b/pkg/logql/shardmapper.go @@ -128,7 +128,7 @@ func (m ShardMapper) mapBinOpExpr(e *syntax.BinOpExpr, r *downstreamRecorder) (* if err != nil { return nil, 0, err } - if isNoOp(e.SampleExpr, rhsMapped) && !isLiteralOrVector(rhsMapped) { + if isNoOp(e.RHS, rhsMapped) && !isLiteralOrVector(rhsMapped) { // TODO: check if literal or vector rhsMapped = DownstreamSampleExpr{ shard: nil, diff --git a/pkg/logql/shardmapper_test.go b/pkg/logql/shardmapper_test.go index 7a02640c81491..96955109a9413 100644 --- a/pkg/logql/shardmapper_test.go +++ b/pkg/logql/shardmapper_test.go @@ -1446,16 +1446,84 @@ func TestMapping(t *testing.T) { }, }, }, + { + in: `quantile_over_time(0.99, {a="foo"} | unwrap bytes [1s]) by (a, b) > 1`, + expr: &syntax.BinOpExpr{ + SampleExpr: DownstreamSampleExpr{ + SampleExpr: &syntax.RangeAggregationExpr{ + Operation: syntax.OpRangeTypeQuantile, + Params: float64p(0.99), + Left: &syntax.LogRange{ + Left: &syntax.MatchersExpr{ + Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "a", "foo")}, + }, + Unwrap: &syntax.UnwrapExpr{ + Identifier: "bytes", + }, + Interval: 1 * time.Second, + }, + Grouping: &syntax.Grouping{ + Groups: []string{"a", "b"}, + }, + }, + }, + RHS: &syntax.LiteralExpr{ + Val: 1, + }, + Op: syntax.OpTypeGT, + Opts: &syntax.BinOpOptions{ + ReturnBool: false, + VectorMatching: &syntax.VectorMatching{}, + }, + }, + }, + { + in: `1 < quantile_over_time(0.99, {a="foo"} | unwrap bytes [1s]) by (a, b)`, + expr: &syntax.BinOpExpr{ + SampleExpr: &syntax.LiteralExpr{ + Val: 1, + }, + RHS: DownstreamSampleExpr{ + SampleExpr: &syntax.RangeAggregationExpr{ + Operation: syntax.OpRangeTypeQuantile, + Params: float64p(0.99), + Left: &syntax.LogRange{ + Left: &syntax.MatchersExpr{ + Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "a", "foo")}, + }, + Unwrap: &syntax.UnwrapExpr{ + Identifier: "bytes", + }, + Interval: 1 * time.Second, + }, + Grouping: &syntax.Grouping{ + Groups: []string{"a", "b"}, + }, + }, + }, + Op: syntax.OpTypeLT, + Opts: &syntax.BinOpOptions{ + ReturnBool: false, + VectorMatching: &syntax.VectorMatching{}, + }, + }, + }, } { t.Run(tc.in, func(t *testing.T) { ast, err := syntax.ParseExpr(tc.in) require.Equal(t, tc.err, err) mapped, _, err := m.Map(ast, nilShardMetrics.downstreamRecorder()) + switch e := mapped.(type) { + case syntax.SampleExpr: + optimized, err := optimizeSampleExpr(e) + require.NoError(t, err) + require.Equal(t, mapped.String(), optimized.String()) + } require.Equal(t, tc.err, err) - require.Equal(t, mapped.String(), tc.expr.String()) - require.Equal(t, mapped, tc.expr) + require.Equal(t, tc.expr.String(), mapped.String()) + require.Equal(t, tc.expr, mapped) }) } } From 2182bacd9ddd0de4e881192cc545168ad85d65fb Mon Sep 17 00:00:00 2001 From: Karsten Jeschkies Date: Thu, 11 Jan 2024 12:06:12 +0100 Subject: [PATCH 03/21] Validate dev cluster config in CI (#11642) **What this PR does / why we need it**: The configuration in `tools/dev/loki-boltdb-storage-s3` was broken in the past because it's easy to miss. This change covers the validation in the CI to raise awareness. 
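For reviewers who want to exercise the new check locally, here is a minimal sketch based on the Makefile target and Drone step added below (run from the repository root; `BUILD_IN_CONTAINER=false` mirrors the CI invocation):

```bash
# Validate the dev cluster config the same way the new CI step does.
# The target depends on the `loki` target, so the binary is built first.
make BUILD_IN_CONTAINER=false validate-dev-cluster-config

# Under the hood the target runs the binary's config verification against the dev config:
./cmd/loki/loki -config.file=./tools/dev/loki-boltdb-storage-s3/config/loki.yaml -verify-config
```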
**Checklist** - [ ] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ ] Documentation added - [ ] Tests updated - [ ] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --- .drone/drone.jsonnet | 1 + .drone/drone.yml | 9 ++++++++- Makefile | 3 +++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.drone/drone.jsonnet b/.drone/drone.jsonnet index d9bfee681f2a3..6dcea160a78c4 100644 --- a/.drone/drone.jsonnet +++ b/.drone/drone.jsonnet @@ -640,6 +640,7 @@ local build_image_tag = '0.33.0'; 'GIT_TARGET_BRANCH="$DRONE_TARGET_BRANCH"', ]) { depends_on: ['loki'], when: onPRs }, make('validate-example-configs', container=false) { depends_on: ['loki'] }, + make('validate-dev-cluster-config', container=false) { depends_on: ['loki'] }, make('check-example-config-doc', container=false) { depends_on: ['clone'] }, { name: 'build-docs-website', diff --git a/.drone/drone.yml b/.drone/drone.yml index a4b2b44299cee..d45f7898a0851 100644 --- a/.drone/drone.yml +++ b/.drone/drone.yml @@ -303,6 +303,13 @@ steps: environment: {} image: grafana/loki-build-image:0.33.0 name: validate-example-configs +- commands: + - make BUILD_IN_CONTAINER=false validate-dev-cluster-config + depends_on: + - loki + environment: {} + image: grafana/loki-build-image:0.33.0 + name: validate-dev-cluster-config - commands: - make BUILD_IN_CONTAINER=false check-example-config-doc depends_on: @@ -2106,6 +2113,6 @@ kind: secret name: gpg_private_key --- kind: signature -hmac: 30f2fb121d8271e00dc2ae8fe83a32e0e22fd2bd268609d0c3f295033fcd4fb6 +hmac: fe7669a21410ae5f2d1ad6b6205fdc582af874f65f7bd6a679731a88174e3a1c ... diff --git a/Makefile b/Makefile index 50938cac56d91..d311ed1c4f3c6 100644 --- a/Makefile +++ b/Makefile @@ -801,6 +801,9 @@ EXAMPLES_SKIP_VALIDATION_FLAG := "doc-example:skip-validation=true" validate-example-configs: loki for f in $$(grep -rL $(EXAMPLES_SKIP_VALIDATION_FLAG) $(EXAMPLES_YAML_PATH)/*.yaml); do echo "Validating provided example config: $$f" && ./cmd/loki/loki -config.file=$$f -verify-config || exit 1; done +validate-dev-cluster-config: loki + ./cmd/loki/loki -config.file=./tools/dev/loki-boltdb-storage-s3/config/loki.yaml -verify-config + # Dynamically generate ./docs/sources/configure/examples.md using the example configs that we provide. # This target should be run if any of our example configs change. 
generate-example-config-doc: From 88aaa7dc5eff24c8528294cf9d6c0314afdcbe1e Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 11 Jan 2024 13:37:37 +0200 Subject: [PATCH 04/21] Cache: correctly check background cache size (#11654) **What this PR does / why we need it**: The size check was not being performed atomically, which led to flakiness in the `TestBackgroundSizeLimit` test. --------- Signed-off-by: Danny Kopping Co-authored-by: Christian Haudum --- CHANGELOG.md | 1 + pkg/storage/chunk/cache/background.go | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 348e1dc86ba9c..357714b030b1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ * [11539](https://github.com/grafana/loki/pull/11539) **kaviraj,ashwanthgoli** Support caching /series and /labels query results * [11545](https://github.com/grafana/loki/pull/11545) **dannykopping** Force correct memcached timeout when fetching chunks. * [11589](https://github.com/grafana/loki/pull/11589) **ashwanthgoli** Results Cache: Adds `query_length_served` cache stat to measure the length of the query served from cache. +* [11654](https://github.com/grafana/loki/pull/11654) **dannykopping** Cache: atomically check background cache size limit correctly. ##### Fixes * [11074](https://github.com/grafana/loki/pull/11074) **hainenber** Fix panic in lambda-promtail due to mishandling of empty DROP_LABELS env var. diff --git a/pkg/storage/chunk/cache/background.go b/pkg/storage/chunk/cache/background.go index 16feb62551f5b..299444c6a54e0 100644 --- a/pkg/storage/chunk/cache/background.go +++ b/pkg/storage/chunk/cache/background.go @@ -148,8 +148,11 @@ func (c *backgroundCache) Store(ctx context.Context, keys []string, bufs [][]byt } size := bgWrite.size() - newSize := c.size.Load() + int64(size) + // prospectively add new size + newSize := c.size.Add(int64(size)) if newSize > int64(c.sizeLimit) { + // subtract it since we've exceeded the limit + c.size.Sub(int64(size)) c.failStore(ctx, size, num, "queue at byte size limit") return nil } From e7b9455327446a0960967db134d76c4cb11156d7 Mon Sep 17 00:00:00 2001 From: Robert Jacob Date: Thu, 11 Jan 2024 13:58:40 +0100 Subject: [PATCH 05/21] operator: React to changes in ConfigMap used for storage CA (#11624) --- operator/CHANGELOG.md | 1 + .../controllers/loki/lokistack_controller.go | 42 +++++++++++++++---- .../loki/lokistack_controller_test.go | 19 ++++++--- .../handlers/internal/storage/ca_configmap.go | 39 ++++++++++++++--- .../internal/storage/ca_configmap_test.go | 26 ++++++++---- .../handlers/lokistack_create_or_update.go | 7 +++- .../lokistack_create_or_update_test.go | 2 +- 7 files changed, 107 insertions(+), 29 deletions(-) diff --git a/operator/CHANGELOG.md b/operator/CHANGELOG.md index 9ea61a0dba4e5..f6cfa9a5cda01 100644 --- a/operator/CHANGELOG.md +++ b/operator/CHANGELOG.md @@ -1,5 +1,6 @@ ## Main +- [11624](https://github.com/grafana/loki/pull/11624) **xperimental**: React to changes in ConfigMap used for storage CA - [11481](https://github.com/grafana/loki/pull/11481) **JoaoBraveCoding**: Adds AWS STS support - [11533](https://github.com/grafana/loki/pull/11533) **periklis**: Add serviceaccount per LokiStack resource - [11158](https://github.com/grafana/loki/pull/11158) **btaani**: operator: Add warning for old schema configuration diff --git a/operator/controllers/loki/lokistack_controller.go b/operator/controllers/loki/lokistack_controller.go index 487390d7287bd..629ee85d5edd7 100644 --- 
a/operator/controllers/loki/lokistack_controller.go +++ b/operator/controllers/loki/lokistack_controller.go @@ -94,12 +94,7 @@ var ( }) createUpdateOrDeletePred = builder.WithPredicates(predicate.Funcs{ UpdateFunc: func(e event.UpdateEvent) bool { - if e.ObjectOld.GetGeneration() == 0 && len(e.ObjectOld.GetAnnotations()) == 0 { - return e.ObjectOld.GetResourceVersion() != e.ObjectNew.GetResourceVersion() - } - - return e.ObjectOld.GetGeneration() != e.ObjectNew.GetGeneration() || - cmp.Diff(e.ObjectOld.GetAnnotations(), e.ObjectNew.GetAnnotations()) != "" + return e.ObjectOld.GetResourceVersion() != e.ObjectNew.GetResourceVersion() }, CreateFunc: func(e event.CreateEvent) bool { return true }, DeleteFunc: func(e event.DeleteEvent) bool { return true }, @@ -207,7 +202,8 @@ func (r *LokiStackReconciler) buildController(bld k8s.Builder) error { Owns(&rbacv1.Role{}, updateOrDeleteOnlyPred). Owns(&rbacv1.RoleBinding{}, updateOrDeleteOnlyPred). Watches(&corev1.Service{}, r.enqueueForAlertManagerServices(), createUpdateOrDeletePred). - Watches(&corev1.Secret{}, r.enqueueForStorageSecret(), createUpdateOrDeletePred) + Watches(&corev1.Secret{}, r.enqueueForStorageSecret(), createUpdateOrDeletePred). + Watches(&corev1.ConfigMap{}, r.enqueueForStorageCA(), createUpdateOrDeletePred) if r.FeatureGates.LokiStackAlerts { bld = bld.Owns(&monitoringv1.PrometheusRule{}, updateOrDeleteOnlyPred) @@ -324,3 +320,35 @@ func (r *LokiStackReconciler) enqueueForStorageSecret() handler.EventHandler { return requests }) } + +func (r *LokiStackReconciler) enqueueForStorageCA() handler.EventHandler { + return handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []reconcile.Request { + lokiStacks := &lokiv1.LokiStackList{} + if err := r.Client.List(ctx, lokiStacks, client.InNamespace(obj.GetNamespace())); err != nil { + r.Log.Error(err, "Error listing LokiStack resources for storage CA update") + return nil + } + + var requests []reconcile.Request + for _, stack := range lokiStacks.Items { + if stack.Spec.Storage.TLS == nil { + continue + } + + storageTLS := stack.Spec.Storage.TLS + if obj.GetName() != storageTLS.CA { + continue + } + + requests = append(requests, reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: stack.Namespace, + Name: stack.Name, + }, + }) + r.Log.Info("Enqueued request for LokiStack because of Storage CA resource change", "LokiStack", stack.Name, "ConfigMap", obj.GetName()) + } + + return requests + }) +} diff --git a/operator/controllers/loki/lokistack_controller_test.go b/operator/controllers/loki/lokistack_controller_test.go index d8eae5a1ec66f..7421b63331b5d 100644 --- a/operator/controllers/loki/lokistack_controller_test.go +++ b/operator/controllers/loki/lokistack_controller_test.go @@ -203,8 +203,8 @@ func TestLokiStackController_RegisterWatchedResources(t *testing.T) { table := []test{ { src: &openshiftconfigv1.APIServer{}, - index: 2, - watchesCallsCount: 3, + index: 3, + watchesCallsCount: 4, featureGates: configv1.FeatureGates{ OpenShift: configv1.OpenShiftFeatureGates{ ClusterTLSPolicy: true, @@ -214,8 +214,8 @@ func TestLokiStackController_RegisterWatchedResources(t *testing.T) { }, { src: &openshiftconfigv1.Proxy{}, - index: 2, - watchesCallsCount: 3, + index: 3, + watchesCallsCount: 4, featureGates: configv1.FeatureGates{ OpenShift: configv1.OpenShiftFeatureGates{ ClusterProxy: true, @@ -226,14 +226,21 @@ func TestLokiStackController_RegisterWatchedResources(t *testing.T) { { src: &corev1.Service{}, index: 0, - watchesCallsCount: 2, + 
watchesCallsCount: 3, featureGates: configv1.FeatureGates{}, pred: createUpdateOrDeletePred, }, { src: &corev1.Secret{}, index: 1, - watchesCallsCount: 2, + watchesCallsCount: 3, + featureGates: configv1.FeatureGates{}, + pred: createUpdateOrDeletePred, + }, + { + src: &corev1.ConfigMap{}, + index: 2, + watchesCallsCount: 3, featureGates: configv1.FeatureGates{}, pred: createUpdateOrDeletePred, }, diff --git a/operator/internal/handlers/internal/storage/ca_configmap.go b/operator/internal/handlers/internal/storage/ca_configmap.go index ccb4f93d06a34..ce70591e55cfa 100644 --- a/operator/internal/handlers/internal/storage/ca_configmap.go +++ b/operator/internal/handlers/internal/storage/ca_configmap.go @@ -1,9 +1,38 @@ package storage -import corev1 "k8s.io/api/core/v1" +import ( + "crypto/sha1" + "fmt" -// IsValidCAConfigMap checks if the given CA configMap has an -// non-empty entry for the key -func IsValidCAConfigMap(cm *corev1.ConfigMap, key string) bool { - return cm.Data[key] != "" + corev1 "k8s.io/api/core/v1" +) + +type caKeyError string + +func (e caKeyError) Error() string { + return fmt.Sprintf("key not present or data empty: %s", string(e)) +} + +// CheckCAConfigMap checks if the given CA configMap has an non-empty entry for the key used as CA certificate. +// If the key is present it will return a hash of the current key name and contents. +func CheckCAConfigMap(cm *corev1.ConfigMap, key string) (string, error) { + data := cm.Data[key] + if data == "" { + return "", caKeyError(key) + } + + h := sha1.New() + if _, err := h.Write([]byte(key)); err != nil { + return "", err + } + + if _, err := h.Write(hashSeparator); err != nil { + return "", err + } + + if _, err := h.Write([]byte(data)); err != nil { + return "", err + } + + return fmt.Sprintf("%x", h.Sum(nil)), nil } diff --git a/operator/internal/handlers/internal/storage/ca_configmap_test.go b/operator/internal/handlers/internal/storage/ca_configmap_test.go index 1e164f5a25413..bd3d4d56a690a 100644 --- a/operator/internal/handlers/internal/storage/ca_configmap_test.go +++ b/operator/internal/handlers/internal/storage/ca_configmap_test.go @@ -11,9 +11,10 @@ import ( func TestIsValidConfigMap(t *testing.T) { type test struct { - name string - cm *corev1.ConfigMap - valid bool + name string + cm *corev1.ConfigMap + wantHash string + wantErrorMsg string } table := []test{ { @@ -23,11 +24,13 @@ func TestIsValidConfigMap(t *testing.T) { "service-ca.crt": "has-some-data", }, }, - valid: true, + wantHash: "de6ae206d4920549d21c24ad9721e87a9b1ec7dc", + wantErrorMsg: "", }, { - name: "missing `service-ca.crt` key", - cm: &corev1.ConfigMap{}, + name: "missing `service-ca.crt` key", + cm: &corev1.ConfigMap{}, + wantErrorMsg: "key not present or data empty: service-ca.crt", }, { name: "missing CA content", @@ -36,6 +39,7 @@ func TestIsValidConfigMap(t *testing.T) { "service-ca.crt": "", }, }, + wantErrorMsg: "key not present or data empty: service-ca.crt", }, } for _, tst := range table { @@ -43,8 +47,14 @@ func TestIsValidConfigMap(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - ok := storage.IsValidCAConfigMap(tst.cm, "service-ca.crt") - require.Equal(t, tst.valid, ok) + hash, err := storage.CheckCAConfigMap(tst.cm, "service-ca.crt") + + require.Equal(t, tst.wantHash, hash) + if tst.wantErrorMsg == "" { + require.NoError(t, err) + } else { + require.EqualError(t, err, tst.wantErrorMsg) + } }) } } diff --git a/operator/internal/handlers/lokistack_create_or_update.go 
b/operator/internal/handlers/lokistack_create_or_update.go index 49c84af4dcf4b..a6963f7574321 100644 --- a/operator/internal/handlers/lokistack_create_or_update.go +++ b/operator/internal/handlers/lokistack_create_or_update.go @@ -134,14 +134,17 @@ func CreateOrUpdateLokiStack( caKey = tlsConfig.CAKey } - if !storage.IsValidCAConfigMap(&cm, caKey) { + var caHash string + caHash, err = storage.CheckCAConfigMap(&cm, caKey) + if err != nil { return &status.DegradedError{ - Message: "Invalid object storage CA configmap contents: missing key or no contents", + Message: fmt.Sprintf("Invalid object storage CA configmap contents: %s", err), Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, Requeue: false, } } + objStore.SecretSHA1 = fmt.Sprintf("%s;%s", objStore.SecretSHA1, caHash) objStore.TLS = &storageoptions.TLSConfig{CA: cm.Name, Key: caKey} } diff --git a/operator/internal/handlers/lokistack_create_or_update_test.go b/operator/internal/handlers/lokistack_create_or_update_test.go index 79928b4a82e50..b2158fe4d2ba2 100644 --- a/operator/internal/handlers/lokistack_create_or_update_test.go +++ b/operator/internal/handlers/lokistack_create_or_update_test.go @@ -997,7 +997,7 @@ func TestCreateOrUpdateLokiStack_WhenInvalidCAConfigMap_SetDegraded(t *testing.T } degradedErr := &status.DegradedError{ - Message: "Invalid object storage CA configmap contents: missing key or no contents", + Message: "Invalid object storage CA configmap contents: key not present or data empty: service-ca.crt", Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, Requeue: false, } From e915efc7f81350ea82d4dcbe105055075df6fc76 Mon Sep 17 00:00:00 2001 From: Ashwanth Date: Thu, 11 Jan 2024 19:06:37 +0530 Subject: [PATCH 06/21] fix(log results cache): compose empty response based on the request (#11657) **What this PR does / why we need it**: Log results cache when handling a hit composes an empty response based on the cached request. But the limit or direction fields in the cached request need not match with the current request being served. This causes the log results cache to return a response with incorrect limit. This incorrect limit could then get applied when merging responses upstream (split by interval mw for ex.) This pr fixes this by composing the response based on the request being served. I also thought about updating the cache key to include both limit and direction to have a clear separation, but I left it as is for the following reason: if a time range contains no log lines, that result would not change irrespective of a different limit or direction **Which issue(s) this PR fixes**: Fixes # **Special notes for your reviewer**: **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ ] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. 
[Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --- CHANGELOG.md | 1 + pkg/querier/queryrange/log_result_cache.go | 5 +- .../queryrange/log_result_cache_test.go | 49 +++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 357714b030b1a..612b70ab36fd9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,6 +55,7 @@ * [11551](https://github.com/grafana/loki/pull/11551) **dannykopping** Do not reflect label names in request metrics' "route" label. * [11601](https://github.com/grafana/loki/pull/11601) **dannykopping** Ruler: Fixed a panic that can be caused by concurrent read-write access of tenant configs when there are a large amount of rules. * [11606](https://github.com/grafana/loki/pull/11606) **dannykopping** Fixed regression adding newlines to HTTP error response bodies which may break client integrations. +* [11657](https://github.com/grafana/loki/pull/11657) **ashwanthgoli** Log results cache: compose empty response based on the request being served to avoid returning incorrect limit or direction. ##### Changes diff --git a/pkg/querier/queryrange/log_result_cache.go b/pkg/querier/queryrange/log_result_cache.go index c15568d9075ac..fd26b67412a6b 100644 --- a/pkg/querier/queryrange/log_result_cache.go +++ b/pkg/querier/queryrange/log_result_cache.go @@ -106,7 +106,8 @@ func (l *logResultCache) Do(ctx context.Context, req queryrangebase.Request) (qu interval := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, l.limits.QuerySplitDuration) // skip caching by if interval is unset - if interval == 0 { + // skip caching when limit is 0 as it would get registerted as empty result in the cache even if that time range contains log lines. + if interval == 0 || lokiReq.Limit == 0 { return l.next.Do(ctx, req) } // The first subquery might not be aligned. @@ -181,7 +182,7 @@ func (l *logResultCache) handleMiss(ctx context.Context, cacheKey string, req *L func (l *logResultCache) handleHit(ctx context.Context, cacheKey string, cachedRequest *LokiRequest, lokiReq *LokiRequest) (queryrangebase.Response, error) { l.metrics.CacheHit.Inc() // we start with an empty response - result := emptyResponse(cachedRequest) + result := emptyResponse(lokiReq) // if the request is the same and cover the whole time range, // we can just return the cached result. 
if cachedRequest.StartTs.UnixNano() <= lokiReq.StartTs.UnixNano() && cachedRequest.EndTs.UnixNano() >= lokiReq.EndTs.UnixNano() { diff --git a/pkg/querier/queryrange/log_result_cache_test.go b/pkg/querier/queryrange/log_result_cache_test.go index 5d67be33b84fd..5da4aee7c4be3 100644 --- a/pkg/querier/queryrange/log_result_cache_test.go +++ b/pkg/querier/queryrange/log_result_cache_test.go @@ -580,6 +580,54 @@ func Test_LogResultNonOverlappingCache(t *testing.T) { fake.AssertExpectations(t) } +func Test_LogResultCacheDifferentLimit(t *testing.T) { + var ( + ctx = user.InjectOrgID(context.Background(), "foo") + lrc = NewLogResultCache( + log.NewNopLogger(), + fakeLimits{ + splitDuration: map[string]time.Duration{"foo": time.Minute}, + }, + cache.NewMockCache(), + nil, + nil, + nil, + ) + ) + + req1 := &LokiRequest{ + StartTs: time.Unix(0, time.Minute.Nanoseconds()), + EndTs: time.Unix(0, 2*time.Minute.Nanoseconds()), + Limit: entriesLimit, + } + + req2 := &LokiRequest{ + StartTs: time.Unix(0, time.Minute.Nanoseconds()), + EndTs: time.Unix(0, 2*time.Minute.Nanoseconds()), + Limit: 10, + } + + fake := newFakeResponse([]mockResponse{ + { + RequestResponse: queryrangebase.RequestResponse{ + Request: req1, + Response: emptyResponse(req1), + }, + }, + }) + + h := lrc.Wrap(fake) + + resp, err := h.Do(ctx, req1) + require.NoError(t, err) + require.Equal(t, emptyResponse(req1), resp) + resp, err = h.Do(ctx, req2) + require.NoError(t, err) + require.Equal(t, emptyResponse(req2), resp) + + fake.AssertExpectations(t) +} + func TestExtractLokiResponse(t *testing.T) { for _, tc := range []struct { name string @@ -677,6 +725,7 @@ func newFakeResponse(responses []mockResponse) fakeResponse { for _, r := range responses { m.On("Do", mock.Anything, r.Request).Return(r.Response, r.err).Once() } + return fakeResponse{ Mock: m, } From bcd03150c91e62575816e6f58fb8f7cc0e255707 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 11 Jan 2024 15:56:29 +0200 Subject: [PATCH 07/21] Query-frontend: customisable query splitting for queries overlapping `query_ingester_within` window (#11535) **What this PR does / why we need it**: The config option `query_ingesters_within` defines the window during which logs _could_ be present on ingesters, and as such queriers will send queries to ingesters instead. `split_queries_by_interval` is defined to split queries into subqueries for increased parallelism. Aggressive query splitting within the `query_ingesters_within` window can result in overloading ingesters with unnecessarily large numbers of subqueries, which perversely can impact writes. `query_ingesters_within` is set to 3h by default. In Grafana Cloud Logs we set `split_queries_by_interval` as low as 15m (defaults to 1h), which would result in result in 3*60/15=12 requests. Every querier queries every ingester during this window, so that's 12 requests _per ingester per query_ which has the `query_ingesters_within` window in its time range _(i.e. a query from now to now-7d would include the `query_ingesters_within` window as well, now-3h to now-7d would not)_. However, we _do_ want to split queries so an ingester won't have to handle a query for a full `query_ingesters_within` window - this could involve a large amount of data. To account for this, this PR introduces a new option `split_ingester_queries_by_interval` on the query-frontend; this setting is disabled by default. 
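As a purely illustrative sketch (the interval values and the placement of `query_ingesters_within` under the `querier` block are examples/assumptions, not recommendations), the two split settings could be combined like this once the change lands; per the documentation change below, `split_ingester_queries_by_interval` defaults to `0s`, which falls back to `split_queries_by_interval`:

```yaml
querier:
  # Window during which logs may still be held by ingesters (3h by default, as described above).
  query_ingesters_within: 3h

limits_config:
  # Aggressive splitting for the portion of a query served from object storage.
  split_queries_by_interval: 15m
  # Coarser splitting for the portion overlapping the ingester window,
  # so each ingester receives fewer subqueries. Disabled (0s) by default.
  split_ingester_queries_by_interval: 1h
```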
![image](https://github.com/grafana/loki/assets/373762/2e671bd8-9e8d-4bf3-addf-bebcfc25e8d7) --- CHANGELOG.md | 1 + docs/sources/configure/_index.md | 6 + pkg/loki/modules.go | 13 + pkg/querier/queryrange/limits/definitions.go | 1 + pkg/querier/queryrange/limits_test.go | 4 +- pkg/querier/queryrange/roundtrip.go | 115 +-- pkg/querier/queryrange/roundtrip_test.go | 51 +- pkg/querier/queryrange/split_by_interval.go | 165 +--- .../queryrange/split_by_interval_test.go | 844 +++++++++++++----- pkg/querier/queryrange/splitters.go | 297 ++++++ pkg/util/config.go | 8 + pkg/util/time.go | 4 +- pkg/validation/limits.go | 10 + 13 files changed, 1050 insertions(+), 469 deletions(-) create mode 100644 pkg/querier/queryrange/splitters.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 612b70ab36fd9..f9d2d38fbbfed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,7 @@ * [11539](https://github.com/grafana/loki/pull/11539) **kaviraj,ashwanthgoli** Support caching /series and /labels query results * [11545](https://github.com/grafana/loki/pull/11545) **dannykopping** Force correct memcached timeout when fetching chunks. * [11589](https://github.com/grafana/loki/pull/11589) **ashwanthgoli** Results Cache: Adds `query_length_served` cache stat to measure the length of the query served from cache. +* [11535](https://github.com/grafana/loki/pull/11535) **dannykopping** Query Frontend: Allow customisable splitting of queries which overlap the `query_ingester_within` window to reduce query pressure on ingesters. * [11654](https://github.com/grafana/loki/pull/11654) **dannykopping** Cache: atomically check background cache size limit correctly. ##### Fixes diff --git a/docs/sources/configure/_index.md b/docs/sources/configure/_index.md index e2185c19474f0..9bf65788c8a22 100644 --- a/docs/sources/configure/_index.md +++ b/docs/sources/configure/_index.md @@ -2884,6 +2884,12 @@ The `limits_config` block configures global and per-tenant limits in Loki. # CLI flag: -querier.split-metadata-queries-by-interval [split_metadata_queries_by_interval: | default = 1d] +# Interval to use for time-based splitting when a request is within the +# `query_ingesters_within` window; defaults to `split-queries-by-interval` by +# setting to 0. +# CLI flag: -querier.split-ingester-queries-by-interval +[split_ingester_queries_by_interval: | default = 0s] + # Limit queries that can be sharded. Queries within the time range of now and # now minus this sharding lookback are not sharded. The default value of 0s # disables the lookback, causing sharding of all queries at all times. 
diff --git a/pkg/loki/modules.go b/pkg/loki/modules.go index 1342a105f34b6..8282098c85aec 100644 --- a/pkg/loki/modules.go +++ b/pkg/loki/modules.go @@ -800,12 +800,25 @@ func (disabledShuffleShardingLimits) MaxQueriersPerUser(_ string) uint { return func (disabledShuffleShardingLimits) MaxQueryCapacity(_ string) float64 { return 0 } +// ingesterQueryOptions exists simply to avoid dependency cycles when using querier.Config directly in queryrange.NewMiddleware +type ingesterQueryOptions struct { + querier.Config +} + +func (i ingesterQueryOptions) QueryStoreOnly() bool { + return i.Config.QueryStoreOnly +} +func (i ingesterQueryOptions) QueryIngestersWithin() time.Duration { + return i.Config.QueryIngestersWithin +} + func (t *Loki) initQueryFrontendMiddleware() (_ services.Service, err error) { level.Debug(util_log.Logger).Log("msg", "initializing query frontend tripperware") middleware, stopper, err := queryrange.NewMiddleware( t.Cfg.QueryRange, t.Cfg.Querier.Engine, + ingesterQueryOptions{t.Cfg.Querier}, util_log.Logger, t.Overrides, t.Cfg.SchemaConfig, diff --git a/pkg/querier/queryrange/limits/definitions.go b/pkg/querier/queryrange/limits/definitions.go index bd84e144fa47d..57b2e03c6697b 100644 --- a/pkg/querier/queryrange/limits/definitions.go +++ b/pkg/querier/queryrange/limits/definitions.go @@ -15,6 +15,7 @@ type Limits interface { logql.Limits QuerySplitDuration(string) time.Duration MetadataQuerySplitDuration(string) time.Duration + IngesterQuerySplitDuration(string) time.Duration MaxQuerySeries(context.Context, string) int MaxEntriesLimitPerQuery(context.Context, string) int MinShardingLookback(string) time.Duration diff --git a/pkg/querier/queryrange/limits_test.go b/pkg/querier/queryrange/limits_test.go index 3b82c1dc9eabb..0de342e42644f 100644 --- a/pkg/querier/queryrange/limits_test.go +++ b/pkg/querier/queryrange/limits_test.go @@ -58,7 +58,7 @@ func Test_seriesLimiter(t *testing.T) { cfg.CacheIndexStatsResults = false // split in 7 with 2 in // max. 
l := WithSplitByLimits(fakeLimits{maxSeries: 1, maxQueryParallelism: 2}, time.Hour) - tpw, stopper, err := NewMiddleware(cfg, testEngineOpts, util_log.Logger, l, config.SchemaConfig{ + tpw, stopper, err := NewMiddleware(cfg, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{ Configs: testSchemas, }, nil, false, nil, constants.Loki) if stopper != nil { @@ -228,7 +228,7 @@ func Test_MaxQueryParallelismDisable(t *testing.T) { } func Test_MaxQueryLookBack(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{ + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{ maxQueryLookback: 1 * time.Hour, maxQueryParallelism: 1, }, config.SchemaConfig{ diff --git a/pkg/querier/queryrange/roundtrip.go b/pkg/querier/queryrange/roundtrip.go index 5f0aef4a1ab49..6d0d62af7a88a 100644 --- a/pkg/querier/queryrange/roundtrip.go +++ b/pkg/querier/queryrange/roundtrip.go @@ -120,6 +120,7 @@ func newResultsCacheFromConfig(cfg base.ResultsCacheConfig, registerer prometheu func NewMiddleware( cfg Config, engineOpts logql.EngineOpts, + iqo util.IngesterQueryOptions, log log.Logger, limits Limits, schema config.SchemaConfig, @@ -176,36 +177,38 @@ func NewMiddleware( var codec base.Codec = DefaultCodec - indexStatsTripperware, err := NewIndexStatsTripperware(cfg, log, limits, schema, codec, statsCache, + split := newDefaultSplitter(limits, iqo) + + indexStatsTripperware, err := NewIndexStatsTripperware(cfg, log, limits, schema, codec, split, statsCache, cacheGenNumLoader, retentionEnabled, metrics, metricsNamespace) if err != nil { return nil, nil, err } - metricsTripperware, err := NewMetricTripperware(cfg, engineOpts, log, limits, schema, codec, resultsCache, + metricsTripperware, err := NewMetricTripperware(cfg, engineOpts, log, limits, schema, codec, newMetricQuerySplitter(limits, iqo), resultsCache, cacheGenNumLoader, retentionEnabled, PrometheusExtractor{}, metrics, indexStatsTripperware, metricsNamespace) if err != nil { return nil, nil, err } - limitedTripperware, err := NewLimitedTripperware(cfg, engineOpts, log, limits, schema, metrics, indexStatsTripperware, codec) + limitedTripperware, err := NewLimitedTripperware(cfg, engineOpts, log, limits, schema, metrics, indexStatsTripperware, codec, split) if err != nil { return nil, nil, err } // NOTE: When we would start caching response from non-metric queries we would have to consider cache gen headers as well in // MergeResponse implementation for Loki codecs same as it is done in Cortex at https://github.com/cortexproject/cortex/blob/21bad57b346c730d684d6d0205efef133422ab28/pkg/querier/queryrange/query_range.go#L170 - logFilterTripperware, err := NewLogFilterTripperware(cfg, engineOpts, log, limits, schema, codec, resultsCache, metrics, indexStatsTripperware, metricsNamespace) + logFilterTripperware, err := NewLogFilterTripperware(cfg, engineOpts, log, limits, schema, codec, split, resultsCache, metrics, indexStatsTripperware, metricsNamespace) if err != nil { return nil, nil, err } - seriesTripperware, err := NewSeriesTripperware(cfg, log, limits, metrics, schema, codec, seriesCache, cacheGenNumLoader, retentionEnabled, metricsNamespace) + seriesTripperware, err := NewSeriesTripperware(cfg, log, limits, metrics, schema, codec, split, seriesCache, cacheGenNumLoader, retentionEnabled, metricsNamespace) if err != nil { return nil, nil, err } - labelsTripperware, err := NewLabelsTripperware(cfg, log, limits, codec, labelsCache, cacheGenNumLoader, 
retentionEnabled, metrics, schema, metricsNamespace) + labelsTripperware, err := NewLabelsTripperware(cfg, log, limits, codec, split, labelsCache, cacheGenNumLoader, retentionEnabled, metrics, schema, metricsNamespace) if err != nil { return nil, nil, err } @@ -215,7 +218,7 @@ func NewMiddleware( return nil, nil, err } - seriesVolumeTripperware, err := NewVolumeTripperware(cfg, log, limits, schema, codec, volumeCache, cacheGenNumLoader, retentionEnabled, metrics, metricsNamespace) + seriesVolumeTripperware, err := NewVolumeTripperware(cfg, log, limits, schema, codec, split, volumeCache, cacheGenNumLoader, retentionEnabled, metrics, metricsNamespace) if err != nil { return nil, nil, err } @@ -406,18 +409,7 @@ func getOperation(path string) string { } // NewLogFilterTripperware creates a new frontend tripperware responsible for handling log requests. -func NewLogFilterTripperware( - cfg Config, - engineOpts logql.EngineOpts, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - merger base.Merger, - c cache.Cache, - metrics *Metrics, - indexStatsTripperware base.Middleware, - metricsNamespace string, -) (base.Middleware, error) { +func NewLogFilterTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, split splitter, c cache.Cache, metrics *Metrics, indexStatsTripperware base.Middleware, metricsNamespace string) (base.Middleware, error) { return base.MiddlewareFunc(func(next base.Handler) base.Handler { statsHandler := indexStatsTripperware.Wrap(next) @@ -426,7 +418,7 @@ func NewLogFilterTripperware( NewLimitsMiddleware(limits), NewQuerySizeLimiterMiddleware(schema.Configs, engineOpts, log, limits, statsHandler), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), } if cfg.CacheResults { @@ -481,16 +473,7 @@ func NewLogFilterTripperware( } // NewLimitedTripperware creates a new frontend tripperware responsible for handling log requests which are label matcher only, no filter expression. 
-func NewLimitedTripperware( - _ Config, - engineOpts logql.EngineOpts, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - metrics *Metrics, - indexStatsTripperware base.Middleware, - merger base.Merger, -) (base.Middleware, error) { +func NewLimitedTripperware(_ Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, metrics *Metrics, indexStatsTripperware base.Middleware, merger base.Merger, split splitter) (base.Middleware, error) { return base.MiddlewareFunc(func(next base.Handler) base.Handler { statsHandler := indexStatsTripperware.Wrap(next) @@ -499,7 +482,7 @@ func NewLimitedTripperware( NewLimitsMiddleware(limits), NewQuerySizeLimiterMiddleware(schema.Configs, engineOpts, log, limits, statsHandler), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, WithMaxParallelism(limits, limitedQuerySplits), merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, WithMaxParallelism(limits, limitedQuerySplits), merger, split, metrics.SplitByMetrics), NewQuerierSizeLimiterMiddleware(schema.Configs, engineOpts, log, limits, statsHandler), } @@ -518,6 +501,7 @@ func NewSeriesTripperware( metrics *Metrics, schema config.SchemaConfig, merger base.Merger, + split splitter, c cache.Cache, cacheGenNumLoader base.CacheGenNumberLoader, retentionEnabled bool, @@ -558,7 +542,7 @@ func NewSeriesTripperware( StatsCollectorMiddleware(), NewLimitsMiddleware(limits), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), } if cfg.CacheSeriesResults { @@ -567,7 +551,6 @@ func NewSeriesTripperware( base.InstrumentMiddleware("series_results_cache", metrics.InstrumentMiddlewareMetrics), cacheMiddleware, ) - } if cfg.MaxRetries > 0 { @@ -601,6 +584,7 @@ func NewLabelsTripperware( log log.Logger, limits Limits, merger base.Merger, + split splitter, c cache.Cache, cacheGenNumLoader base.CacheGenNumberLoader, retentionEnabled bool, @@ -643,7 +627,7 @@ func NewLabelsTripperware( StatsCollectorMiddleware(), NewLimitsMiddleware(limits), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), } if cfg.CacheLabelResults { @@ -652,7 +636,6 @@ func NewLabelsTripperware( base.InstrumentMiddleware("label_results_cache", metrics.InstrumentMiddlewareMetrics), cacheMiddleware, ) - } if cfg.MaxRetries > 0 { @@ -669,21 +652,7 @@ func NewLabelsTripperware( } // NewMetricTripperware creates a new frontend tripperware responsible for handling metric queries -func NewMetricTripperware( - cfg Config, - engineOpts logql.EngineOpts, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - merger base.Merger, - c cache.Cache, - cacheGenNumLoader base.CacheGenNumberLoader, - retentionEnabled bool, - extractor base.Extractor, - metrics *Metrics, - indexStatsTripperware base.Middleware, - metricsNamespace string, -) (base.Middleware, error) { +func NewMetricTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, split splitter, c cache.Cache, cacheGenNumLoader 
base.CacheGenNumberLoader, retentionEnabled bool, extractor base.Extractor, metrics *Metrics, indexStatsTripperware base.Middleware, metricsNamespace string) (base.Middleware, error) { cacheKey := cacheKeyLimits{limits, cfg.Transformer} var queryCacheMiddleware base.Middleware if cfg.CacheResults { @@ -737,7 +706,7 @@ func NewMetricTripperware( queryRangeMiddleware, NewQuerySizeLimiterMiddleware(schema.Configs, engineOpts, log, limits, statsHandler), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitMetricByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), ) if cfg.CacheResults { @@ -793,16 +762,7 @@ func NewMetricTripperware( } // NewInstantMetricTripperware creates a new frontend tripperware responsible for handling metric queries -func NewInstantMetricTripperware( - cfg Config, - engineOpts logql.EngineOpts, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - metrics *Metrics, - indexStatsTripperware base.Middleware, - metricsNamespace string, -) (base.Middleware, error) { +func NewInstantMetricTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, metrics *Metrics, indexStatsTripperware base.Middleware, metricsNamespace string) (base.Middleware, error) { return base.MiddlewareFunc(func(next base.Handler) base.Handler { statsHandler := indexStatsTripperware.Wrap(next) @@ -844,21 +804,10 @@ func NewInstantMetricTripperware( }), nil } -func NewVolumeTripperware( - cfg Config, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - merger base.Merger, - c cache.Cache, - cacheGenNumLoader base.CacheGenNumberLoader, - retentionEnabled bool, - metrics *Metrics, - metricsNamespace string, -) (base.Middleware, error) { +func NewVolumeTripperware(cfg Config, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, split splitter, c cache.Cache, cacheGenNumLoader base.CacheGenNumberLoader, retentionEnabled bool, metrics *Metrics, metricsNamespace string) (base.Middleware, error) { // Parallelize the volume requests, so it doesn't send a huge request to a single index-gw (i.e. {app=~".+"} for 30d). // Indices are sharded by 24 hours, so we split the volume request in 24h intervals. 
- limits = WithSplitByLimits(limits, 24*time.Hour) + limits = WithSplitByLimits(limits, indexStatsQuerySplitInterval) var cacheMiddleware base.Middleware if cfg.CacheVolumeResults { var err error @@ -894,6 +843,7 @@ func NewVolumeTripperware( cacheMiddleware, cfg, merger, + split, limits, log, metrics, @@ -962,18 +912,7 @@ func volumeFeatureFlagRoundTripper(nextTW base.Middleware, limits Limits) base.M }) } -func NewIndexStatsTripperware( - cfg Config, - log log.Logger, - limits Limits, - schema config.SchemaConfig, - merger base.Merger, - c cache.Cache, - cacheGenNumLoader base.CacheGenNumberLoader, - retentionEnabled bool, - metrics *Metrics, - metricsNamespace string, -) (base.Middleware, error) { +func NewIndexStatsTripperware(cfg Config, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, split splitter, c cache.Cache, cacheGenNumLoader base.CacheGenNumberLoader, retentionEnabled bool, metrics *Metrics, metricsNamespace string) (base.Middleware, error) { limits = WithSplitByLimits(limits, indexStatsQuerySplitInterval) var cacheMiddleware base.Middleware @@ -1011,6 +950,7 @@ func NewIndexStatsTripperware( cacheMiddleware, cfg, merger, + split, limits, log, metrics, @@ -1028,6 +968,7 @@ func sharedIndexTripperware( cacheMiddleware base.Middleware, cfg Config, merger base.Merger, + split splitter, limits Limits, log log.Logger, metrics *Metrics, @@ -1038,7 +979,7 @@ func sharedIndexTripperware( middlewares := []base.Middleware{ NewLimitsMiddleware(limits), base.InstrumentMiddleware("split_by_interval", metrics.InstrumentMiddlewareMetrics), - SplitByIntervalMiddleware(schema.Configs, limits, merger, splitByTime, metrics.SplitByMetrics), + SplitByIntervalMiddleware(schema.Configs, limits, merger, split, metrics.SplitByMetrics), } if cacheMiddleware != nil { diff --git a/pkg/querier/queryrange/roundtrip_test.go b/pkg/querier/queryrange/roundtrip_test.go index 883f9b14226bc..fe8799fffe799 100644 --- a/pkg/querier/queryrange/roundtrip_test.go +++ b/pkg/querier/queryrange/roundtrip_test.go @@ -189,7 +189,7 @@ func TestMetricsTripperware(t *testing.T) { noCacheTestCfg := testConfig noCacheTestCfg.CacheResults = false noCacheTestCfg.CacheIndexStatsResults = false - tpw, stopper, err := NewMiddleware(noCacheTestCfg, testEngineOpts, util_log.Logger, l, config.SchemaConfig{ + tpw, stopper, err := NewMiddleware(noCacheTestCfg, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{ Configs: testSchemasTSDB, }, nil, false, nil, constants.Loki) if stopper != nil { @@ -240,7 +240,7 @@ func TestMetricsTripperware(t *testing.T) { require.Error(t, err) // Configure with cache - tpw, stopper, err = NewMiddleware(testConfig, testEngineOpts, util_log.Logger, l, config.SchemaConfig{ + tpw, stopper, err = NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{ Configs: testSchemasTSDB, }, nil, false, nil, constants.Loki) if stopper != nil { @@ -278,7 +278,7 @@ func TestLogFilterTripperware(t *testing.T) { noCacheTestCfg := testConfig noCacheTestCfg.CacheResults = false noCacheTestCfg.CacheIndexStatsResults = false - tpw, stopper, err := NewMiddleware(noCacheTestCfg, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: testSchemasTSDB}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(noCacheTestCfg, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: testSchemasTSDB}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -347,7 +347,7 @@ func TestInstantQueryTripperware(t 
*testing.T) { queryTimeout: 1 * time.Minute, maxSeries: 1, } - tpw, stopper, err := NewMiddleware(testShardingConfigNoCache, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: testSchemasTSDB}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testShardingConfigNoCache, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: testSchemasTSDB}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -403,7 +403,7 @@ func TestSeriesTripperware(t *testing.T) { "1": 24 * time.Hour, }, } - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -441,7 +441,7 @@ func TestLabelsTripperware(t *testing.T) { "1": 24 * time.Hour, }, } - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -487,7 +487,7 @@ func TestLabelsTripperware(t *testing.T) { } func TestIndexStatsTripperware(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -537,7 +537,7 @@ func TestVolumeTripperware(t *testing.T) { volumeEnabled: true, maxSeries: 42, } - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -593,7 +593,7 @@ func TestVolumeTripperware(t *testing.T) { }) t.Run("range queries return a prometheus style metrics response, putting volumes in buckets based on the step", func(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, volumeEnabled: true}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, volumeEnabled: true}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -784,7 +784,7 @@ func TestNewTripperware_Caches(t *testing.T) { }, } { t.Run(tc.name, func(t *testing.T) { - _, stopper, err := NewMiddleware(tc.config, testEngineOpts, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + _, stopper, err := NewMiddleware(tc.config, 
testEngineOpts, nil, util_log.Logger, fakeLimits{maxQueryLength: 48 * time.Hour, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -814,7 +814,7 @@ func TestNewTripperware_Caches(t *testing.T) { } func TestLogNoFilter(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -871,7 +871,7 @@ func TestPostQueries(t *testing.T) { } func TestTripperware_EntriesLimit(t *testing.T) { - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, fakeLimits{maxEntriesLimitPerQuery: 5000, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, fakeLimits{maxEntriesLimitPerQuery: 5000, maxQueryParallelism: 1}, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -920,7 +920,7 @@ func TestTripperware_RequiredLabels(t *testing.T) { } { t.Run(test.qs, func(t *testing.T) { limits := fakeLimits{maxEntriesLimitPerQuery: 5000, maxQueryParallelism: 1, requiredLabels: []string{"app"}} - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -1027,7 +1027,7 @@ func TestTripperware_RequiredNumberLabels(t *testing.T) { maxQueryParallelism: 1, requiredNumberLabels: tc.requiredNumberLabels, } - tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(testConfig, testEngineOpts, nil, util_log.Logger, limits, config.SchemaConfig{Configs: testSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -1218,7 +1218,7 @@ func TestMetricsTripperware_SplitShardStats(t *testing.T) { }, } { t.Run(tc.name, func(t *testing.T) { - tpw, stopper, err := NewMiddleware(statsTestCfg, testEngineOpts, util_log.Logger, l, config.SchemaConfig{Configs: statsSchemas}, nil, false, nil, constants.Loki) + tpw, stopper, err := NewMiddleware(statsTestCfg, testEngineOpts, nil, util_log.Logger, l, config.SchemaConfig{Configs: statsSchemas}, nil, false, nil, constants.Loki) if stopper != nil { defer stopper.Stop() } @@ -1245,6 +1245,7 @@ type fakeLimits struct { maxSeries int splitDuration map[string]time.Duration metadataSplitDuration map[string]time.Duration + ingesterSplitDuration map[string]time.Duration minShardingLookback time.Duration queryTimeout time.Duration requiredLabels []string @@ -1269,6 +1270,13 @@ func (f fakeLimits) MetadataQuerySplitDuration(key string) time.Duration { return f.metadataSplitDuration[key] } +func (f fakeLimits) IngesterQuerySplitDuration(key string) time.Duration { + if f.ingesterSplitDuration == nil { + return 0 + } 
+ return f.ingesterSplitDuration[key] +} + func (f fakeLimits) MaxQueryLength(context.Context, string) time.Duration { if f.maxQueryLength == 0 { return time.Hour * 7 @@ -1344,6 +1352,19 @@ func (f fakeLimits) TSDBMaxBytesPerShard(_ string) int { return valid.DefaultTSDBMaxBytesPerShard } +type ingesterQueryOpts struct { + queryStoreOnly bool + queryIngestersWithin time.Duration +} + +func (i ingesterQueryOpts) QueryStoreOnly() bool { + return i.queryStoreOnly +} + +func (i ingesterQueryOpts) QueryIngestersWithin() time.Duration { + return i.queryIngestersWithin +} + func counter() (*int, base.Handler) { count := 0 var lock sync.Mutex diff --git a/pkg/querier/queryrange/split_by_interval.go b/pkg/querier/queryrange/split_by_interval.go index 9e2eda4b19423..b332fe5e612e7 100644 --- a/pkg/querier/queryrange/split_by_interval.go +++ b/pkg/querier/queryrange/split_by_interval.go @@ -21,7 +21,6 @@ import ( "github.com/grafana/loki/pkg/logql/syntax" "github.com/grafana/loki/pkg/querier/queryrange/queryrangebase" "github.com/grafana/loki/pkg/storage/config" - "github.com/grafana/loki/pkg/util" "github.com/grafana/loki/pkg/util/validation" ) @@ -56,13 +55,11 @@ type splitByInterval struct { limits Limits merger queryrangebase.Merger metrics *SplitByMetrics - splitter Splitter + splitter splitter } -type Splitter func(req queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) - // SplitByIntervalMiddleware creates a new Middleware that splits log requests by a given interval. -func SplitByIntervalMiddleware(configs []config.PeriodConfig, limits Limits, merger queryrangebase.Merger, splitter Splitter, metrics *SplitByMetrics) queryrangebase.Middleware { +func SplitByIntervalMiddleware(configs []config.PeriodConfig, limits Limits, merger queryrangebase.Merger, splitter splitter, metrics *SplitByMetrics) queryrangebase.Middleware { if metrics == nil { metrics = NewSplitByMetrics(nil) } @@ -197,7 +194,7 @@ func (h *splitByInterval) Do(ctx context.Context, r queryrangebase.Request) (que return h.next.Do(ctx, r) } - intervals, err := h.splitter(r, interval) + intervals, err := h.splitter.split(time.Now().UTC(), tenantIDs, r, interval) if err != nil { return nil, err } @@ -251,73 +248,6 @@ func (h *splitByInterval) Do(ctx context.Context, r queryrangebase.Request) (que return h.merger.MergeResponse(resps...) } -func splitByTime(req queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) { - var reqs []queryrangebase.Request - - switch r := req.(type) { - case *LokiRequest: - util.ForInterval(interval, r.StartTs, r.EndTs, false, func(start, end time.Time) { - reqs = append(reqs, &LokiRequest{ - Query: r.Query, - Limit: r.Limit, - Step: r.Step, - Interval: r.Interval, - Direction: r.Direction, - Path: r.Path, - StartTs: start, - EndTs: end, - Plan: r.Plan, - }) - }) - case *LokiSeriesRequest: - // metadata queries have end time inclusive. - // Set endTimeInclusive to true so that ForInterval keeps a gap of 1ms between splits to - // avoid querying duplicate data in adjacent queries. - util.ForInterval(interval, r.StartTs, r.EndTs, true, func(start, end time.Time) { - reqs = append(reqs, &LokiSeriesRequest{ - Match: r.Match, - Path: r.Path, - StartTs: start, - EndTs: end, - Shards: r.Shards, - }) - }) - case *LabelRequest: - // metadata queries have end time inclusive. - // Set endTimeInclusive to true so that ForInterval keeps a gap of 1ms between splits to - // avoid querying duplicate data in adjacent queries. 
- util.ForInterval(interval, *r.Start, *r.End, true, func(start, end time.Time) { - reqs = append(reqs, NewLabelRequest(start, end, r.Query, r.Name, r.Path())) - }) - case *logproto.IndexStatsRequest: - startTS := r.GetStart() - endTS := r.GetEnd() - util.ForInterval(interval, startTS, endTS, true, func(start, end time.Time) { - reqs = append(reqs, &logproto.IndexStatsRequest{ - From: model.TimeFromUnix(start.Unix()), - Through: model.TimeFromUnix(end.Unix()), - Matchers: r.GetMatchers(), - }) - }) - case *logproto.VolumeRequest: - startTS := r.GetStart() - endTS := r.GetEnd() - util.ForInterval(interval, startTS, endTS, true, func(start, end time.Time) { - reqs = append(reqs, &logproto.VolumeRequest{ - From: model.TimeFromUnix(start.Unix()), - Through: model.TimeFromUnix(end.Unix()), - Matchers: r.GetMatchers(), - Limit: r.Limit, - TargetLabels: r.TargetLabels, - AggregateBy: r.AggregateBy, - }) - }) - default: - return nil, nil - } - return reqs, nil -} - // maxRangeVectorAndOffsetDurationFromQueryString func maxRangeVectorAndOffsetDurationFromQueryString(q string) (time.Duration, time.Duration, error) { parsed, err := syntax.ParseExpr(q) @@ -346,92 +276,3 @@ func maxRangeVectorAndOffsetDuration(expr syntax.Expr) (time.Duration, time.Dura }) return maxRVDuration, maxOffset, nil } - -// reduceSplitIntervalForRangeVector reduces the split interval for a range query based on the duration of the range vector. -// Large range vector durations will not be split into smaller intervals because it can cause the queries to be slow by over-processing data. -func reduceSplitIntervalForRangeVector(r *LokiRequest, interval time.Duration) (time.Duration, error) { - maxRange, _, err := maxRangeVectorAndOffsetDuration(r.Plan.AST) - if err != nil { - return 0, err - } - if maxRange > interval { - return maxRange, nil - } - return interval, nil -} - -func splitMetricByTime(r queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) { - var reqs []queryrangebase.Request - - lokiReq := r.(*LokiRequest) - - interval, err := reduceSplitIntervalForRangeVector(lokiReq, interval) - if err != nil { - return nil, err - } - - // step align start and end time of the query. Start time is rounded down and end time is rounded up. 
- stepNs := r.GetStep() * 1e6 - startNs := lokiReq.StartTs.UnixNano() - start := time.Unix(0, startNs-startNs%stepNs) - - endNs := lokiReq.EndTs.UnixNano() - if mod := endNs % stepNs; mod != 0 { - endNs += stepNs - mod - } - end := time.Unix(0, endNs) - - lokiReq = lokiReq.WithStartEnd(start, end).(*LokiRequest) - - // step is >= configured split interval, let us just split the query interval by step - if lokiReq.Step >= interval.Milliseconds() { - util.ForInterval(time.Duration(lokiReq.Step*1e6), lokiReq.StartTs, lokiReq.EndTs, false, func(start, end time.Time) { - reqs = append(reqs, &LokiRequest{ - Query: lokiReq.Query, - Limit: lokiReq.Limit, - Step: lokiReq.Step, - Interval: lokiReq.Interval, - Direction: lokiReq.Direction, - Path: lokiReq.Path, - StartTs: start, - EndTs: end, - Plan: lokiReq.Plan, - }) - }) - - return reqs, nil - } - - for start := lokiReq.StartTs; start.Before(lokiReq.EndTs); start = nextIntervalBoundary(start, r.GetStep(), interval).Add(time.Duration(r.GetStep()) * time.Millisecond) { - end := nextIntervalBoundary(start, r.GetStep(), interval) - if end.Add(time.Duration(r.GetStep())*time.Millisecond).After(lokiReq.EndTs) || end.Add(time.Duration(r.GetStep())*time.Millisecond) == lokiReq.EndTs { - end = lokiReq.EndTs - } - reqs = append(reqs, &LokiRequest{ - Query: lokiReq.Query, - Limit: lokiReq.Limit, - Step: lokiReq.Step, - Interval: lokiReq.Interval, - Direction: lokiReq.Direction, - Path: lokiReq.Path, - StartTs: start, - EndTs: end, - Plan: lokiReq.Plan, - }) - } - - return reqs, nil -} - -// Round up to the step before the next interval boundary. -func nextIntervalBoundary(t time.Time, step int64, interval time.Duration) time.Time { - stepNs := step * 1e6 - nsPerInterval := interval.Nanoseconds() - startOfNextInterval := ((t.UnixNano() / nsPerInterval) + 1) * nsPerInterval - // ensure that target is a multiple of steps away from the start time - target := startOfNextInterval - ((startOfNextInterval - t.UnixNano()) % stepNs) - if target == startOfNextInterval { - target -= stepNs - } - return time.Unix(0, target) -} diff --git a/pkg/querier/queryrange/split_by_interval_test.go b/pkg/querier/queryrange/split_by_interval_test.go index b236b88fb4d53..acf8c495becce 100644 --- a/pkg/querier/queryrange/split_by_interval_test.go +++ b/pkg/querier/queryrange/split_by_interval_test.go @@ -9,9 +9,9 @@ import ( "testing" "time" - "github.com/prometheus/common/model" - "github.com/grafana/dskit/user" + "github.com/prometheus/common/model" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gopkg.in/yaml.v2" @@ -22,6 +22,8 @@ import ( "github.com/grafana/loki/pkg/querier/plan" "github.com/grafana/loki/pkg/querier/queryrange/queryrangebase" "github.com/grafana/loki/pkg/storage/config" + "github.com/grafana/loki/pkg/storage/stores/index/seriesvolume" + "github.com/grafana/loki/pkg/util" ) var nilMetrics = NewSplitByMetrics(nil) @@ -56,181 +58,393 @@ var testSchemasTSDB = func() []config.PeriodConfig { return confs }() -func Test_splitQuery(t *testing.T) { - buildLokiRequest := func(start, end time.Time) queryrangebase.Request { - return &LokiRequest{ - Query: `{app="foo"}`, - Limit: 1, - Step: 2, - StartTs: start, - EndTs: end, - Direction: logproto.BACKWARD, - Path: "/path", - Plan: &plan.QueryPlan{ - AST: syntax.MustParseExpr(`{app="foo"}`), - }, - } - } - - buildLokiRequestWithInterval := func(start, end time.Time) queryrangebase.Request { - return &LokiRequest{ - Query: `{app="foo"}`, - Limit: 1, - Interval: 2, - StartTs: start, - EndTs: end, - 
Direction: logproto.BACKWARD, - Path: "/path", - Plan: &plan.QueryPlan{ - AST: syntax.MustParseExpr(`{app="foo"}`), - }, - } - } - - buildLokiSeriesRequest := func(start, end time.Time) queryrangebase.Request { - return &LokiSeriesRequest{ - Match: []string{"match1"}, - StartTs: start, - EndTs: end, - Path: "/series", - Shards: []string{"shard1"}, - } - } - - buildLokiLabelNamesRequest := func(start, end time.Time) queryrangebase.Request { - return NewLabelRequest(start, end, "", "", "/lables") - } +var ( + // 62697274686461792063616b65 + refTime = time.Date(2023, 1, 15, 8, 5, 30, 123456789, time.UTC) + tenantID = "1" +) +func Test_splitQuery(t *testing.T) { type interval struct { start, end time.Time } + for requestType, tc := range map[string]struct { requestBuilderFunc func(start, end time.Time) queryrangebase.Request endTimeInclusive bool }{ - "LokiRequest": { - buildLokiRequest, - false, + "logs request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &LokiRequest{ + Query: `{app="foo"}`, + Limit: 1, + Step: 2, + StartTs: start, + EndTs: end, + Direction: logproto.BACKWARD, + Path: "/query", + Plan: &plan.QueryPlan{ + AST: syntax.MustParseExpr(`{app="foo"}`), + }, + } + }, + }, + "logs request with interval": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &LokiRequest{ + Query: `{app="foo"}`, + Limit: 1, + Interval: 2, + StartTs: start, + EndTs: end, + Direction: logproto.BACKWARD, + Path: "/query", + Plan: &plan.QueryPlan{ + AST: syntax.MustParseExpr(`{app="foo"}`), + }, + } + }, + }, + "series request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &LokiSeriesRequest{ + Match: []string{"match1"}, + StartTs: start, + EndTs: end, + Path: "/series", + Shards: []string{"shard1"}, + } + }, + endTimeInclusive: true, + }, + "label names request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return NewLabelRequest(start, end, `{foo="bar"}`, "", "/labels") + }, + endTimeInclusive: true, }, - "LokiRequestWithInterval": { - buildLokiRequestWithInterval, - false, + "label values request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return NewLabelRequest(start, end, `{foo="bar"}`, "test", "/label/test/values") + }, + endTimeInclusive: true, }, - "LokiSeriesRequest": { - buildLokiSeriesRequest, - true, + "index stats request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &logproto.IndexStatsRequest{ + From: model.TimeFromUnix(start.Unix()), + Through: model.TimeFromUnix(end.Unix()), + Matchers: `{host="agent"}`, + } + }, + endTimeInclusive: true, }, - "LokiLabelNamesRequest": { - buildLokiLabelNamesRequest, - true, + "volume request": { + requestBuilderFunc: func(start, end time.Time) queryrangebase.Request { + return &logproto.VolumeRequest{ + From: model.TimeFromUnix(start.Unix()), + Through: model.TimeFromUnix(end.Unix()), + Matchers: `{host="agent"}`, + Limit: 5, + AggregateBy: seriesvolume.Series, + } + }, + endTimeInclusive: true, }, } { expectedSplitGap := time.Duration(0) if tc.endTimeInclusive { - expectedSplitGap = time.Millisecond + expectedSplitGap = util.SplitGap } - for name, intervals := range map[string]struct { - inp interval - expected []interval - }{ - "no_change": { - inp: interval{ - start: time.Unix(0, 0), - end: time.Unix(0, (1 * time.Hour).Nanoseconds()), - }, - expected: []interval{ - { + + t.Run(requestType, func(t *testing.T) { + for name, intervals 
:= range map[string]struct { + input interval + expected []interval + splitInterval time.Duration + splitter splitter + }{ + "no change": { + input: interval{ start: time.Unix(0, 0), end: time.Unix(0, (1 * time.Hour).Nanoseconds()), }, + expected: []interval{ + { + start: time.Unix(0, 0), + end: time.Unix(0, (1 * time.Hour).Nanoseconds()), + }, + }, }, - }, - "align_start": { - inp: interval{ - start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (2 * time.Hour).Nanoseconds()), - }, - expected: []interval{ - { + "align start": { + input: interval{ start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), - }, - { - start: time.Unix(0, (1 * time.Hour).Nanoseconds()), end: time.Unix(0, (2 * time.Hour).Nanoseconds()), }, + expected: []interval{ + { + start: time.Unix(0, (5 * time.Minute).Nanoseconds()), + end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + }, + { + start: time.Unix(0, (1 * time.Hour).Nanoseconds()), + end: time.Unix(0, (2 * time.Hour).Nanoseconds()), + }, + }, }, - }, - "align_end": { - inp: interval{ - start: time.Unix(0, 0), - end: time.Unix(0, (115 * time.Minute).Nanoseconds()), - }, - expected: []interval{ - { + "align end": { + input: interval{ start: time.Unix(0, 0), - end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), - }, - { - start: time.Unix(0, (1 * time.Hour).Nanoseconds()), end: time.Unix(0, (115 * time.Minute).Nanoseconds()), }, + expected: []interval{ + { + start: time.Unix(0, 0), + end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + }, + { + start: time.Unix(0, (1 * time.Hour).Nanoseconds()), + end: time.Unix(0, (115 * time.Minute).Nanoseconds()), + }, + }, }, - }, - "align_both": { - inp: interval{ - start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (175 * time.Minute).Nanoseconds()), + "align both": { + input: interval{ + start: time.Unix(0, (5 * time.Minute).Nanoseconds()), + end: time.Unix(0, (175 * time.Minute).Nanoseconds()), + }, + expected: []interval{ + { + start: time.Unix(0, (5 * time.Minute).Nanoseconds()), + end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + }, + { + start: time.Unix(0, (1 * time.Hour).Nanoseconds()), + end: time.Unix(0, (2 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + }, + { + start: time.Unix(0, (2 * time.Hour).Nanoseconds()), + end: time.Unix(0, (175 * time.Minute).Nanoseconds()), + }, + }, }, - expected: []interval{ - { + "no align": { + input: interval{ start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (1 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + end: time.Unix(0, (55 * time.Minute).Nanoseconds()), }, - { - start: time.Unix(0, (1 * time.Hour).Nanoseconds()), - end: time.Unix(0, (2 * time.Hour).Nanoseconds()).Add(-expectedSplitGap), + expected: []interval{ + { + start: time.Unix(0, (5 * time.Minute).Nanoseconds()), + end: time.Unix(0, (55 * time.Minute).Nanoseconds()), + }, }, - { - start: time.Unix(0, (2 * time.Hour).Nanoseconds()), - end: time.Unix(0, (175 * time.Minute).Nanoseconds()), + }, + "wholly within ingester query window": { + input: interval{ + start: refTime.Add(-time.Hour).Truncate(time.Second), + end: refTime, + }, + expected: []interval{ + { + start: refTime.Add(-time.Hour).Truncate(time.Second), + end: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC), + end: refTime, + }, }, + 
splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), }, - }, - "no_align": { - inp: interval{ - start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (55 * time.Minute).Nanoseconds()), + "partially within ingester query window": { + input: interval{ + // overlapping `query_ingesters_within` window of 3h + start: refTime.Add(-4 * time.Hour).Add(-30 * time.Minute).Truncate(time.Second), + end: refTime, + }, + expected: []interval{ + // regular intervals until `query_ingesters_within` window + { + start: refTime.Add(-4 * time.Hour).Add(-30 * time.Minute).Truncate(time.Second), + end: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 5, 5, 30, 123456789, time.UTC).Add(-expectedSplitGap), + }, + // and then different intervals for queries to ingesters + { + start: time.Date(2023, 1, 15, 5, 5, 30, 123456789, time.UTC), + end: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC), + end: refTime, + }, + }, + splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), }, - expected: []interval{ - { - start: time.Unix(0, (5 * time.Minute).Nanoseconds()), - end: time.Unix(0, (55 * time.Minute).Nanoseconds()), + "not within ingester query window": { + input: interval{ + // outside `query_ingesters_within` range of 3h + start: refTime.Add(-5 * time.Hour).Truncate(time.Second), + end: refTime.Add(-4 * time.Hour).Truncate(time.Second), + }, + expected: []interval{ + // regular intervals outside `query_ingesters_within` window + { + start: refTime.Add(-5 * time.Hour).Truncate(time.Second), + end: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC), + end: refTime.Add(-4 * time.Hour).Truncate(time.Second), + }, }, + splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), }, - }, - } { - t.Run(fmt.Sprintf("%s - %s", name, requestType), func(t *testing.T) { - inp := tc.requestBuilderFunc(intervals.inp.start, intervals.inp.end) - var want []queryrangebase.Request - for _, interval := range intervals.expected { - want = append(want, tc.requestBuilderFunc(interval.start, interval.end)) - } - splits, err := splitByTime(inp, time.Hour) - require.NoError(t, err) - require.Equal(t, want, splits) - }) - } + "ingester query split by disabled": { + input: interval{ + // overlapping `query_ingesters_within` range of 3h + start: refTime.Add(-4 * time.Hour).Truncate(time.Second), + end: refTime, + }, + expected: []interval{ + // regular intervals only, since ingester split duration is 0 + { + start: refTime.Add(-4 * time.Hour).Truncate(time.Second), + end: time.Date(2023, 1, 15, 5, 0, 0, 0, 
time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC), + end: refTime, + }, + }, + splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 0}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + "ingester query split enabled but query_store_only enabled too": { + input: interval{ + // overlapping `query_ingesters_within` range of 3h + start: refTime.Add(-4 * time.Hour).Truncate(time.Second), + end: refTime, + }, + expected: []interval{ + // regular intervals only, since ingester split duration is 0 + { + start: refTime.Add(-4 * time.Hour).Truncate(time.Second), + end: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC), + end: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC).Add(-expectedSplitGap), + }, + { + start: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC), + end: refTime, + }, + }, + splitInterval: time.Hour, + splitter: newDefaultSplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour, queryStoreOnly: true}, + ), + }, + } { + t.Run(name, func(t *testing.T) { + req := tc.requestBuilderFunc(intervals.input.start, intervals.input.end) + var want []queryrangebase.Request + for _, exp := range intervals.expected { + want = append(want, tc.requestBuilderFunc(exp.start, exp.end)) + } + + if intervals.splitInterval == 0 { + intervals.splitInterval = time.Hour + } + + if intervals.splitter == nil { + intervals.splitter = newDefaultSplitter(fakeLimits{}, nil) + } + + splits, err := intervals.splitter.split(refTime, []string{tenantID}, req, intervals.splitInterval) + require.NoError(t, err) + if !assert.Equal(t, want, splits) { + t.Logf("expected and actual do not match\n") + defer t.Fail() + + if len(want) != len(splits) { + t.Logf("expected %d splits, got %d\n", len(want), len(splits)) + return + } + + for j := 0; j < len(want); j++ { + exp := want[j] + act := splits[j] + equal := assert.Equal(t, exp, act) + t.Logf("\t#%d [matches: %v]: expected %q/%q got %q/%q\n", j, equal, exp.GetStart(), exp.GetEnd(), act.GetStart(), act.GetEnd()) + } + } + }) + } + }) } } func Test_splitMetricQuery(t *testing.T) { const seconds = 1e3 // 1e3 milliseconds per second. + const shortRange = `rate({app="foo"}[1m])` + const longRange = `rate({app="foo"}[7d])` + for i, tc := range []struct { - input *LokiRequest - expected []queryrangebase.Request - interval time.Duration + input *LokiRequest + expected []queryrangebase.Request + splitInterval time.Duration + splitter splitter }{ // the step is lower than the interval therefore we should split only once. 
{ @@ -238,172 +452,172 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(0, 0), EndTs: time.Unix(0, 60*time.Minute.Nanoseconds()), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(0, 60*time.Minute.Nanoseconds()), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 24 * time.Hour, + splitInterval: 24 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(60*60, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(60*60, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 24 * time.Hour, + splitInterval: 24 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(2*24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix((24*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((24 * 3600), 0), EndTs: time.Unix((2 * 24 * 3600), 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 24 * time.Hour, + splitInterval: 24 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(2*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix((3*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((3 * 3600), 0), EndTs: time.Unix((2 * 3 * 3600), 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(3*3600, 0), EndTs: time.Unix(3*24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(3*3600, 0), EndTs: time.Unix((24*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(24*3600, 0), EndTs: time.Unix((2*24*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*24*3600, 0), EndTs: time.Unix(3*24*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - 
interval: 24 * time.Hour, + splitInterval: 24 * time.Hour, }, { input: &LokiRequest{ StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600, 0), EndTs: time.Unix((3*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(3*3600, 0), EndTs: time.Unix((2*3*3600)-15, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*3*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, // step not a multiple of interval @@ -413,29 +627,29 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(2*3600-9, 0), // 2h mod 17s = 9s EndTs: time.Unix(3*3*3600, 0), Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600-9, 0), EndTs: time.Unix((3*3600)-5, 0), // 3h mod 17s = 5s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((3*3600)+12, 0), EndTs: time.Unix((2*3*3600)-10, 0), // 6h mod 17s = 10s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*3*3600+7, 0), EndTs: time.Unix(3*3*3600+2, 0), // 9h mod 17s = 2s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, // end time already step aligned { @@ -443,29 +657,29 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600+2, 0), // 9h mod 17s = 2s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600-9, 0), // 2h mod 17s = 9s EndTs: time.Unix((3*3600)-5, 0), // 3h mod 17s = 5s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((3*3600)+12, 0), EndTs: time.Unix((2*3*3600)-10, 0), // 6h mod 17s = 10s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*3*3600+7, 0), EndTs: time.Unix(3*3*3600+2, 0), Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, // start & end time not aligned with step { @@ -473,29 +687,29 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600-9, 0), // 2h mod 17s = 9s EndTs: time.Unix((3*3600)-5, 0), // 3h mod 17s = 5s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix((3*3600)+12, 0), EndTs: time.Unix((2*3*3600)-10, 0), // 6h mod 17s = 10s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(2*3*3600+7, 0), EndTs: time.Unix(3*3*3600+2, 0), // 9h mod 17s = 2s Step: 17 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 3 * time.Hour, + splitInterval: 3 * time.Hour, }, // step larger than split interval @@ 
-504,58 +718,58 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(0, 0), EndTs: time.Unix(25*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(6*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(6*3600, 0), EndTs: time.Unix(12*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(12*3600, 0), EndTs: time.Unix(18*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(18*3600, 0), EndTs: time.Unix(24*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, &LokiRequest{ StartTs: time.Unix(24*3600, 0), EndTs: time.Unix(30*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 15 * time.Minute, + splitInterval: 15 * time.Minute, }, { input: &LokiRequest{ StartTs: time.Unix(1*3600, 0), EndTs: time.Unix(3*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(0, 0), EndTs: time.Unix(6*3600, 0), Step: 6 * 3600 * seconds, - Query: `rate({app="foo"}[1m])`, + Query: shortRange, }, }, - interval: 15 * time.Minute, + splitInterval: 15 * time.Minute, }, // reduce split by to 6h instead of 1h { @@ -579,7 +793,7 @@ func Test_splitMetricQuery(t *testing.T) { Query: `rate({app="foo"}[6h])`, }, }, - interval: 1 * time.Hour, + splitInterval: 1 * time.Hour, }, // range vector too large we don't want to split it { @@ -587,17 +801,222 @@ func Test_splitMetricQuery(t *testing.T) { StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[7d])`, + Query: longRange, }, expected: []queryrangebase.Request{ &LokiRequest{ StartTs: time.Unix(2*3600, 0), EndTs: time.Unix(3*3*3600, 0), Step: 15 * seconds, - Query: `rate({app="foo"}[7d])`, + Query: longRange, + }, + }, + splitInterval: 15 * time.Minute, + }, + // query is wholly within ingester query window + { + input: &LokiRequest{ + StartTs: refTime.Add(-time.Hour), + EndTs: refTime, + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 05, 30, 0, time.UTC), // start time is aligned down to step of 15s + EndTs: time.Date(2023, 1, 15, 7, 29, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 8, 5, 45, 0, time.UTC), // end time is aligned up to step of 15s + Step: 15 * seconds, + Query: shortRange, }, }, - interval: 15 * time.Minute, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + // query is partially within ingester query window + { + input: &LokiRequest{ + StartTs: refTime.Add(-4 * time.Hour).Add(-30 * time.Minute), + EndTs: refTime, + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + // regular intervals until `query_ingesters_within` window + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 3, 35, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 
15, 3, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 4, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 5, 5, 15, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + // and then different intervals for queries to ingesters + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 5, 5, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 5, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 7, 29, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 30, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 8, 5, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + }, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + // not within ingester query window + { + input: &LokiRequest{ + StartTs: refTime.Add(-5 * time.Hour), + EndTs: refTime.Add(-4 * time.Hour), + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + // regular intervals until `query_ingesters_within` window + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 3, 5, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 3, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 4, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 4, 5, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + }, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + // ingester query split by disabled + { + input: &LokiRequest{ + StartTs: refTime.Add(-4 * time.Hour), + EndTs: refTime, + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + // regular intervals only, since ingester split duration is 0 + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 4, 5, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 4, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 5, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 6, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 7, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 8, 5, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + }, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 0}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour}, + ), + }, + // ingester query split by enabled, but 
query_store_only is enabled too + { + input: &LokiRequest{ + StartTs: refTime.Add(-4 * time.Hour), + EndTs: refTime, + Step: 15 * seconds, + Query: shortRange, + }, + expected: []queryrangebase.Request{ + // regular intervals only, since ingester split duration is 0 + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 4, 5, 30, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 4, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 5, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 5, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 6, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 6, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 7, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 7, 59, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + &LokiRequest{ + StartTs: time.Date(2023, 1, 15, 8, 0, 0, 0, time.UTC), + EndTs: time.Date(2023, 1, 15, 8, 5, 45, 0, time.UTC), + Step: 15 * seconds, + Query: shortRange, + }, + }, + splitInterval: time.Hour, + splitter: newMetricQuerySplitter( + fakeLimits{ingesterSplitDuration: map[string]time.Duration{tenantID: 90 * time.Minute}}, + ingesterQueryOpts{queryIngestersWithin: 3 * time.Hour, queryStoreOnly: true}, + ), }, } { // Set query plans @@ -612,13 +1031,29 @@ func Test_splitMetricQuery(t *testing.T) { } t.Run(strconv.Itoa(i), func(t *testing.T) { - splits, err := splitMetricByTime(tc.input, tc.interval) + ms := newMetricQuerySplitter(fakeLimits{}, nil) + if tc.splitter != nil { + ms = tc.splitter.(*metricQuerySplitter) + } + + splits, err := ms.split(refTime, []string{tenantID}, tc.input, tc.splitInterval) require.NoError(t, err) - for i, s := range splits { - s := s.(*LokiRequest) - t.Logf(" want: %d start:%s end:%s \n", i, s.StartTs, s.EndTs) + if !assert.Equal(t, tc.expected, splits) { + t.Logf("expected and actual do not match\n") + defer t.Fail() + + if len(tc.expected) != len(splits) { + t.Logf("expected %d splits, got %d\n", len(tc.expected), len(splits)) + return + } + + for j := 0; j < len(tc.expected); j++ { + exp := tc.expected[j] + act := splits[j] + equal := assert.Equal(t, exp, act) + t.Logf("\t#%d [matches: %v]: expected %q/%q got %q/%q\n", j, equal, exp.GetStart(), exp.GetEnd(), act.GetStart(), act.GetEnd()) + } } - require.Equal(t, tc.expected, splits) }) } @@ -646,12 +1081,13 @@ func Test_splitByInterval_Do(t *testing.T) { }, nil }) + defSplitter := newDefaultSplitter(fakeLimits{}, nil) l := WithSplitByLimits(fakeLimits{maxQueryParallelism: 1}, time.Hour) split := SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) @@ -834,11 +1270,12 @@ func Test_series_splitByInterval_Do(t *testing.T) { "1": time.Hour, }, } + defSplitter := newDefaultSplitter(fakeLimits{}, nil) split := SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) @@ -888,12 +1325,13 @@ func Test_series_splitByInterval_Do(t *testing.T) { func Test_seriesvolume_splitByInterval_Do(t *testing.T) { ctx := user.InjectOrgID(context.Background(), "1") + defSplitter := newDefaultSplitter(fakeLimits{}, nil) setup := func(next queryrangebase.Handler, l Limits) queryrangebase.Handler { return SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) } @@ -1050,11 +1488,12 @@ func 
Test_ExitEarly(t *testing.T) { }) l := WithSplitByLimits(fakeLimits{maxQueryParallelism: 1}, time.Hour) + defSplitter := newDefaultSplitter(fakeLimits{}, nil) split := SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) @@ -1132,11 +1571,12 @@ func Test_DoesntDeadlock(t *testing.T) { }) l := WithSplitByLimits(fakeLimits{maxQueryParallelism: n}, time.Hour) + defSplitter := newDefaultSplitter(fakeLimits{}, nil) split := SplitByIntervalMiddleware( testSchemas, l, DefaultCodec, - splitByTime, + defSplitter, nilMetrics, ).Wrap(next) diff --git a/pkg/querier/queryrange/splitters.go b/pkg/querier/queryrange/splitters.go new file mode 100644 index 0000000000000..79e3d5352e06f --- /dev/null +++ b/pkg/querier/queryrange/splitters.go @@ -0,0 +1,297 @@ +package queryrange + +import ( + "time" + + "github.com/prometheus/common/model" + + "github.com/grafana/loki/pkg/logproto" + "github.com/grafana/loki/pkg/querier/queryrange/queryrangebase" + "github.com/grafana/loki/pkg/util" + "github.com/grafana/loki/pkg/util/validation" +) + +type splitter interface { + split(execTime time.Time, tenantIDs []string, request queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) +} + +type defaultSplitter struct { + limits Limits + iqo util.IngesterQueryOptions +} + +func newDefaultSplitter(limits Limits, iqo util.IngesterQueryOptions) *defaultSplitter { + return &defaultSplitter{limits, iqo} +} + +func (s *defaultSplitter) split(execTime time.Time, tenantIDs []string, req queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) { + var ( + reqs []queryrangebase.Request + factory func(start, end time.Time) + endTimeInclusive = true + ) + + switch r := req.(type) { + case *LokiRequest: + endTimeInclusive = false + factory = func(start, end time.Time) { + reqs = append(reqs, &LokiRequest{ + Query: r.Query, + Limit: r.Limit, + Step: r.Step, + Interval: r.Interval, + Direction: r.Direction, + Path: r.Path, + StartTs: start, + EndTs: end, + Plan: r.Plan, + }) + } + case *LokiSeriesRequest: + // metadata queries have end time inclusive. + // Set endTimeInclusive to true so that ForInterval keeps a gap of 1ms between splits to + // avoid querying duplicate data in adjacent queries. + factory = func(start, end time.Time) { + reqs = append(reqs, &LokiSeriesRequest{ + Match: r.Match, + Path: r.Path, + StartTs: start, + EndTs: end, + Shards: r.Shards, + }) + } + case *LabelRequest: + // metadata queries have end time inclusive. + // Set endTimeInclusive to true so that ForInterval keeps a gap of 1ms between splits to + // avoid querying duplicate data in adjacent queries. 
+ factory = func(start, end time.Time) { + reqs = append(reqs, NewLabelRequest(start, end, r.Query, r.Name, r.Path())) + } + case *logproto.IndexStatsRequest: + factory = func(start, end time.Time) { + reqs = append(reqs, &logproto.IndexStatsRequest{ + From: model.TimeFromUnix(start.Unix()), + Through: model.TimeFromUnix(end.Unix()), + Matchers: r.GetMatchers(), + }) + } + case *logproto.VolumeRequest: + factory = func(start, end time.Time) { + reqs = append(reqs, &logproto.VolumeRequest{ + From: model.TimeFromUnix(start.Unix()), + Through: model.TimeFromUnix(end.Unix()), + Matchers: r.GetMatchers(), + Limit: r.Limit, + TargetLabels: r.TargetLabels, + AggregateBy: r.AggregateBy, + }) + } + default: + return nil, nil + } + + var ( + ingesterSplits []queryrangebase.Request + origStart = req.GetStart().UTC() + origEnd = req.GetEnd().UTC() + ) + + start, end, needsIngesterSplits := ingesterQueryBounds(execTime, s.iqo, req) + + if ingesterQueryInterval := validation.MaxDurationPerTenant(tenantIDs, s.limits.IngesterQuerySplitDuration); ingesterQueryInterval != 0 && needsIngesterSplits { + // perform splitting using special interval (`split_ingester_queries_by_interval`) + util.ForInterval(ingesterQueryInterval, start, end, endTimeInclusive, factory) + + // rebound after ingester queries have been split out + end = start + start = req.GetStart().UTC() + if endTimeInclusive { + end = end.Add(-util.SplitGap) + } + + // query only overlaps ingester query window, nothing more to do + if start.After(end) || start.Equal(end) { + return reqs, nil + } + + // copy the splits, reset the results + ingesterSplits = reqs + reqs = nil + } else { + start = origStart + end = origEnd + } + + // perform splitting over the rest of the time range + util.ForInterval(interval, origStart, end, endTimeInclusive, factory) + + // move the ingester splits to the end to maintain correct order + reqs = append(reqs, ingesterSplits...) + return reqs, nil +} + +type metricQuerySplitter struct { + limits Limits + iqo util.IngesterQueryOptions +} + +func newMetricQuerySplitter(limits Limits, iqo util.IngesterQueryOptions) *metricQuerySplitter { + return &metricQuerySplitter{limits, iqo} +} + +// reduceSplitIntervalForRangeVector reduces the split interval for a range query based on the duration of the range vector. +// Large range vector durations will not be split into smaller intervals because it can cause the queries to be slow by over-processing data. +func (s *metricQuerySplitter) reduceSplitIntervalForRangeVector(r *LokiRequest, interval time.Duration) (time.Duration, error) { + maxRange, _, err := maxRangeVectorAndOffsetDuration(r.Plan.AST) + if err != nil { + return 0, err + } + if maxRange > interval { + return maxRange, nil + } + return interval, nil +} + +// Round up to the step before the next interval boundary. 
+func (s *metricQuerySplitter) nextIntervalBoundary(t time.Time, step int64, interval time.Duration) time.Time { + stepNs := step * 1e6 + nsPerInterval := interval.Nanoseconds() + startOfNextInterval := ((t.UnixNano() / nsPerInterval) + 1) * nsPerInterval + // ensure that target is a multiple of steps away from the start time + target := startOfNextInterval - ((startOfNextInterval - t.UnixNano()) % stepNs) + if target == startOfNextInterval { + target -= stepNs + } + return time.Unix(0, target) +} + +func (s *metricQuerySplitter) split(execTime time.Time, tenantIDs []string, r queryrangebase.Request, interval time.Duration) ([]queryrangebase.Request, error) { + var reqs []queryrangebase.Request + + lokiReq := r.(*LokiRequest) + + interval, err := s.reduceSplitIntervalForRangeVector(lokiReq, interval) + if err != nil { + return nil, err + } + + start, end := s.alignStartEnd(r.GetStep(), lokiReq.StartTs, lokiReq.EndTs) + + lokiReq = lokiReq.WithStartEnd(start, end).(*LokiRequest) + + factory := func(start, end time.Time) { + reqs = append(reqs, &LokiRequest{ + Query: lokiReq.Query, + Limit: lokiReq.Limit, + Step: lokiReq.Step, + Interval: lokiReq.Interval, + Direction: lokiReq.Direction, + Path: lokiReq.Path, + StartTs: start, + EndTs: end, + Plan: lokiReq.Plan, + }) + } + + // step is >= configured split interval, let us just split the query interval by step + // TODO this is likely buggy when step >= query range, how should we handle this? + if lokiReq.Step >= interval.Milliseconds() { + util.ForInterval(time.Duration(lokiReq.Step*1e6), lokiReq.StartTs, lokiReq.EndTs, false, factory) + + return reqs, nil + } + + var ( + ingesterSplits []queryrangebase.Request + needsIngesterSplits bool + ) + + origStart := start + origEnd := end + + start, end, needsIngesterSplits = ingesterQueryBounds(execTime, s.iqo, lokiReq) + start, end = s.alignStartEnd(r.GetStep(), start, end) + + if ingesterQueryInterval := validation.MaxDurationPerTenant(tenantIDs, s.limits.IngesterQuerySplitDuration); ingesterQueryInterval != 0 && needsIngesterSplits { + // perform splitting using special interval (`split_ingester_queries_by_interval`) + s.buildMetricSplits(lokiReq.GetStep(), ingesterQueryInterval, start, end, factory) + + // rebound after ingester queries have been split out + // + // the end time should now be the boundary of the `query_ingester_within` window, which is "start" currently; + // but since start is already step-aligned we need to subtract 1ns to align it down by 1 more step so that we + // get a consistent step between splits + end, _ = s.alignStartEnd(r.GetStep(), start.Add(-time.Nanosecond), end) + // we restore the previous start time (the start time of the query) + start = origStart + + // query only overlaps ingester query window, nothing more to do + if start.After(end) || start.Equal(end) { + return reqs, nil + } + + // copy the splits, reset the results + ingesterSplits = reqs + reqs = nil + } else { + start = origStart + end = origEnd + } + + // perform splitting over the rest of the time range + s.buildMetricSplits(lokiReq.GetStep(), interval, start, end, factory) + + // move the ingester splits to the end to maintain correct order + reqs = append(reqs, ingesterSplits...) + + return reqs, nil +} + +func (s *metricQuerySplitter) alignStartEnd(step int64, start, end time.Time) (time.Time, time.Time) { + // step align start and end time of the query. Start time is rounded down and end time is rounded up. 
+ stepNs := step * 1e6 + startNs := start.UnixNano() + + endNs := end.UnixNano() + if mod := endNs % stepNs; mod != 0 { + endNs += stepNs - mod + } + + return time.Unix(0, startNs-startNs%stepNs), time.Unix(0, endNs) +} + +func (s *metricQuerySplitter) buildMetricSplits(step int64, interval time.Duration, start, end time.Time, factory func(start, end time.Time)) { + for splStart := start; splStart.Before(end); splStart = s.nextIntervalBoundary(splStart, step, interval).Add(time.Duration(step) * time.Millisecond) { + splEnd := s.nextIntervalBoundary(splStart, step, interval) + if splEnd.Add(time.Duration(step)*time.Millisecond).After(end) || splEnd.Add(time.Duration(step)*time.Millisecond) == end { + splEnd = end + } + factory(splStart, splEnd) + } +} + +// ingesterQueryBounds determines if we need to split time ranges overlapping the ingester query window (`query_ingesters_within`) +// and retrieve the bounds for those specific splits +func ingesterQueryBounds(execTime time.Time, iqo util.IngesterQueryOptions, req queryrangebase.Request) (time.Time, time.Time, bool) { + start, end := req.GetStart().UTC(), req.GetEnd().UTC() + + // ingesters are not queried, nothing to do + if iqo == nil || iqo.QueryStoreOnly() { + return start, end, false + } + + windowSize := iqo.QueryIngestersWithin() + ingesterWindow := execTime.UTC().Add(-windowSize) + + // clamp to the start time + if ingesterWindow.Before(start) { + ingesterWindow = start + } + + // query range does not overlap with ingester query window, nothing to do + if end.Before(ingesterWindow) { + return start, end, false + } + + return ingesterWindow, end, true +} diff --git a/pkg/util/config.go b/pkg/util/config.go index 6989931fb618e..f54d469690c98 100644 --- a/pkg/util/config.go +++ b/pkg/util/config.go @@ -4,6 +4,7 @@ import ( "fmt" "io" "strings" + "time" "github.com/go-kit/log/level" "github.com/prometheus/common/version" @@ -38,3 +39,10 @@ func PrintConfig(w io.Writer, config interface{}) error { fmt.Fprintf(w, "---\n# Loki Config\n# %s\n%s\n\n", version.Info(), string(lc)) return nil } + +// IngesterQueryOptions exists because querier.Config cannot be passed directly to the queryrange package +// due to an import cycle. +type IngesterQueryOptions interface { + QueryStoreOnly() bool + QueryIngestersWithin() time.Duration +} diff --git a/pkg/util/time.go b/pkg/util/time.go index 8f9e0c01b0a91..b943fea92aad8 100644 --- a/pkg/util/time.go +++ b/pkg/util/time.go @@ -87,6 +87,8 @@ func NewDisableableTicker(interval time.Duration) (func(), <-chan time.Time) { return func() { tick.Stop() }, tick.C } +const SplitGap = time.Millisecond + // ForInterval splits the given start and end time into given interval. // The start and end time in splits would be aligned to the interval // except for the start time of first split and end time of last split which would be kept same as original start/end @@ -107,7 +109,7 @@ func ForInterval(interval time.Duration, start, end time.Time, endTimeInclusive if !newEnd.Before(end) { newEnd = end } else if endTimeInclusive { - newEnd = newEnd.Add(-time.Millisecond) + newEnd = newEnd.Add(-SplitGap) } if firstInterval { callback(ogStart, newEnd) diff --git a/pkg/validation/limits.go b/pkg/validation/limits.go index 7f1f6ea0d7342..d846cfed51b2e 100644 --- a/pkg/validation/limits.go +++ b/pkg/validation/limits.go @@ -106,6 +106,7 @@ type Limits struct { // Query frontend enforced limits. The default is actually parameterized by the queryrange config. 
QuerySplitDuration model.Duration `yaml:"split_queries_by_interval" json:"split_queries_by_interval"` MetadataQuerySplitDuration model.Duration `yaml:"split_metadata_queries_by_interval" json:"split_metadata_queries_by_interval"` + IngesterQuerySplitDuration model.Duration `yaml:"split_ingester_queries_by_interval" json:"split_ingester_queries_by_interval"` MinShardingLookback model.Duration `yaml:"min_sharding_lookback" json:"min_sharding_lookback"` MaxQueryBytesRead flagext.ByteSize `yaml:"max_query_bytes_read" json:"max_query_bytes_read"` MaxQuerierBytesRead flagext.ByteSize `yaml:"max_querier_bytes_read" json:"max_querier_bytes_read"` @@ -299,6 +300,9 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { _ = l.MetadataQuerySplitDuration.Set("24h") f.Var(&l.MetadataQuerySplitDuration, "querier.split-metadata-queries-by-interval", "Split metadata queries by a time interval and execute in parallel. The value 0 disables splitting metadata queries by time. This also determines how cache keys are chosen when label/series result caching is enabled.") + _ = l.IngesterQuerySplitDuration.Set("0s") + f.Var(&l.IngesterQuerySplitDuration, "querier.split-ingester-queries-by-interval", "Interval to use for time-based splitting when a request is within the `query_ingesters_within` window; defaults to `split-queries-by-interval` by setting to 0.") + f.StringVar(&l.DeletionMode, "compactor.deletion-mode", "filter-and-delete", "Deletion mode. Can be one of 'disabled', 'filter-only', or 'filter-and-delete'. When set to 'filter-only' or 'filter-and-delete', and if retention_enabled is true, then the log entry deletion API endpoints are available.") // Deprecated @@ -574,6 +578,12 @@ func (o *Overrides) MetadataQuerySplitDuration(userID string) time.Duration { return time.Duration(o.getOverridesForUser(userID).MetadataQuerySplitDuration) } +// IngesterQuerySplitDuration returns the tenant specific splitby interval applied in the query frontend when querying +// during the `query_ingesters_within` window. +func (o *Overrides) IngesterQuerySplitDuration(userID string) time.Duration { + return time.Duration(o.getOverridesForUser(userID).IngesterQuerySplitDuration) +} + // MaxQueryBytesRead returns the maximum bytes a query can read. func (o *Overrides) MaxQueryBytesRead(_ context.Context, userID string) int { return o.getOverridesForUser(userID).MaxQueryBytesRead.Val() From 10c88aaefd374ca71cc1277953a4e513c9ce733b Mon Sep 17 00:00:00 2001 From: Poyzan <31743851+poyzannur@users.noreply.github.com> Date: Thu, 11 Jan 2024 13:59:43 +0000 Subject: [PATCH 08/21] Bloom compactor/debug compacting with existing metas (#11638) **What this PR does / why we need it**: This branch was used to debug uploading and getting existing metas correctly. Uncommented functions to clean up archive directories will be addressed once debugging bloom filters is completed. The work was verified in the dev cell. 
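The bloomcompactor hunks below re-enable the deferred cleanup of the local block directory and the temporary archive file that had been commented out while debugging. A minimal, standard-library-only sketch of that pattern, assuming nothing about the compactor's actual configuration or logger (the temp-dir naming and plain `log` calls here are stand-ins):

```go
package main

import (
	"log"
	"os"
)

// buildAndShipArchive sketches the defer-based cleanup pattern: do the work in
// scratch locations and always remove them with deferred calls, logging rather
// than failing when the removal itself errors.
func buildAndShipArchive() error {
	localDst, err := os.MkdirTemp("", "bloom-block-")
	if err != nil {
		return err
	}
	defer func() {
		// clean up the local block directory
		if err := os.RemoveAll(localDst); err != nil {
			log.Printf("failed to remove block directory %s: %v", localDst, err)
		}
	}()

	archive, err := os.CreateTemp("", "bloom-archive-")
	if err != nil {
		return err
	}
	defer func() {
		// clean up the temporary archive file once it is no longer needed
		if err := os.Remove(archive.Name()); err != nil {
			log.Printf("failed removing archive file %s: %v", archive.Name(), err)
		}
	}()

	// ... compress the block from localDst into archive and upload it ...

	return archive.Close()
}

func main() {
	if err := buildAndShipArchive(); err != nil {
		log.Fatal(err)
	}
}
```

Logging instead of returning on removal failures means a leftover scratch directory does not fail an otherwise successful compaction, which matches the behaviour in the hunks below.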
--------- Co-authored-by: Paul Rogers --- pkg/bloomcompactor/bloomcompactor.go | 28 +++++++++---------- .../stores/shipper/bloomshipper/client.go | 3 +- .../shipper/bloomshipper/compress_utils.go | 9 +++--- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/pkg/bloomcompactor/bloomcompactor.go b/pkg/bloomcompactor/bloomcompactor.go index 40e4646c7044c..7f999c0ebfad6 100644 --- a/pkg/bloomcompactor/bloomcompactor.go +++ b/pkg/bloomcompactor/bloomcompactor.go @@ -497,13 +497,12 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, localDst := createLocalDirName(c.cfg.WorkingDirectory, job) blockOptions := v1.NewBlockOptions(bt.GetNGramLength(), bt.GetNGramSkip()) - // TODO(poyzannur) enable once debugging is over - //defer func() { - // //clean up the bloom directory - // if err := os.RemoveAll(localDst); err != nil { - // level.Error(logger).Log("msg", "failed to remove block directory", "dir", localDst, "err", err) - // } - //}() + defer func() { + //clean up the bloom directory + if err := os.RemoveAll(localDst); err != nil { + level.Error(logger).Log("msg", "failed to remove block directory", "dir", localDst, "err", err) + } + }() var resultingBlock bloomshipper.Block defer func() { @@ -551,6 +550,7 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, }() if err != nil { + level.Error(logger).Log("err", err) return err } @@ -565,6 +565,7 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, level.Error(logger).Log("msg", "failed merging existing blocks with new chunks", "err", err) return err } + } archivePath := filepath.Join(c.cfg.WorkingDirectory, uuid.New().String()) @@ -575,13 +576,12 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, return err } - // TODO(poyzannur) enable once debugging is over - //defer func() { - // err = os.Remove(archivePath) - // if err != nil { - // level.Error(logger).Log("msg", "failed removing archive file", "err", err, "file", archivePath) - // } - //}() + defer func() { + err = os.Remove(archivePath) + if err != nil { + level.Error(logger).Log("msg", "failed removing archive file", "err", err, "file", archivePath) + } + }() // Do not change the signature of PutBlocks yet. // Once block size is limited potentially, compactNewChunks will return multiple blocks, hence a list is appropriate. 
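The client change in the next hunk passes an empty string instead of `delimiter` to `List`, presumably so that every meta object under the prefix is returned rather than only the first "directory" level. A toy model of that delimiter behaviour, under the assumption that it follows the usual object-store semantics (the in-memory `listKeys` helper is purely illustrative and far simpler than the real object client, which also returns common prefixes and object attributes):

```go
package main

import (
	"fmt"
	"strings"
)

// listKeys mimics an object-store List(prefix, delimiter) call. With a
// non-empty delimiter, keys that contain another delimiter after the prefix
// are hidden behind a common prefix; with an empty delimiter, every key under
// the prefix is returned, however deeply nested.
func listKeys(keys []string, prefix, delimiter string) []string {
	var out []string
	for _, k := range keys {
		if !strings.HasPrefix(k, prefix) {
			continue
		}
		rest := strings.TrimPrefix(k, prefix)
		if delimiter != "" && strings.Contains(rest, delimiter) {
			continue // nested key, only visible when listing recursively
		}
		out = append(out, k)
	}
	return out
}

func main() {
	keys := []string{
		"bloom/table_1/tenant/metas/meta-1.json",
		"bloom/table_1/tenant/metas/nested/meta-2.json",
	}
	prefix := "bloom/table_1/tenant/metas/"

	fmt.Println(listKeys(keys, prefix, "/")) // only meta-1.json
	fmt.Println(listKeys(keys, prefix, ""))  // both keys
}
```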
diff --git a/pkg/storage/stores/shipper/bloomshipper/client.go b/pkg/storage/stores/shipper/bloomshipper/client.go index d5d981cddb9e5..7ab99ea7e3e6e 100644 --- a/pkg/storage/stores/shipper/bloomshipper/client.go +++ b/pkg/storage/stores/shipper/bloomshipper/client.go @@ -133,12 +133,13 @@ func (b *BloomClient) GetMetas(ctx context.Context, params MetaSearchParams) ([] periodClient := b.periodicObjectClients[periodFrom] for _, table := range tables { prefix := filepath.Join(rootFolder, table, params.TenantID, metasFolder) - list, _, err := periodClient.List(ctx, prefix, delimiter) + list, _, err := periodClient.List(ctx, prefix, "") if err != nil { return nil, fmt.Errorf("error listing metas under prefix [%s]: %w", prefix, err) } for _, object := range list { metaRef, err := createMetaRef(object.Key, params.TenantID, table) + if err != nil { return nil, err } diff --git a/pkg/storage/stores/shipper/bloomshipper/compress_utils.go b/pkg/storage/stores/shipper/bloomshipper/compress_utils.go index aa30ec4901f00..96af5e987c3d4 100644 --- a/pkg/storage/stores/shipper/bloomshipper/compress_utils.go +++ b/pkg/storage/stores/shipper/bloomshipper/compress_utils.go @@ -5,10 +5,10 @@ import ( "io" "os" "path/filepath" - "strings" "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/google/uuid" v1 "github.com/grafana/loki/pkg/storage/bloom/v1" ) @@ -42,8 +42,9 @@ func UncompressBloomBlock(block *LazyBlock, workingDirectory string, logger log. if err != nil { return "", fmt.Errorf("error writing data to temp file: %w", err) } + level.Info(logger).Log("msg", "extracting archive", "archive", archivePath, "workingDirectory", workingDirectoryPath, "blockPath", block.BlockPath) defer func() { - os.Remove(archivePath) + err = os.Remove(archivePath) if err != nil { level.Error(logger).Log("msg", "removing archive file", "err", err, "file", archivePath) } @@ -57,7 +58,7 @@ func UncompressBloomBlock(block *LazyBlock, workingDirectory string, logger log. func writeDataToTempFile(workingDirectoryPath string, block *LazyBlock) (string, error) { defer block.Data.Close() - archivePath := filepath.Join(workingDirectoryPath, block.BlockPath[strings.LastIndex(block.BlockPath, "/")+1:]) + archivePath := filepath.Join(workingDirectoryPath, uuid.New().String()) archiveFile, err := os.Create(archivePath) if err != nil { @@ -74,7 +75,7 @@ func writeDataToTempFile(workingDirectoryPath string, block *LazyBlock) (string, func extractArchive(archivePath string, workingDirectoryPath string) error { file, err := os.Open(archivePath) if err != nil { - return fmt.Errorf("error opening archive file %s: %w", file.Name(), err) + return fmt.Errorf("error opening archive file %s: %w", archivePath, err) } return v1.UnTarGz(workingDirectoryPath, file) } From 3b2278d6ff4e0a2fd722848467457afeae60afbb Mon Sep 17 00:00:00 2001 From: Peter Stolz <50801264+PeterStolz@users.noreply.github.com> Date: Thu, 11 Jan 2024 17:09:04 +0100 Subject: [PATCH 09/21] docs: Update _index.md addressing #11575 (#11626) **What this PR does / why we need it**: It improves the docs by addressing #11575 allowing users to use the official fluent-bit helm chart over the deprecated grafana one **Which issue(s) this PR fixes**: Fixes #11575 **Special notes for your reviewer**: @JStickler wanted to take a look at this. 
**Checklist** - [ x ] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ x ] Documentation added - [ ] Tests updated - [ ] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: J Stickler --- docs/sources/send-data/fluentbit/_index.md | 60 ++++++++++++++++++---- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/docs/sources/send-data/fluentbit/_index.md b/docs/sources/send-data/fluentbit/_index.md index c9088fdc8f886..a8d052c7262a9 100644 --- a/docs/sources/send-data/fluentbit/_index.md +++ b/docs/sources/send-data/fluentbit/_index.md @@ -8,7 +8,9 @@ weight: 500 --- # Fluent Bit client -[Fluent Bit](https://fluentbit.io/) is a fast and lightweight logs and metrics processor and forwarder that can be configured with the [Grafana Loki output plugin](https://docs.fluentbit.io/manual/pipeline/outputs/loki) to ship logs to Loki. You can define which log files you want to collect using the [`Tail`](https://docs.fluentbit.io/manual/pipeline/inputs/tail) or [`Stdin`](https://docs.fluentbit.io/manual/pipeline/inputs/standard-input) data pipeline input. Additionally, Fluent Bit supports multiple `Filter` and `Parser` plugins (`Kubernetes`, `JSON`, etc.) to structure and alter log lines. +[Fluent Bit](https://fluentbit.io/) is a fast and lightweight logs and metrics processor and forwarder that can be configured with the Grafana Fluent Bit Plugin described here or with the [Fluent-bit Loki output plugin](https://docs.fluentbit.io/manual/pipeline/outputs/loki) to ship logs to Loki. +This plugin has more configuration options compared to the built-in Fluent Bit Loki plugin. +You can define which log files you want to collect using the [`Tail`](https://docs.fluentbit.io/manual/pipeline/inputs/tail) or [`Stdin`](https://docs.fluentbit.io/manual/pipeline/inputs/standard-input) data pipeline input. Additionally, Fluent Bit supports multiple `Filter` and `Parser` plugins (`Kubernetes`, `JSON`, etc.) to structure and alter log lines. ## Usage @@ -63,23 +65,59 @@ To ship logs from Docker containers to Grafana Cloud using Fluent Bit, you can u You can run Fluent Bit as a [Daemonset](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to collect all your Kubernetes workload logs. 
-To do so you can use our [Fluent Bit helm chart](https://github.com/grafana/helm-charts/tree/main/charts/fluent-bit): +To do so you can use the [Fluent Bit helm chart](https://github.com/fluent/helm-charts) with the following `values.yaml` changing the value of `FLUENT_LOKI_URL`: + +```yaml +image: + # Here we use the Docker image which has the plugin installed + repository: grafana/fluent-bit-plugin-loki + tag: main-e2ed1c0 + +args: + - "-e" + - "/fluent-bit/bin/out_grafana_loki.so" + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + +env: + # Note that for security reasons you should fetch the credentials through a Kubernetes Secret https://kubernetes.io/docs/concepts/configuration/secret/ . You may use the envFrom for this. + - name: FLUENT_LOKI_URL + value: https://user:pass@your-loki.endpoint/loki/api/v1/push + +config: + inputs: | + [INPUT] + Name tail + Tag kube.* + Path /var/log/containers/*.log + # Be aware that local clusters like docker-desktop or kind use the docker log format and not the cri (https://docs.fluentbit.io/manual/installation/kubernetes#container-runtime-interface-cri-parser) + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + + outputs: | + [Output] + Name grafana-loki + Match kube.* + Url ${FLUENT_LOKI_URL} + Labels {job="fluent-bit"} + LabelKeys level,app # this sets the values for actual Loki streams and the other labels are converted to structured_metadata https://grafana.com/docs/loki/latest/get-started/labels/structured-metadata/ + BatchWait 1 + BatchSize 1001024 + LineFormat json + LogLevel info + AutoKubernetesLabels true +``` ```bash -helm repo add grafana https://grafana.github.io/helm-charts +helm repo add fluent https://fluent.github.io/helm-charts helm repo update -helm upgrade --install fluent-bit grafana/fluent-bit \ - --set loki.serviceName=loki.svc.cluster.local +helm install fluent-bit fluent/fluent-bit -f values.yaml ``` By default it will collect all containers logs and extract labels from Kubernetes API (`container_name`, `namespace`, etc..). -Alternatively you can install the Loki and Fluent Bit all together using: - -```bash -helm upgrade --install loki-stack grafana/loki-stack \ - --set fluent-bit.enabled=true,promtail.enabled=false -``` +If you also want to host your Loki instance inside the cluster install the [official Loki helm chart](https://grafana.com/docs/loki/latest/setup/install/helm/). ### AWS Elastic Container Service (ECS) From c9c8692d21d34f35de84ae7e629f425f34bd1ab6 Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Thu, 11 Jan 2024 17:59:53 +0100 Subject: [PATCH 10/21] Fix backend target in docker compose (#11663) **What this PR does / why we need it**: At https://github.com/grafana/loki/pull/9899 added the new `backend` SSD target to the docker compose we ship at `production/docker`. Later on, https://github.com/grafana/loki/pull/8836 removed it since the current stable version of Loki was 2.7.3 and `backend` was introduced with 2.8 (RC at the time of this PR). The backend target is now GA, this PR enables it back in our provided docker compose. 
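For context, the re-enabled service looks roughly like the sketch below in `production/docker/docker-compose.yaml`. This is an illustration only and not part of this patch's diff; the service name, exposed ports, and volume mounts are assumptions modelled on the existing `loki-read`/`loki-write` services:

```yaml
  # Illustrative sketch only; the actual service definition lives in the compose file, not in this diff.
  loki-backend:
    image: *lokiImage                # same image anchor as the read/write services
    volumes:
      - ./config:/etc/loki/
      - ./rules:/loki/rules:ro       # the ruler now runs in the backend target
    ports:
      - "3100"
      - "7946"
    command: "-config.file=/etc/loki/loki.yaml -target=backend -legacy-read-mode=false"
    networks:
      - loki
    restart: always
```

This mirrors the split described in the next patch, where the `read` target keeps only the query path and the scheduler, ruler, compactor, and gateways move to `backend`.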
Additionally, the following improvements/updates are made: - Move storage config from `common.storage.s3` to `storage.aws` - Remove schema period using boltdb-shipper - Update tsdb schema to latest (v13) - Expose minio web console --- production/docker/config/loki.yaml | 32 ++++++++++++++++-------- production/docker/config/prometheus.yaml | 1 + production/docker/docker-compose.yaml | 7 +++--- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/production/docker/config/loki.yaml b/production/docker/config/loki.yaml index 6e4541164a235..0a124e5ccfaae 100644 --- a/production/docker/config/loki.yaml +++ b/production/docker/config/loki.yaml @@ -9,19 +9,20 @@ server: common: path_prefix: /loki - storage: - s3: - endpoint: minio:9000 - insecure: true - bucketnames: loki-data - access_key_id: loki - secret_access_key: supersecret - s3forcepathstyle: true - compactor_address: http://loki-write:3100 + compactor_address: http://loki-backend:3100 replication_factor: 3 +storage_config: + aws: + endpoint: minio:9000 + insecure: true + bucketnames: loki-data + access_key_id: loki + secret_access_key: supersecret + s3forcepathstyle: true + memberlist: - join_members: ["loki-read", "loki-write"] + join_members: ["loki-read", "loki-write", "loki-backend"] dead_node_reclaim_time: 30s gossip_to_dead_nodes_time: 15s left_ingesters_timeout: 30s @@ -54,6 +55,10 @@ ruler: enable_sharding: true wal: dir: /loki/ruler-wal + evaluation: + mode: remote + query_frontend: + address: dns:///loki-read:9095 storage: type: local local: @@ -85,6 +90,13 @@ schema_config: index: prefix: index_ period: 24h + - from: 2024-01-10 + store: tsdb + object_store: s3 + schema: v12 + index: + prefix: index_ + period: 24h limits_config: diff --git a/production/docker/config/prometheus.yaml b/production/docker/config/prometheus.yaml index 9bb03bb209047..3369106f94001 100644 --- a/production/docker/config/prometheus.yaml +++ b/production/docker/config/prometheus.yaml @@ -11,6 +11,7 @@ scrape_configs: - names: - loki-read - loki-write + - loki-backend type: A port: 3100 - job_name: 'promtail' diff --git a/production/docker/docker-compose.yaml b/production/docker/docker-compose.yaml index 5c1b93f829173..a4f74c7bb1182 100644 --- a/production/docker/docker-compose.yaml +++ b/production/docker/docker-compose.yaml @@ -89,7 +89,7 @@ services: - | mkdir -p /data/loki-data && \ mkdir -p /data/loki-ruler && - minio server /data + minio server --address "0.0.0.0:9000" --console-address "0.0.0.0:9001" /data environment: - MINIO_ROOT_USER=loki - MINIO_ROOT_PASSWORD=supersecret @@ -97,6 +97,7 @@ services: - MINIO_UPDATE=off ports: - "9000:9000" + - "9001:9001" volumes: - ./.data/minio:/data networks: @@ -116,7 +117,6 @@ services: image: *lokiImage volumes: - ./config:/etc/loki/ - - ./rules:/loki/rules:ro # only needed for interactive debugging with dlv # cap_add: # - SYS_PTRACE @@ -127,7 +127,7 @@ services: - "7946" # uncomment to use interactive debugging # - "40000-40002:40000" # makes the replicas available on ports 40000, 40001, 40002 - command: "-config.file=/etc/loki/loki.yaml -target=read" + command: "-config.file=/etc/loki/loki.yaml -target=read -legacy-read-mode=false" networks: - loki restart: always @@ -161,6 +161,7 @@ services: image: *lokiImage volumes: - ./config:/etc/loki/ + - ./rules:/loki/rules:ro # only needed for interactive debugging with dlv # cap_add: # - SYS_PTRACE From 9287c93dd3edf1ecb2e41792919ceaf39e09ac59 Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Thu, 11 Jan 2024 18:00:03 +0100 Subject: [PATCH 11/21] 
Support bloom compactor and gateway in SSD mode (#11661) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **What this PR does / why we need it**: This PR fixes adds support for running the bloom compactor and gateway in SSD mode. If `legacy-read-mode` is `true` (default): - `write` runs: Ingester, Distributor. - `read` runs: QueryFrontend, Querier, QueryScheduler, Ruler, Compactor, IndexGateway, **BloomGateway**, **BloomCompactor**. Otherwise: - `write` runs: Ingester, Distributor. - `read`: QueryFrontend, Querier. - `backend`: QueryScheduler, Ruler, Compactor, IndexGateway, **BloomGateway**, **BloomCompactor**. I tested this out with a local SSD deployment and seems to be building blooms fine (when bloom compactor is enabled). ``` $ tree .data/minio/loki-data/bloom -I "*meta" -I "part*" -L 5 .data/minio/loki-data/bloom └── index_19733 └── docker ├── blooms │   ├── 38dad41ad5f1b79-fe294823bd539109 │   │   ├── 1704976430461-1704978045667-7e33bf85 │   │   ├── 1704976430461-1704978922898-dd3be57e │   │   ├── 1704976430461-1704979829636-33ae0d37 │   │   ├── 1704976430461-1704980743484-33ae0d37 │   │   ├── 1704976430461-1704981642000-33ae0d37 │   │   └── 1704976430461-1704982541351-33ae0d37 │   ├── 437383945fca166-f5ebd41d2edc508f │   │   ├── 1704976430460-1704978044909-b2767d8c │   │   ├── 1704976430460-1704978923151-183db34c │   │   ├── 1704976430460-1704979833202-183db34c │   │   ├── 1704976430460-1704980745263-183db34c │   │   ├── 1704976430460-1704981641237-183db34c │   │   └── 1704976430460-1704982542614-183db34c │   └── 46938d684d3cf87-fd4f06a77900049a │   ├── 1704976430447-1704978045918-ba691f3c │   ├── 1704976430447-1704978045918-fe558bee │   ├── 1704976430447-1704978921878-de5e8dc0 │   ├── 1704976430447-1704979833964-de5e8dc0 │   ├── 1704976430447-1704980744753-de5e8dc0 │   ├── 1704976430447-1704981640725-de5e8dc0 │   └── 1704976430447-1704982541858-de5e8dc0 └── metas ├── 38dad41ad5f1b79-fe294823bd539109-1704976430461-1704978045667-1d179deb ├── 38dad41ad5f1b79-fe294823bd539109-1704976430461-1704978045667-1f3b9273 ├── 38dad41ad5f1b79-fe294823bd539109-1704976430461-1704978922898-296c7d14 ├── 38dad41ad5f1b79-fe294823bd539109-1704976430461-1704979829636-a3833f21 ├── 38dad41ad5f1b79-fe294823bd539109-1704976430461-1704980743484-8abbc022 ├── 38dad41ad5f1b79-fe294823bd539109-1704976430461-1704981642000-7d80fe65 ├── 38dad41ad5f1b79-fe294823bd539109-1704976430461-1704982541351-cfcbb8b6 ├── 437383945fca166-f5ebd41d2edc508f-1704976430460-1704978044909-bdb8aad0 ├── 437383945fca166-f5ebd41d2edc508f-1704976430460-1704978044909-e0d7b52c ├── 437383945fca166-f5ebd41d2edc508f-1704976430460-1704978923151-692d2033 ├── 437383945fca166-f5ebd41d2edc508f-1704976430460-1704979833202-d0b8c046 ├── 437383945fca166-f5ebd41d2edc508f-1704976430460-1704980745263-aa0521f ├── 437383945fca166-f5ebd41d2edc508f-1704976430460-1704981641237-87636172 ├── 437383945fca166-f5ebd41d2edc508f-1704976430460-1704982542614-dba1fd47 ├── 46938d684d3cf87-fd4f06a77900049a-1704976430447-1704978045918-8fb7a27c ├── 46938d684d3cf87-fd4f06a77900049a-1704976430447-1704978045918-e031baad ├── 46938d684d3cf87-fd4f06a77900049a-1704976430447-1704978921878-635ccc8f ├── 46938d684d3cf87-fd4f06a77900049a-1704976430447-1704979833964-376036ef ├── 46938d684d3cf87-fd4f06a77900049a-1704976430447-1704980744753-628b5958 ├── 46938d684d3cf87-fd4f06a77900049a-1704976430447-1704981640725-7cefe675 └── 46938d684d3cf87-fd4f06a77900049a-1704976430447-1704982541858-1e4de9d4 ``` **Which issue(s) this PR fixes**: Fixes # 
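For anyone trying this out locally, a minimal configuration sketch for enabling the bloom components is shown below. It is not taken from this PR's diff; the keys mirror the ones used in the integration test config added later in this series, and the directory paths are placeholders:

```yaml
# Sketch only: enable the bloom gateway and bloom compactor for an SSD deployment.
# Directory paths are placeholders; adjust them to your environment.
storage_config:
  bloom_shipper:
    working_directory: /loki/bloom-shipper

bloom_gateway:
  enabled: true

bloom_compactor:
  enabled: true
  working_directory: /loki/bloom-compactor
```

Combined with `-legacy-read-mode=false`, these components are then scheduled onto the `backend` target as listed above.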
**Special notes for your reviewer**: **Checklist** - [ ] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ ] Documentation added - [ ] Tests updated - [ ] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --- pkg/loki/loki.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/loki/loki.go b/pkg/loki/loki.go index 0c4a50f813ed9..d4b58fac838f3 100644 --- a/pkg/loki/loki.go +++ b/pkg/loki/loki.go @@ -647,7 +647,7 @@ func (t *Loki) setupModuleManager() error { Read: {QueryFrontend, Querier}, Write: {Ingester, Distributor}, - Backend: {QueryScheduler, Ruler, Compactor, IndexGateway}, + Backend: {QueryScheduler, Ruler, Compactor, IndexGateway, BloomGateway, BloomCompactor}, All: {QueryScheduler, QueryFrontend, Querier, Ingester, Distributor, Ruler, Compactor}, } @@ -694,13 +694,12 @@ func (t *Loki) setupModuleManager() error { } // Add bloom gateway ring in client mode to IndexGateway service dependencies if bloom filtering is enabled. - if t.Cfg.isModuleEnabled(IndexGateway) && t.Cfg.BloomGateway.Enabled { + if t.Cfg.BloomGateway.Enabled { deps[IndexGateway] = append(deps[IndexGateway], BloomGatewayRing) } - //TODO(poyzannur) not sure this is needed for BloomCompactor if t.Cfg.LegacyReadTarget { - deps[Read] = append(deps[Read], QueryScheduler, Ruler, Compactor, IndexGateway, BloomGateway, BloomCompactor) + deps[Read] = append(deps[Read], deps[Backend]...) } if t.Cfg.InternalServer.Enable { From 9759c130fe5e5b52de8afb5d86195e5188a4f37e Mon Sep 17 00:00:00 2001 From: Trevor Whitney Date: Thu, 11 Jan 2024 14:08:44 -0700 Subject: [PATCH 12/21] fix: align semantics of metric and log query label extraction (#11587) both metric and log queries use the first extracted label when multiple values are requested for the same label Fixes #11647 --- CHANGELOG.md | 1 + pkg/logql/log/parser.go | 5 ++++- pkg/logql/log/parser_hints.go | 31 +++++++++++++++--------------- pkg/logql/log/parser_hints_test.go | 20 ++++++++++++++----- 4 files changed, 35 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f9d2d38fbbfed..46e9a24daf1ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ * [11601](https://github.com/grafana/loki/pull/11601) **dannykopping** Ruler: Fixed a panic that can be caused by concurrent read-write access of tenant configs when there are a large amount of rules. * [11606](https://github.com/grafana/loki/pull/11606) **dannykopping** Fixed regression adding newlines to HTTP error response bodies which may break client integrations. 
* [11657](https://github.com/grafana/loki/pull/11657) **ashwanthgoli** Log results cache: compose empty response based on the request being served to avoid returning incorrect limit or direction. +* [11587](https://github.com/grafana/loki/pull/11587) **trevorwhitney** Fix semantics of label parsing logic of metrics and logs queries. Both only parse the first label if multiple extractions into the same label are requested. ##### Changes diff --git a/pkg/logql/log/parser.go b/pkg/logql/log/parser.go index be059a2831560..c03e7c91cb960 100644 --- a/pkg/logql/log/parser.go +++ b/pkg/logql/log/parser.go @@ -493,11 +493,13 @@ func (l *LogfmtExpressionParser) Process(_ int64, line []byte, lbs *LabelsBuilde return "", false } - if !lbs.ParserLabelHints().ShouldExtract(sanitized) { + _, alwaysExtract := keys[sanitized] + if !alwaysExtract && !lbs.ParserLabelHints().ShouldExtract(sanitized) { return "", false } return sanitized, true }) + if !ok { continue } @@ -530,6 +532,7 @@ func (l *LogfmtExpressionParser) Process(_ int64, line []byte, lbs *LabelsBuilde } } } + if l.strict && l.dec.Err() != nil { addErrLabel(errLogfmt, l.dec.Err(), lbs) return line, true diff --git a/pkg/logql/log/parser_hints.go b/pkg/logql/log/parser_hints.go index cdb61015dd4dd..a8b1f73f3109d 100644 --- a/pkg/logql/log/parser_hints.go +++ b/pkg/logql/log/parser_hints.go @@ -58,10 +58,6 @@ type Hints struct { } func (p *Hints) ShouldExtract(key string) bool { - if len(p.requiredLabels) == 0 { - return true - } - for _, l := range p.extracted { if l == key { return false @@ -74,7 +70,7 @@ func (p *Hints) ShouldExtract(key string) bool { } } - return false + return len(p.requiredLabels) == 0 } func (p *Hints) ShouldExtractPrefix(prefix string) bool { @@ -95,19 +91,25 @@ func (p *Hints) NoLabels() bool { } func (p *Hints) RecordExtracted(key string) { - for _, l := range p.requiredLabels { - if l == key { - p.extracted = append(p.extracted, key) - return - } - } + p.extracted = append(p.extracted, key) } func (p *Hints) AllRequiredExtracted() bool { - if len(p.requiredLabels) == 0 { + if len(p.requiredLabels) == 0 || len(p.extracted) < len(p.requiredLabels) { return false } - return len(p.extracted) == len(p.requiredLabels) + + found := 0 + for _, l := range p.requiredLabels { + for _, e := range p.extracted { + if l == e { + found++ + break + } + } + } + + return len(p.requiredLabels) == found } func (p *Hints) Reset() { @@ -172,9 +174,6 @@ func NewParserHint(requiredLabelNames, groups []string, without, noLabels bool, return ph } - ph.requiredLabels = hints - ph.shouldPreserveError = containsError(hints) - return &Hints{requiredLabels: hints, extracted: extracted, shouldPreserveError: containsError(hints)} } diff --git a/pkg/logql/log/parser_hints_test.go b/pkg/logql/log/parser_hints_test.go index ac232bfd871b4..42d0134bc1d8f 100644 --- a/pkg/logql/log/parser_hints_test.go +++ b/pkg/logql/log/parser_hints_test.go @@ -28,7 +28,10 @@ var ( "response": { "status": 204, "latency_seconds": "30.001" - } + }, + "message": { + "message": "foo", + } }`) packedLine = []byte(`{ @@ -58,14 +61,14 @@ func Test_ParserHints(t *testing.T) { jsonLine, true, 1.0, - `{app="nginx", cluster="us-central-west", cluster_extracted="us-east-west", protocol="HTTP/2.0", remote_user="foo", request_host="foo.grafana.net", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_latency_seconds="30.001", response_status="204", upstream_addr="10.0.0.1:80"}`, + `{app="nginx", cluster="us-central-west", 
cluster_extracted="us-east-west", message_message="foo", protocol="HTTP/2.0", remote_user="foo", request_host="foo.grafana.net", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_latency_seconds="30.001", response_status="204", upstream_addr="10.0.0.1:80"}`, }, { `sum without (request_host,app,cluster) (rate({app="nginx"} | json | __error__="" | response_status = 204 [1m]))`, jsonLine, true, 1.0, - `{cluster_extracted="us-east-west", protocol="HTTP/2.0", remote_user="foo", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_latency_seconds="30.001", response_status="204", upstream_addr="10.0.0.1:80"}`, + `{cluster_extracted="us-east-west", message_message="foo", protocol="HTTP/2.0", remote_user="foo", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_latency_seconds="30.001", response_status="204", upstream_addr="10.0.0.1:80"}`, }, { `sum by (request_host,app) (rate({app="nginx"} | json | __error__="" | response_status = 204 [1m]))`, @@ -114,14 +117,14 @@ func Test_ParserHints(t *testing.T) { jsonLine, true, 30.001, - `{app="nginx", cluster="us-central-west", cluster_extracted="us-east-west", protocol="HTTP/2.0", remote_user="foo", request_host="foo.grafana.net", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_status="204", upstream_addr="10.0.0.1:80"}`, + `{app="nginx", cluster="us-central-west", cluster_extracted="us-east-west", message_message="foo", protocol="HTTP/2.0", remote_user="foo", request_host="foo.grafana.net", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_status="204", upstream_addr="10.0.0.1:80"}`, }, { `sum without (request_host,app,cluster)(rate({app="nginx"} | json | response_status = 204 | unwrap response_latency_seconds [1m]))`, jsonLine, true, 30.001, - `{cluster_extracted="us-east-west", protocol="HTTP/2.0", remote_user="foo", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_status="204", upstream_addr="10.0.0.1:80"}`, + `{cluster_extracted="us-east-west", message_message="foo", protocol="HTTP/2.0", remote_user="foo", request_method="POST", request_size="101", request_time="30.001", request_uri="/rpc/v2/stage", response_status="204", upstream_addr="10.0.0.1:80"}`, }, { `sum(rate({app="nginx"} | logfmt | org_id=3677 | unwrap Ingester_TotalReached[1m]))`, @@ -214,6 +217,13 @@ func Test_ParserHints(t *testing.T) { 0, ``, }, + { + `sum by (message_message,app)(count_over_time({app="nginx"} | json | response_status = 204 and remote_user = "foo"[1m]))`, + jsonLine, + true, + 1, + `{app="nginx", message_message="foo"}`, + }, } { tt := tt t.Run(tt.expr, func(t *testing.T) { From 6bcac00c70055fd06e4606c96f3cdeae02b8e302 Mon Sep 17 00:00:00 2001 From: Christian Haudum Date: Fri, 12 Jan 2024 09:26:08 +0100 Subject: [PATCH 13/21] Add e2e tests for bloom filtering (#11645) **What?** This PR adds an end-to-end integration test for creating bloom filters and using them in the gateway to filter chunks. **Why?** This test helped to identify bugs in the bloom shipper code that would have taken a very long time to discover in other ways, such as deploying to a dev environment or running a long-running docker compose setup. 
**Notes**

The following commits of this PR are the actual changes to the compactor/gateway code to make the e2e test work:

* https://github.com/grafana/loki/pull/11645/commits/2dcc761ba069a377ece1a3a48fa6c6c59039043c
* https://github.com/grafana/loki/pull/11645/commits/84052ddb34c012e12d3e98b86317a6b18bc3d76e
* https://github.com/grafana/loki/pull/11645/commits/3ba44f83c81bb305d3fb08d0141b6f080bde03ea

The bloom gateway code path for processing blocks was cleaned up, because it still contained the unused "sequential processing" path for blocks, which was initially kept to verify that the callback-based processing works the same way:

* https://github.com/grafana/loki/pull/11645/commits/f55d79b84fa6d6118fd66720aa107ba15ef0b862

---------

Signed-off-by: Christian Haudum
---
 integration/client/client.go                 |  19 +-
 integration/cluster/cluster.go               |  26 ++-
 integration/loki_micro_services_test.go      | 171 ++++++++++++++++++
 pkg/bloomcompactor/bloomcompactor.go         |  13 +-
 pkg/bloomgateway/bloomgateway.go             |   7 +-
 pkg/bloomgateway/bloomgateway_test.go        | 145 +++++++--------
 pkg/bloomgateway/worker.go                   |  22 +--
 .../stores/shipper/bloomshipper/client.go    |   2 +-
 .../stores/shipper/bloomshipper/shipper.go   |  95 +++++-----
 .../shipper/bloomshipper/shipper_test.go     |  84 ++++-----
 .../stores/shipper/bloomshipper/store.go     |  38 ----
 .../indexshipper/indexgateway/gateway.go     |   6 +-
 12 files changed, 372 insertions(+), 256 deletions(-)

diff --git a/integration/client/client.go b/integration/client/client.go
index dcf2c036dc9e9..2e5a86aa6b3de 100644
--- a/integration/client/client.go
+++ b/integration/client/client.go
@@ -479,12 +479,21 @@ type Header struct {
 	Name, Value string
 }
 
-// RunRangeQuery runs a query and returns an error if anything went wrong
+// RunRangeQuery runs a 7d query and returns an error if anything went wrong
+// This function is kept to keep backwards compatibility of existing tests.
+// Better use (*Client).RunRangeQueryWithStartEnd()
 func (c *Client) RunRangeQuery(ctx context.Context, query string, extraHeaders ...Header) (*Response, error) {
+	end := c.Now.Add(time.Second)
+	start := c.Now.Add(-7 * 24 * time.Hour)
+	return c.RunRangeQueryWithStartEnd(ctx, query, start, end, extraHeaders...)
+}
+
+// RunRangeQueryWithStartEnd runs a query over the given time range and returns an error if anything went wrong
+func (c *Client) RunRangeQueryWithStartEnd(ctx context.Context, query string, start, end time.Time, extraHeaders ...Header) (*Response, error) {
 	ctx, cancelFunc := context.WithTimeout(ctx, requestTimeout)
 	defer cancelFunc()
 
-	buf, statusCode, err := c.run(ctx, c.rangeQueryURL(query), extraHeaders...)
+	buf, statusCode, err := c.run(ctx, c.rangeQueryURL(query, start, end), extraHeaders...)
if err != nil { return nil, err } @@ -555,11 +564,11 @@ func (c *Client) parseResponse(buf []byte, statusCode int) (*Response, error) { return &lokiResp, nil } -func (c *Client) rangeQueryURL(query string) string { +func (c *Client) rangeQueryURL(query string, start, end time.Time) string { v := url.Values{} v.Set("query", query) - v.Set("start", formatTS(c.Now.Add(-7*24*time.Hour))) - v.Set("end", formatTS(c.Now.Add(time.Second))) + v.Set("start", formatTS(start)) + v.Set("end", formatTS(end)) u, err := url.Parse(c.baseURL) if err != nil { diff --git a/integration/cluster/cluster.go b/integration/cluster/cluster.go index 8ddeac00f1782..831da46f2cb99 100644 --- a/integration/cluster/cluster.go +++ b/integration/cluster/cluster.go @@ -43,7 +43,6 @@ server: grpc_server_max_recv_msg_size: 110485813 grpc_server_max_send_msg_size: 110485813 - common: path_prefix: {{.dataPath}} storage: @@ -70,14 +69,25 @@ storage_config: store-1: directory: {{.sharedDataPath}}/fs-store-1 boltdb_shipper: - active_index_directory: {{.dataPath}}/index + active_index_directory: {{.dataPath}}/boltdb-index cache_location: {{.dataPath}}/boltdb-cache tsdb_shipper: active_index_directory: {{.dataPath}}/tsdb-index cache_location: {{.dataPath}}/tsdb-cache + bloom_shipper: + working_directory: {{.dataPath}}/bloom-shipper + blocks_downloading_queue: + workers_count: 1 + +bloom_gateway: + enabled: false + +bloom_compactor: + enabled: false + working_directory: {{.dataPath}}/bloom-compactor compactor: - working_directory: {{.dataPath}}/retention + working_directory: {{.dataPath}}/compactor retention_enabled: true delete_request_store: store-1 @@ -154,14 +164,14 @@ func New(logLevel level.Value, opts ...func(*Cluster)) *Cluster { } resetMetricRegistry() - sharedPath, err := os.MkdirTemp("", "loki-shared-data") + sharedPath, err := os.MkdirTemp("", "loki-shared-data-") if err != nil { panic(err.Error()) } overridesFile := filepath.Join(sharedPath, "loki-overrides.yaml") - err = os.WriteFile(filepath.Join(sharedPath, "loki-overrides.yaml"), []byte(`overrides:`), 0777) + err = os.WriteFile(overridesFile, []byte(`overrides:`), 0777) if err != nil { panic(fmt.Errorf("error creating overrides file: %w", err)) } @@ -318,12 +328,12 @@ func port(addr string) string { func (c *Component) writeConfig() error { var err error - configFile, err := os.CreateTemp("", "loki-config") + configFile, err := os.CreateTemp("", fmt.Sprintf("loki-%s-config-*.yaml", c.name)) if err != nil { return fmt.Errorf("error creating config file: %w", err) } - c.dataPath, err = os.MkdirTemp("", "loki-data") + c.dataPath, err = os.MkdirTemp("", fmt.Sprintf("loki-%s-data-", c.name)) if err != nil { return fmt.Errorf("error creating data path: %w", err) } @@ -408,6 +418,8 @@ func (c *Component) run() error { c.configFile, "-limits.per-user-override-config", c.overridesFile, + "-limits.per-user-override-period", + "1s", ), flagset); err != nil { return err } diff --git a/integration/loki_micro_services_test.go b/integration/loki_micro_services_test.go index a4d03ed10a673..1f7dc836b5ff6 100644 --- a/integration/loki_micro_services_test.go +++ b/integration/loki_micro_services_test.go @@ -3,11 +3,14 @@ package integration import ( "context" "encoding/json" + "fmt" + "math/rand" "strings" "sync" "testing" "time" + "github.com/go-kit/log/level" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "github.com/prometheus/prometheus/model/labels" @@ -1056,6 +1059,174 @@ func TestCategorizedLabels(t *testing.T) { } } +func 
TestBloomFiltersEndToEnd(t *testing.T) { + commonFlags := []string{ + "-bloom-compactor.compaction-interval=2s", + "-bloom-compactor.enable-compaction=true", + "-bloom-compactor.enabled=true", + "-bloom-gateway.enable-filtering=true", + "-bloom-gateway.enabled=true", + "-compactor.compaction-interval=1s", + "-frontend.default-validity=0s", + "-ingester.flush-on-shutdown=true", + "-ingester.wal-enabled=false", + "-query-scheduler.use-scheduler-ring=false", + "-store.index-cache-read.embedded-cache.enabled=true", + } + + tenantID := randStringRunes() + + clu := cluster.New( + level.DebugValue(), + cluster.SchemaWithTSDB, + func(c *cluster.Cluster) { c.SetSchemaVer("v13") }, + ) + + defer func() { + assert.NoError(t, clu.Cleanup()) + }() + + var ( + tDistributor = clu.AddComponent( + "distributor", + append( + commonFlags, + "-target=distributor", + )..., + ) + tIndexGateway = clu.AddComponent( + "index-gateway", + append( + commonFlags, + "-target=index-gateway", + )..., + ) + _ = clu.AddComponent( + "bloom-gateway", + append( + commonFlags, + "-target=bloom-gateway", + )..., + ) + ) + require.NoError(t, clu.Run()) + + var ( + tIngester = clu.AddComponent( + "ingester", + append( + commonFlags, + "-target=ingester", + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + tQueryScheduler = clu.AddComponent( + "query-scheduler", + append( + commonFlags, + "-target=query-scheduler", + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + tCompactor = clu.AddComponent( + "compactor", + append( + commonFlags, + "-target=compactor", + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + _ = clu.AddComponent( + "bloom-compactor", + append( + commonFlags, + "-target=bloom-compactor", + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + ) + require.NoError(t, clu.Run()) + + // finally, run the query-frontend and querier. 
+ var ( + tQueryFrontend = clu.AddComponent( + "query-frontend", + append( + commonFlags, + "-target=query-frontend", + "-frontend.scheduler-address="+tQueryScheduler.GRPCURL(), + "-common.compactor-address="+tCompactor.HTTPURL(), + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + _ = clu.AddComponent( + "querier", + append( + commonFlags, + "-target=querier", + "-querier.scheduler-address="+tQueryScheduler.GRPCURL(), + "-common.compactor-address="+tCompactor.HTTPURL(), + "-tsdb.shipper.index-gateway-client.server-address="+tIndexGateway.GRPCURL(), + )..., + ) + ) + require.NoError(t, clu.Run()) + + now := time.Now() + + cliDistributor := client.New(tenantID, "", tDistributor.HTTPURL()) + cliDistributor.Now = now + + cliIngester := client.New(tenantID, "", tIngester.HTTPURL()) + cliIngester.Now = now + + cliQueryFrontend := client.New(tenantID, "", tQueryFrontend.HTTPURL()) + cliQueryFrontend.Now = now + + cliIndexGateway := client.New(tenantID, "", tIndexGateway.HTTPURL()) + cliIndexGateway.Now = now + + lineTpl := `caller=loki_micro_services_test.go msg="push log line" id="%s"` + // ingest logs from 10 different pods + // each line contains a random, unique string + // that string is used to verify filtering using bloom gateway + uniqueStrings := make([]string, 600) + for i := 0; i < len(uniqueStrings); i++ { + id := randStringRunes() + id = fmt.Sprintf("%s-%d", id, i) + uniqueStrings[i] = id + pod := fmt.Sprintf("pod-%d", i%10) + line := fmt.Sprintf(lineTpl, id) + err := cliDistributor.PushLogLine(line, now.Add(-1*time.Hour).Add(time.Duration(i-len(uniqueStrings))*time.Second), nil, map[string]string{"pod": pod}) + require.NoError(t, err) + } + + // restart ingester to flush chunks and that there are zero chunks in memory + require.NoError(t, cliIngester.Flush()) + require.NoError(t, tIngester.Restart()) + + // wait for compactor to compact index and for bloom compactor to build bloom filters + time.Sleep(10 * time.Second) + + // use bloom gateway to perform needle in the haystack queries + randIdx := rand.Intn(len(uniqueStrings)) + q := fmt.Sprintf(`{job="varlog"} |= "%s"`, uniqueStrings[randIdx]) + end := now.Add(-1 * time.Second) + start := end.Add(-24 * time.Hour) + resp, err := cliQueryFrontend.RunRangeQueryWithStartEnd(context.Background(), q, start, end) + require.NoError(t, err) + + // verify response + require.Len(t, resp.Data.Stream, 1) + expectedLine := fmt.Sprintf(lineTpl, uniqueStrings[randIdx]) + require.Equal(t, expectedLine, resp.Data.Stream[0].Values[0][1]) + + // TODO(chaudum): + // verify that bloom blocks have actually been used for querying + // atm, we can only verify by logs, so we should add appropriate metrics for + // uploaded/downloaded blocks and metas +} + func getValueFromMF(mf *dto.MetricFamily, lbs []*dto.LabelPair) float64 { for _, m := range mf.Metric { if !assert.ObjectsAreEqualValues(lbs, m.GetLabel()) { diff --git a/pkg/bloomcompactor/bloomcompactor.go b/pkg/bloomcompactor/bloomcompactor.go index 7f999c0ebfad6..a5f1185f57e84 100644 --- a/pkg/bloomcompactor/bloomcompactor.go +++ b/pkg/bloomcompactor/bloomcompactor.go @@ -51,6 +51,7 @@ import ( "github.com/grafana/loki/pkg/storage" v1 "github.com/grafana/loki/pkg/storage/bloom/v1" chunk_client "github.com/grafana/loki/pkg/storage/chunk/client" + "github.com/grafana/loki/pkg/storage/chunk/client/local" "github.com/grafana/loki/pkg/storage/config" "github.com/grafana/loki/pkg/storage/stores/shipper/bloomshipper" 
"github.com/grafana/loki/pkg/storage/stores/shipper/indexshipper" @@ -166,10 +167,18 @@ func New( return nil, errors.Wrap(err, "create index shipper") } + // The ObjectClient does not expose the key encoder it uses, + // so check the concrete type and set the FSEncoder if needed. + var keyEncoder chunk_client.KeyEncoder + switch objectClient.(type) { + case *local.FSObjectClient: + keyEncoder = chunk_client.FSEncoder + } + c.storeClients[periodicConfig.From] = storeClient{ object: objectClient, index: index_storage.NewIndexStorageClient(objectClient, periodicConfig.IndexTables.PathPrefix), - chunk: chunk_client.NewClient(objectClient, nil, schemaConfig), + chunk: chunk_client.NewClient(objectClient, keyEncoder, schemaConfig), indexShipper: indexShipper, } } @@ -275,7 +284,7 @@ func (c *Compactor) compactTable(ctx context.Context, logger log.Logger, tableNa return fmt.Errorf("index store client not found for period starting at %s", schemaCfg.From.String()) } - _, tenants, err := sc.index.ListFiles(ctx, tableName, false) + _, tenants, err := sc.index.ListFiles(ctx, tableName, true) if err != nil { return fmt.Errorf("failed to list files for table %s: %w", tableName, err) } diff --git a/pkg/bloomgateway/bloomgateway.go b/pkg/bloomgateway/bloomgateway.go index 403378e016a9f..b0c3251a0843d 100644 --- a/pkg/bloomgateway/bloomgateway.go +++ b/pkg/bloomgateway/bloomgateway.go @@ -180,9 +180,8 @@ func New(cfg Config, schemaCfg config.SchemaConfig, storageCfg storage.Config, o sharding: shardingStrategy, pendingTasks: makePendingTasks(pendingTasksInitialCap), workerConfig: workerConfig{ - maxWaitTime: 200 * time.Millisecond, - maxItems: 100, - processBlocksSequentially: false, + maxWaitTime: 200 * time.Millisecond, + maxItems: 100, }, workerMetrics: newWorkerMetrics(reg, constants.Loki, metricsSubsystem), queueMetrics: queue.NewMetrics(reg, constants.Loki, metricsSubsystem), @@ -323,7 +322,7 @@ func (g *Gateway) FilterChunkRefs(ctx context.Context, req *logproto.FilterChunk case res := <-resCh: responses = append(responses, res) // log line is helpful for debugging tests - // level.Debug(g.logger).Log("msg", "got partial result", "task", task.ID, "tenant", tenantID, "fp", uint64(res.Fp), "chunks", res.Removals.Len(), "progress", fmt.Sprintf("%d/%d", len(responses), requestCount)) + level.Debug(g.logger).Log("msg", "got partial result", "task", task.ID, "tenant", tenantID, "fp_int", uint64(res.Fp), "fp_hex", res.Fp, "chunks_to_remove", res.Removals.Len(), "progress", fmt.Sprintf("%d/%d", len(responses), requestCount)) // wait for all parts of the full response if len(responses) == requestCount { for _, o := range responses { diff --git a/pkg/bloomgateway/bloomgateway_test.go b/pkg/bloomgateway/bloomgateway_test.go index b34e3d55852a5..183a2aad2190e 100644 --- a/pkg/bloomgateway/bloomgateway_test.go +++ b/pkg/bloomgateway/bloomgateway_test.go @@ -269,89 +269,74 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) { }) t.Run("use fuse queriers to filter chunks", func(t *testing.T) { - for _, tc := range []struct { - name string - value bool - }{ - {"sequentially", true}, - {"callback", false}, - } { - t.Run(tc.name, func(t *testing.T) { - - reg := prometheus.NewRegistry() - gw, err := New(cfg, schemaCfg, storageCfg, limits, ss, cm, logger, reg) - require.NoError(t, err) - - now := mktime("2023-10-03 10:00") - - // replace store implementation and re-initialize workers and sub-services - bqs, data := createBlockQueriers(t, 5, now.Add(-8*time.Hour), now, 0, 1024) - gw.bloomStore = 
newMockBloomStore(bqs) - gw.workerConfig.processBlocksSequentially = tc.value - err = gw.initServices() - require.NoError(t, err) - - t.Log("process blocks in worker sequentially", gw.workerConfig.processBlocksSequentially) - - err = services.StartAndAwaitRunning(context.Background(), gw) - require.NoError(t, err) - t.Cleanup(func() { - err = services.StopAndAwaitTerminated(context.Background(), gw) - require.NoError(t, err) - }) + reg := prometheus.NewRegistry() + gw, err := New(cfg, schemaCfg, storageCfg, limits, ss, cm, logger, reg) + require.NoError(t, err) - chunkRefs := createQueryInputFromBlockData(t, tenantID, data, 100) - - t.Run("no match - return empty response", func(t *testing.T) { - inputChunkRefs := groupRefs(t, chunkRefs) - req := &logproto.FilterChunkRefRequest{ - From: now.Add(-8 * time.Hour), - Through: now, - Refs: inputChunkRefs, - Filters: []syntax.LineFilter{ - {Ty: labels.MatchEqual, Match: "does not match"}, - }, - } - ctx := user.InjectOrgID(context.Background(), tenantID) - res, err := gw.FilterChunkRefs(ctx, req) - require.NoError(t, err) - - expectedResponse := &logproto.FilterChunkRefResponse{ - ChunkRefs: []*logproto.GroupedChunkRefs{}, - } - require.Equal(t, expectedResponse, res) - }) + now := mktime("2023-10-03 10:00") - t.Run("match - return filtered", func(t *testing.T) { - inputChunkRefs := groupRefs(t, chunkRefs) - // hack to get indexed key for a specific series - // the indexed key range for a series is defined as - // i * keysPerSeries ... i * keysPerSeries + keysPerSeries - 1 - // where i is the nth series in a block - // fortunately, i is also used as Checksum for the single chunk of a series - // see mkBasicSeriesWithBlooms() in pkg/storage/bloom/v1/test_util.go - key := inputChunkRefs[0].Refs[0].Checksum*1000 + 500 - - req := &logproto.FilterChunkRefRequest{ - From: now.Add(-8 * time.Hour), - Through: now, - Refs: inputChunkRefs, - Filters: []syntax.LineFilter{ - {Ty: labels.MatchEqual, Match: fmt.Sprintf("series %d", key)}, - }, - } - ctx := user.InjectOrgID(context.Background(), tenantID) - res, err := gw.FilterChunkRefs(ctx, req) - require.NoError(t, err) - - expectedResponse := &logproto.FilterChunkRefResponse{ - ChunkRefs: inputChunkRefs[:1], - } - require.Equal(t, expectedResponse, res) - }) + // replace store implementation and re-initialize workers and sub-services + bqs, data := createBlockQueriers(t, 5, now.Add(-8*time.Hour), now, 0, 1024) + gw.bloomStore = newMockBloomStore(bqs) + err = gw.initServices() + require.NoError(t, err) - }) - } + err = services.StartAndAwaitRunning(context.Background(), gw) + require.NoError(t, err) + t.Cleanup(func() { + err = services.StopAndAwaitTerminated(context.Background(), gw) + require.NoError(t, err) + }) + + chunkRefs := createQueryInputFromBlockData(t, tenantID, data, 100) + + t.Run("no match - return empty response", func(t *testing.T) { + inputChunkRefs := groupRefs(t, chunkRefs) + req := &logproto.FilterChunkRefRequest{ + From: now.Add(-8 * time.Hour), + Through: now, + Refs: inputChunkRefs, + Filters: []syntax.LineFilter{ + {Ty: labels.MatchEqual, Match: "does not match"}, + }, + } + ctx := user.InjectOrgID(context.Background(), tenantID) + res, err := gw.FilterChunkRefs(ctx, req) + require.NoError(t, err) + + expectedResponse := &logproto.FilterChunkRefResponse{ + ChunkRefs: []*logproto.GroupedChunkRefs{}, + } + require.Equal(t, expectedResponse, res) + }) + + t.Run("match - return filtered", func(t *testing.T) { + inputChunkRefs := groupRefs(t, chunkRefs) + // hack to get indexed key for a 
specific series + // the indexed key range for a series is defined as + // i * keysPerSeries ... i * keysPerSeries + keysPerSeries - 1 + // where i is the nth series in a block + // fortunately, i is also used as Checksum for the single chunk of a series + // see mkBasicSeriesWithBlooms() in pkg/storage/bloom/v1/test_util.go + key := inputChunkRefs[0].Refs[0].Checksum*1000 + 500 + + req := &logproto.FilterChunkRefRequest{ + From: now.Add(-8 * time.Hour), + Through: now, + Refs: inputChunkRefs, + Filters: []syntax.LineFilter{ + {Ty: labels.MatchEqual, Match: fmt.Sprintf("series %d", key)}, + }, + } + ctx := user.InjectOrgID(context.Background(), tenantID) + res, err := gw.FilterChunkRefs(ctx, req) + require.NoError(t, err) + + expectedResponse := &logproto.FilterChunkRefResponse{ + ChunkRefs: inputChunkRefs[:1], + } + require.Equal(t, expectedResponse, res) + }) }) } diff --git a/pkg/bloomgateway/worker.go b/pkg/bloomgateway/worker.go index e82a0daea63c1..a8f9c56d50bab 100644 --- a/pkg/bloomgateway/worker.go +++ b/pkg/bloomgateway/worker.go @@ -20,8 +20,6 @@ import ( type workerConfig struct { maxWaitTime time.Duration maxItems int - - processBlocksSequentially bool } type workerMetrics struct { @@ -188,11 +186,7 @@ func (w *worker) running(ctx context.Context) error { blockRefs = append(blockRefs, b.blockRef) } - if w.cfg.processBlocksSequentially { - err = w.processBlocksSequentially(taskCtx, tasks[0].Tenant, day, blockRefs, boundedRefs) - } else { - err = w.processBlocksWithCallback(taskCtx, tasks[0].Tenant, day, blockRefs, boundedRefs) - } + err = w.processBlocksWithCallback(taskCtx, tasks[0].Tenant, day, blockRefs, boundedRefs) if err != nil { for _, t := range tasks { t.ErrCh <- err @@ -227,20 +221,6 @@ func (w *worker) processBlocksWithCallback(taskCtx context.Context, tenant strin }) } -func (w *worker) processBlocksSequentially(taskCtx context.Context, tenant string, day time.Time, blockRefs []bloomshipper.BlockRef, boundedRefs []boundedTasks) error { - storeFetchStart := time.Now() - blockQueriers, err := w.store.GetBlockQueriersForBlockRefs(taskCtx, tenant, blockRefs) - w.metrics.storeAccessLatency.WithLabelValues(w.id, "GetBlockQueriersForBlockRefs").Observe(time.Since(storeFetchStart).Seconds()) - if err != nil { - return err - } - - for i := range blockQueriers { - processBlock(blockQueriers[i].BlockQuerier, day, boundedRefs[i].tasks) - } - return nil -} - func processBlock(blockQuerier *v1.BlockQuerier, day time.Time, tasks []Task) { schema, err := blockQuerier.Schema() if err != nil { diff --git a/pkg/storage/stores/shipper/bloomshipper/client.go b/pkg/storage/stores/shipper/bloomshipper/client.go index 7ab99ea7e3e6e..b189cba390b82 100644 --- a/pkg/storage/stores/shipper/bloomshipper/client.go +++ b/pkg/storage/stores/shipper/bloomshipper/client.go @@ -144,7 +144,7 @@ func (b *BloomClient) GetMetas(ctx context.Context, params MetaSearchParams) ([] return nil, err } if metaRef.MaxFingerprint < uint64(params.MinFingerprint) || uint64(params.MaxFingerprint) < metaRef.MinFingerprint || - metaRef.StartTimestamp.Before(params.StartTimestamp) || metaRef.EndTimestamp.After(params.EndTimestamp) { + metaRef.EndTimestamp.Before(params.StartTimestamp) || metaRef.StartTimestamp.After(params.EndTimestamp) { continue } meta, err := b.downloadMeta(ctx, metaRef, periodClient) diff --git a/pkg/storage/stores/shipper/bloomshipper/shipper.go b/pkg/storage/stores/shipper/bloomshipper/shipper.go index d7038fc13761c..d9d96fcc7783c 100644 --- a/pkg/storage/stores/shipper/bloomshipper/shipper.go +++ 
b/pkg/storage/stores/shipper/bloomshipper/shipper.go @@ -1,7 +1,6 @@ package bloomshipper import ( - "cmp" "context" "fmt" "math" @@ -15,6 +14,16 @@ import ( "github.com/grafana/loki/pkg/storage/stores/shipper/bloomshipper/config" ) +type fpRange [2]uint64 + +func (r fpRange) minFp() uint64 { + return r[0] +} + +func (r fpRange) maxFp() uint64 { + return r[1] +} + type Shipper struct { client Client config config.Config @@ -43,7 +52,7 @@ func NewShipper(client Client, config config.Config, limits Limits, logger log.L func (s *Shipper) GetBlockRefs(ctx context.Context, tenantID string, from, through model.Time) ([]BlockRef, error) { level.Debug(s.logger).Log("msg", "GetBlockRefs", "tenant", tenantID, "from", from, "through", through) - blockRefs, err := s.getActiveBlockRefs(ctx, tenantID, from, through, []uint64{0, math.MaxUint64}) + blockRefs, err := s.getActiveBlockRefs(ctx, tenantID, from, through, []fpRange{{0, math.MaxUint64}}) if err != nil { return nil, fmt.Errorf("error fetching active block references : %w", err) } @@ -55,30 +64,36 @@ func (s *Shipper) Fetch(ctx context.Context, tenantID string, blocks []BlockRef, defer cancelFunc() blocksChannel, errorsChannel := s.blockDownloader.downloadBlocks(cancelContext, tenantID, blocks) + // track how many blocks are still remaning to be downloaded + remaining := len(blocks) + for { select { case <-ctx.Done(): return fmt.Errorf("failed to fetch blocks: %w", ctx.Err()) - case result, ok := <-blocksChannel: - if !ok { + case result, sentBeforeClosed := <-blocksChannel: + if !sentBeforeClosed { return nil } err := runCallback(callback, result) if err != nil { return err } - case err := <-errorsChannel: - if err != nil { - return fmt.Errorf("error downloading blocks : %w", err) + remaining-- + if remaining == 0 { + return nil } + case err := <-errorsChannel: + return fmt.Errorf("error downloading blocks : %w", err) } } } func runCallback(callback ForEachBlockCallback, block blockWithQuerier) error { - defer func(result blockWithQuerier) { - _ = result.Close() + defer func(b blockWithQuerier) { + _ = b.Close() }(block) + err := callback(block.closableBlockQuerier.BlockQuerier, block.MinFingerprint, block.MaxFingerprint) if err != nil { return fmt.Errorf("error running callback function for block %s err: %w", block.BlockPath, err) @@ -86,17 +101,6 @@ func runCallback(callback ForEachBlockCallback, block blockWithQuerier) error { return nil } -func (s *Shipper) ForEachBlock(ctx context.Context, tenantID string, from, through model.Time, fingerprints []uint64, callback ForEachBlockCallback) error { - level.Debug(s.logger).Log("msg", "ForEachBlock", "tenant", tenantID, "from", from, "through", through, "fingerprints", len(fingerprints)) - - blockRefs, err := s.getActiveBlockRefs(ctx, tenantID, from, through, fingerprints) - if err != nil { - return fmt.Errorf("error fetching active block references : %w", err) - } - - return s.Fetch(ctx, tenantID, blockRefs, callback) -} - func (s *Shipper) Stop() { s.client.Stop() s.blockDownloader.stop() @@ -112,18 +116,19 @@ func getFirstLast[T any](s []T) (T, T) { return s[0], s[len(s)-1] } -func (s *Shipper) getActiveBlockRefs(ctx context.Context, tenantID string, from, through model.Time, fingerprints []uint64) ([]BlockRef, error) { - minFingerprint, maxFingerprint := getFirstLast(fingerprints) +func (s *Shipper) getActiveBlockRefs(ctx context.Context, tenantID string, from, through model.Time, fingerprints []fpRange) ([]BlockRef, error) { + minFpRange, maxFpRange := getFirstLast(fingerprints) metas, err := 
s.client.GetMetas(ctx, MetaSearchParams{ TenantID: tenantID, - MinFingerprint: model.Fingerprint(minFingerprint), - MaxFingerprint: model.Fingerprint(maxFingerprint), + MinFingerprint: model.Fingerprint(minFpRange.minFp()), + MaxFingerprint: model.Fingerprint(maxFpRange.maxFp()), StartTimestamp: from, EndTimestamp: through, }) if err != nil { return []BlockRef{}, fmt.Errorf("error fetching meta.json files: %w", err) } + level.Debug(s.logger).Log("msg", "dowloaded metas", "count", len(metas)) activeBlocks := s.findBlocks(metas, from, through, fingerprints) slices.SortStableFunc(activeBlocks, func(a, b BlockRef) int { if a.MinFingerprint < b.MinFingerprint { @@ -138,7 +143,7 @@ func (s *Shipper) getActiveBlockRefs(ctx context.Context, tenantID string, from, return activeBlocks, nil } -func (s *Shipper) findBlocks(metas []Meta, startTimestamp, endTimestamp model.Time, fingerprints []uint64) []BlockRef { +func (s *Shipper) findBlocks(metas []Meta, startTimestamp, endTimestamp model.Time, fingerprints []fpRange) []BlockRef { outdatedBlocks := make(map[string]interface{}) for _, meta := range metas { for _, tombstone := range meta.Tombstones { @@ -164,39 +169,29 @@ func (s *Shipper) findBlocks(metas []Meta, startTimestamp, endTimestamp model.Ti return blockRefs } -// getPosition returns the smallest index of element v in slice s where v > s[i] -// TODO(chaudum): Use binary search to find index instead of iteration. -func getPosition[S ~[]E, E cmp.Ordered](s S, v E) int { - for i := range s { - if v > s[i] { - continue - } - return i - } - return len(s) -} - -func isOutsideRange(b *BlockRef, startTimestamp, endTimestamp model.Time, fingerprints []uint64) bool { +// isOutsideRange tests if a given BlockRef b is outside of search boundaries +// defined by min/max timestamp and min/max fingerprint. +// Fingerprint ranges must be sorted in ascending order. +func isOutsideRange(b *BlockRef, startTimestamp, endTimestamp model.Time, fingerprints []fpRange) bool { // First, check time range if b.EndTimestamp < startTimestamp || b.StartTimestamp > endTimestamp { return true } // Then, check if outside of min/max of fingerprint slice - minFp, maxFp := getFirstLast(fingerprints) - if b.MaxFingerprint < minFp || b.MinFingerprint > maxFp { + minFpRange, maxFpRange := getFirstLast(fingerprints) + if b.MaxFingerprint < minFpRange.minFp() || b.MinFingerprint > maxFpRange.maxFp() { return true } - // Check if the block range is inside a "gap" in the fingerprint slice - // e.g. 
- // fingerprints = [1, 2, 6, 7, 8] - // block = [3, 4, 5] - idx := getPosition[[]uint64](fingerprints, b.MinFingerprint) - // in case b.MinFingerprint is outside of the fingerprints range, return true - // this is already covered in the range check above, but I keep it as a second gate - if idx > len(fingerprints)-1 { - return true + prev := fpRange{0, 0} + for i := 0; i < len(fingerprints); i++ { + fpr := fingerprints[i] + if b.MinFingerprint > prev.maxFp() && b.MaxFingerprint < fpr.minFp() { + return true + } + prev = fpr } - return b.MaxFingerprint < fingerprints[idx] + + return false } diff --git a/pkg/storage/stores/shipper/bloomshipper/shipper_test.go b/pkg/storage/stores/shipper/bloomshipper/shipper_test.go index 83c9379cd44c6..859aa38c82a61 100644 --- a/pkg/storage/stores/shipper/bloomshipper/shipper_test.go +++ b/pkg/storage/stores/shipper/bloomshipper/shipper_test.go @@ -4,6 +4,7 @@ import ( "fmt" "math" "testing" + "time" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" @@ -40,7 +41,7 @@ func Test_Shipper_findBlocks(t *testing.T) { } shipper := &Shipper{} - blocks := shipper.findBlocks(metas, 300, 400, []uint64{100, 200}) + blocks := shipper.findBlocks(metas, model.Now().Add(-2*time.Hour), model.Now().Add(-1*time.Hour), []fpRange{{100, 200}}) expectedBlockRefs := []BlockRef{ createMatchingBlockRef("block2"), @@ -53,8 +54,8 @@ func Test_Shipper_findBlocks(t *testing.T) { tests := map[string]struct { minFingerprint uint64 maxFingerprint uint64 - startTimestamp int64 - endTimestamp int64 + startTimestamp model.Time + endTimestamp model.Time filtered bool }{ "expected block not to be filtered out if minFingerprint and startTimestamp are within range": { @@ -94,7 +95,7 @@ func Test_Shipper_findBlocks(t *testing.T) { t.Run(name, func(t *testing.T) { shipper := &Shipper{} ref := createBlockRef("fake-block", data.minFingerprint, data.maxFingerprint, data.startTimestamp, data.endTimestamp) - blocks := shipper.findBlocks([]Meta{{Blocks: []BlockRef{ref}}}, 300, 400, []uint64{100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200}) + blocks := shipper.findBlocks([]Meta{{Blocks: []BlockRef{ref}}}, 300, 400, []fpRange{{100, 200}}) if data.filtered { require.Empty(t, blocks) return @@ -105,94 +106,83 @@ func Test_Shipper_findBlocks(t *testing.T) { } } -func TestGetPosition(t *testing.T) { - for i, tc := range []struct { - s []int - v int - exp int - }{ - {s: []int{}, v: 1, exp: 0}, - {s: []int{1, 2, 3}, v: 0, exp: 0}, - {s: []int{1, 2, 3}, v: 2, exp: 1}, - {s: []int{1, 2, 3}, v: 4, exp: 3}, - {s: []int{1, 2, 4, 5}, v: 3, exp: 2}, - } { - tc := tc - name := fmt.Sprintf("case-%d", i) - t.Run(name, func(t *testing.T) { - got := getPosition[[]int](tc.s, tc.v) - require.Equal(t, tc.exp, got) - }) - } -} - func TestIsOutsideRange(t *testing.T) { + startTs := model.Time(1000) + endTs := model.Time(2000) + t.Run("is outside if startTs > through", func(t *testing.T) { - b := createBlockRef("block", 0, math.MaxUint64, 100, 200) - isOutside := isOutsideRange(&b, 0, 90, []uint64{}) + b := createBlockRef("block", 0, math.MaxUint64, startTs, endTs) + isOutside := isOutsideRange(&b, 0, 900, []fpRange{}) require.True(t, isOutside) }) t.Run("is outside if endTs < from", func(t *testing.T) { - b := createBlockRef("block", 0, math.MaxUint64, 100, 200) - isOutside := isOutsideRange(&b, 210, 300, []uint64{}) + b := createBlockRef("block", 0, math.MaxUint64, startTs, endTs) + isOutside := isOutsideRange(&b, 2100, 3000, []fpRange{}) require.True(t, isOutside) }) t.Run("is outside if endFp < 
first fingerprint", func(t *testing.T) { - b := createBlockRef("block", 0, 90, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{100, 200}) + b := createBlockRef("block", 0, 90, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{100, 199}}) require.True(t, isOutside) }) t.Run("is outside if startFp > last fingerprint", func(t *testing.T) { - b := createBlockRef("block", 210, math.MaxUint64, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{100, 200}) + b := createBlockRef("block", 200, math.MaxUint64, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 49}, {100, 149}}) require.True(t, isOutside) }) t.Run("is outside if within gaps in fingerprints", func(t *testing.T) { - b := createBlockRef("block", 100, 200, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{0, 99, 201, 300}) + b := createBlockRef("block", 100, 199, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 99}, {200, 299}}) require.True(t, isOutside) }) t.Run("is not outside if within fingerprints 1", func(t *testing.T) { - b := createBlockRef("block", 100, 200, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{0, 100, 200, 300}) + b := createBlockRef("block", 10, 90, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 99}, {200, 299}}) require.False(t, isOutside) }) t.Run("is not outside if within fingerprints 2", func(t *testing.T) { - b := createBlockRef("block", 100, 150, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{0, 100, 200, 300}) + b := createBlockRef("block", 210, 290, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 99}, {200, 299}}) + require.False(t, isOutside) + }) + + t.Run("is not outside if spans across multiple fingerprint ranges", func(t *testing.T) { + b := createBlockRef("block", 50, 250, startTs, endTs) + isOutside := isOutsideRange(&b, startTs, endTs, []fpRange{{0, 99}, {200, 299}}) require.False(t, isOutside) }) - t.Run("is not outside if within fingerprints 3", func(t *testing.T) { - b := createBlockRef("block", 150, 200, 100, 200) - isOutside := isOutsideRange(&b, 100, 200, []uint64{0, 100, 200, 300}) + t.Run("is not outside if fingerprint range and time range are larger than block", func(t *testing.T) { + b := createBlockRef("block", math.MaxUint64/3, math.MaxUint64/3*2, startTs, endTs) + isOutside := isOutsideRange(&b, 0, 3000, []fpRange{{0, math.MaxUint64}}) require.False(t, isOutside) }) } func createMatchingBlockRef(blockPath string) BlockRef { - return createBlockRef(blockPath, 0, uint64(math.MaxUint64), 0, math.MaxInt) + return createBlockRef(blockPath, 0, math.MaxUint64, model.Time(0), model.Now()) } func createBlockRef( blockPath string, minFingerprint, maxFingerprint uint64, - startTimestamp, endTimestamp int64, + startTimestamp, endTimestamp model.Time, ) BlockRef { + day := startTimestamp.Unix() / int64(24*time.Hour/time.Second) return BlockRef{ Ref: Ref{ TenantID: "fake", - TableName: "16600", + TableName: fmt.Sprintf("%d", day), MinFingerprint: minFingerprint, MaxFingerprint: maxFingerprint, - StartTimestamp: model.Time(startTimestamp), - EndTimestamp: model.Time(endTimestamp), + StartTimestamp: startTimestamp, + EndTimestamp: endTimestamp, Checksum: 0, }, // block path is unique, and it's used to distinguish the blocks so the rest of the fields might be skipped in this test diff --git a/pkg/storage/stores/shipper/bloomshipper/store.go 
b/pkg/storage/stores/shipper/bloomshipper/store.go index 06e1d7a4675bf..40c23658e9a1a 100644 --- a/pkg/storage/stores/shipper/bloomshipper/store.go +++ b/pkg/storage/stores/shipper/bloomshipper/store.go @@ -2,7 +2,6 @@ package bloomshipper import ( "context" - "sort" "time" "github.com/prometheus/common/model" @@ -14,7 +13,6 @@ type ForEachBlockCallback func(bq *v1.BlockQuerier, minFp, maxFp uint64) error type ReadShipper interface { GetBlockRefs(ctx context.Context, tenant string, from, through model.Time) ([]BlockRef, error) - ForEachBlock(ctx context.Context, tenant string, from, through model.Time, fingerprints []uint64, callback ForEachBlockCallback) error Fetch(ctx context.Context, tenant string, blocks []BlockRef, callback ForEachBlockCallback) error } @@ -30,8 +28,6 @@ type BlockQuerierWithFingerprintRange struct { type Store interface { GetBlockRefs(ctx context.Context, tenant string, from, through time.Time) ([]BlockRef, error) - GetBlockQueriers(ctx context.Context, tenant string, from, through time.Time, fingerprints []uint64) ([]BlockQuerierWithFingerprintRange, error) - GetBlockQueriersForBlockRefs(ctx context.Context, tenant string, blocks []BlockRef) ([]BlockQuerierWithFingerprintRange, error) ForEach(ctx context.Context, tenant string, blocks []BlockRef, callback ForEachBlockCallback) error Stop() } @@ -60,40 +56,6 @@ func (bs *BloomStore) ForEach(ctx context.Context, tenant string, blocks []Block return bs.shipper.Fetch(ctx, tenant, blocks, callback) } -// GetQueriersForBlocks implements Store -func (bs *BloomStore) GetBlockQueriersForBlockRefs(ctx context.Context, tenant string, blocks []BlockRef) ([]BlockQuerierWithFingerprintRange, error) { - bqs := make([]BlockQuerierWithFingerprintRange, 0, 32) - err := bs.shipper.Fetch(ctx, tenant, blocks, func(bq *v1.BlockQuerier, minFp uint64, maxFp uint64) error { - bqs = append(bqs, BlockQuerierWithFingerprintRange{ - BlockQuerier: bq, - MinFp: model.Fingerprint(minFp), - MaxFp: model.Fingerprint(maxFp), - }) - return nil - }) - sort.Slice(bqs, func(i, j int) bool { - return bqs[i].MinFp < bqs[j].MinFp - }) - return bqs, err -} - -// BlockQueriers implements Store -func (bs *BloomStore) GetBlockQueriers(ctx context.Context, tenant string, from, through time.Time, fingerprints []uint64) ([]BlockQuerierWithFingerprintRange, error) { - bqs := make([]BlockQuerierWithFingerprintRange, 0, 32) - err := bs.shipper.ForEachBlock(ctx, tenant, toModelTime(from), toModelTime(through), fingerprints, func(bq *v1.BlockQuerier, minFp uint64, maxFp uint64) error { - bqs = append(bqs, BlockQuerierWithFingerprintRange{ - BlockQuerier: bq, - MinFp: model.Fingerprint(minFp), - MaxFp: model.Fingerprint(maxFp), - }) - return nil - }) - sort.Slice(bqs, func(i, j int) bool { - return bqs[i].MinFp < bqs[j].MinFp - }) - return bqs, err -} - func toModelTime(t time.Time) model.Time { return model.TimeFromUnixNano(t.UnixNano()) } diff --git a/pkg/storage/stores/shipper/indexshipper/indexgateway/gateway.go b/pkg/storage/stores/shipper/indexshipper/indexgateway/gateway.go index 1040bd6c1b565..8b0f186386bdf 100644 --- a/pkg/storage/stores/shipper/indexshipper/indexgateway/gateway.go +++ b/pkg/storage/stores/shipper/indexshipper/indexgateway/gateway.go @@ -204,7 +204,7 @@ func (g *Gateway) GetChunkRef(ctx context.Context, req *logproto.GetChunkRefRequ return nil, err } - predicate := chunk.NewPredicate(matchers, *(&req.Filters)) + predicate := chunk.NewPredicate(matchers, req.Filters) chunks, _, err := g.indexQuerier.GetChunks(ctx, instanceID, req.From, 
req.Through, predicate) if err != nil { return nil, err @@ -219,8 +219,11 @@ func (g *Gateway) GetChunkRef(ctx context.Context, req *logproto.GetChunkRefRequ } } + initialChunkCount := len(result.Refs) + // Return unfiltered results if there is no bloom querier (Bloom Gateway disabled) or if there are not filters. if g.bloomQuerier == nil || len(req.Filters) == 0 { + level.Info(g.log).Log("msg", "chunk filtering is not enabled or there is no line filter", "filters", len(req.Filters)) return result, nil } @@ -234,6 +237,7 @@ func (g *Gateway) GetChunkRef(ctx context.Context, req *logproto.GetChunkRefRequ } result.Refs = chunkRefs + level.Info(g.log).Log("msg", "return filtered chunk refs", "unfiltered", initialChunkCount, "filtered", len(result.Refs)) return result, nil } From 5517eaa1b8e125013d55b89caaeb17e772abd850 Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Fri, 12 Jan 2024 09:28:10 +0100 Subject: [PATCH 14/21] feat: Add tracing integration to profiling. (#11633) Same as https://github.com/grafana/tempo/pull/3276 this adds profiling integration to tracing instrumentation allowing to get profile for a single request removing the noise of everything else. --- CHANGELOG.md | 3 +- cmd/loki/main.go | 6 +- pkg/tracing/config.go | 4 +- .../grafana/dskit/spanprofiler/README.md | 104 +++++++++++++++++ .../dskit/spanprofiler/spanprofiler.go | 107 +++++++++++++++++ .../grafana/dskit/spanprofiler/tracer.go | 109 ++++++++++++++++++ vendor/modules.txt | 1 + 7 files changed, 331 insertions(+), 3 deletions(-) create mode 100644 vendor/github.com/grafana/dskit/spanprofiler/README.md create mode 100644 vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go create mode 100644 vendor/github.com/grafana/dskit/spanprofiler/tracer.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 46e9a24daf1ef..0e723e64176c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,8 @@ ##### Enhancements -* [11571](https://github.com/grafana/loki/pull/11571) **MichelHollands**: Add a metrics.go log line for requests from querier to ingester +* [11633](https://github.com/grafana/loki/pull/11633) **cyriltovena**: Add profiling integrations to tracing instrumentation. +* [11571](https://github.com/grafana/loki/pull/11571) **MichelHollands**: Add a metrics.go log line for requests from querier to ingester * [11477](https://github.com/grafana/loki/pull/11477) **MichelHollands**: support GET for /ingester/shutdown * [11363](https://github.com/grafana/loki/pull/11363) **kavirajk**: bugfix(memcached): Make memcached batch fetch truely context aware. * [11319](https://github.com/grafana/loki/pull/11319) **someStrangerFromTheAbyss**: Helm: Add extraContainers to the write pods. diff --git a/cmd/loki/main.go b/cmd/loki/main.go index 845104eee8de5..937a5c16fab80 100644 --- a/cmd/loki/main.go +++ b/cmd/loki/main.go @@ -10,7 +10,9 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/log" + "github.com/grafana/dskit/spanprofiler" "github.com/grafana/dskit/tracing" + "github.com/opentracing/opentracing-go" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/version" @@ -84,7 +86,9 @@ func main() { if err != nil { level.Error(util_log.Logger).Log("msg", "error in initializing tracing. 
tracing will not be enabled", "err", err) } - + if config.Tracing.ProfilingEnabled { + opentracing.SetGlobalTracer(spanprofiler.NewTracer(opentracing.GlobalTracer())) + } defer func() { if trace != nil { if err := trace.Close(); err != nil { diff --git a/pkg/tracing/config.go b/pkg/tracing/config.go index 1c97d88a845df..f9faefa6a7303 100644 --- a/pkg/tracing/config.go +++ b/pkg/tracing/config.go @@ -5,7 +5,8 @@ import ( ) type Config struct { - Enabled bool `yaml:"enabled"` + Enabled bool `yaml:"enabled"` + ProfilingEnabled bool `yaml:"profiling_enabled" category:"experimental" doc:"hidden"` } func (cfg *Config) RegisterFlags(f *flag.FlagSet) { @@ -14,4 +15,5 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { f.BoolVar(&cfg.Enabled, prefix+"tracing.enabled", true, "Set to false to disable tracing.") + f.BoolVar(&cfg.ProfilingEnabled, prefix+"tracing.profiling-enabled", true, "Set to true to enable profiling integration.") } diff --git a/vendor/github.com/grafana/dskit/spanprofiler/README.md b/vendor/github.com/grafana/dskit/spanprofiler/README.md new file mode 100644 index 0000000000000..a415985f6649e --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/README.md @@ -0,0 +1,104 @@ +# Span Profiler for OpenTracing-Go + +## Overview + +The Span Profiler for OpenTracing-Go is a package that seamlessly integrates `opentracing-go` instrumentation with +profiling through the use of pprof labels. + +Accessing trace span profiles is made convenient through the Grafana Explore view. You can find a complete example setup +with Grafana Tempo in the [Pyroscope repository](https://github.com/grafana/pyroscope/tree/main/examples/tracing/tempo): + +![image](https://github.com/grafana/otel-profiling-go/assets/12090599/31e33cd1-818b-4116-b952-c9ec7b1fb593) + +## Usage + +There are two primary ways to use the Span Profiler: + +### 1. Wrap the Global Tracer. + +You can wrap the global tracer using `spanprofiler.NewTracer`: + +```go +import ( + "github.com/opentracing/opentracing-go" + "github.com/grafana/dskit/spanprofiler" +) + +func main() { + // Initialize your OpenTracing tracer + tracer := opentracing.GlobalTracer() + // Wrap it with the tracer-profiler + wrappedTracer := spanprofiler.NewTracer(tracer) + // Use the wrapped tracer in your application + opentracing.SetGlobalTracer(wrappedTracer) + + // Or, as an oneliner: + // opentracing.SetGlobalTracer(spanprofiler.NewTracer(opentracing.GlobalTracer())) + + // Your application logic here +} +``` + +For efficiency, the tracer selectively records profiles for _root_ spans — the initial _local_ span in a process — since +a trace may encompass thousands of spans. All stack trace samples accumulated during the execution of their child spans +contribute to the root span's profile. In practical terms, this signifies that, for instance, an HTTP request results +in a singular profile, irrespective of the numerous spans within the trace. It's important to note that these profiles +don't extend beyond the boundaries of a single process. + +The limitation of this approach is that only spans created within the same goroutine, or its children, as the parent are +taken into account. Consequently, in scenarios involving asynchronous execution, where the parent span context is passed +to another goroutine, explicit profiling becomes necessary using `spanprofiler.StartSpanFromContext`. + +### 2. Profile individual spans. 
+ +The `spanprofiler.StartSpanFromContext` function allows you to granularly control which spans to profile: + +```go +func YourOperationName(ctx context.Background()) { + // Start a span and enable profiling for it + span, ctx := spanprofiler.StartSpanFromContext(ctx, "YourOperationName", tracer) + defer span.Finish() // Finish the span when done + + // Use the span in your application logic +} +``` + +The function guarantees that the span is to be profiled. + +Both methods can be employed either in conjunction or independently. Our recommendation is to utilize the tracer for +seamless integration, reserving explicit span profiling only for cases where spans are spawned in detached goroutines. + +## Implementation details + +When a new trace span is created, and is eligible for profiling, the tracer sets `span_id` and `span_name` [pprof labels](https://github.com/google/pprof/blob/master/doc/README.md#tag-filtering) +that point to the respective span. These labels are stored in the goroutine's local storage and inherited by any +subsequent child goroutines. + +`span_name` is available as a regular label and can be used in the query expressions. For example, the following query +will show you profile for the code that is not covered with traces: +``` +{service_name="my-service",span_name=""} +``` + +Additionally, trace spans are identified by the `pyroscope.profile.id` attribute, indicating the associated profile. +This allows to find such spans in the trace view (in the screenshot) and fetch profiles for specific spans. + +It's important to note that the presence of this attribute does not guarantee profile availability; stack trace samples +might not be collected if the CPU time utilized falls below the sample interval (10ms). + +It is crucial to understand that this module doesn't directly control the pprof profiler; its initialization is still +necessary for profile collection. This initialization can be achieved through the `runtime/pprof` package, or using the +[Pyroscope client](https://github.com/grafana/pyroscope-go). + +Limitations: + - Only CPU profiling is fully supported at the moment. + - Only [Jaeger tracer](https://github.com/jaegertracing/jaeger-client-go) implementation is supported. + +## Performance implications + +The typical performance impact is generally imperceptible and primarily arises from the cost of pprof labeling. However, +intensive use of pprof labels may have negative impact on the profiled application. + +In the case of the tracer provided by this package, the `StartSpan` method wrapper introduces an approximate 20% increase +in CPU time compared to the original call. In vase majority of cases, the overhead constitutes less than 0.01% of the total +CPU time and is considered safe for deployment in production systems. diff --git a/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go b/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go new file mode 100644 index 0000000000000..8481d04498d5a --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/spanprofiler.go @@ -0,0 +1,107 @@ +package spanprofiler + +import ( + "context" + "runtime/pprof" + + "github.com/opentracing/opentracing-go" + "github.com/uber/jaeger-client-go" +) + +// StartSpanFromContext starts and returns a Span with `operationName`, using +// any Span found within `ctx` as a ChildOfRef. If no such parent could be +// found, StartSpanFromContext creates a root (parentless) Span. 
+// +// The call sets `operationName` as `span_name` pprof label, and the new span +// identifier as `span_id` pprof label, if the trace is sampled. +// +// The second return value is a context.Context object built around the +// returned Span. +// +// Example usage: +// +// SomeFunction(ctx context.Context, ...) { +// sp, ctx := opentracing.StartSpanFromContext(ctx, "SomeFunction") +// defer sp.Finish() +// ... +// } +func StartSpanFromContext(ctx context.Context, operationName string, opts ...opentracing.StartSpanOption) (opentracing.Span, context.Context) { + return StartSpanFromContextWithTracer(ctx, opentracing.GlobalTracer(), operationName, opts...) +} + +// StartSpanFromContextWithTracer starts and returns a span with `operationName` +// using a span found within the context as a ChildOfRef. If that doesn't exist +// it creates a root span. It also returns a context.Context object built +// around the returned span. +// +// The call sets `operationName` as `span_name` pprof label, and the new span +// identifier as `span_id` pprof label, if the trace is sampled. +// +// It's behavior is identical to StartSpanFromContext except that it takes an explicit +// tracer as opposed to using the global tracer. +func StartSpanFromContextWithTracer(ctx context.Context, tracer opentracing.Tracer, operationName string, opts ...opentracing.StartSpanOption) (opentracing.Span, context.Context) { + span, ctx := opentracing.StartSpanFromContextWithTracer(ctx, tracer, operationName, opts...) + spanCtx, ok := span.Context().(jaeger.SpanContext) + if ok { + span = wrapJaegerSpanWithGoroutineLabels(ctx, span, operationName, sampledSpanID(spanCtx)) + } + return span, ctx +} + +func wrapJaegerSpanWithGoroutineLabels( + parentCtx context.Context, + span opentracing.Span, + operationName string, + spanID string, +) *spanWrapper { + // Note that pprof labels are propagated through the goroutine's local + // storage and are always copied to child goroutines. This way, stack + // trace samples collected during execution of child spans will be taken + // into account at the root. + var ctx context.Context + if spanID != "" { + ctx = pprof.WithLabels(parentCtx, pprof.Labels( + spanNameLabelName, operationName, + spanIDLabelName, spanID)) + } else { + // Even if the trace has not been sampled, we still need to keep track + // of samples that belong to the span (all spans with the given name). + ctx = pprof.WithLabels(parentCtx, pprof.Labels( + spanNameLabelName, operationName)) + } + // Goroutine labels should be set as early as possible, + // in order to capture the overhead of the function call. + pprof.SetGoroutineLabels(ctx) + // We create a span wrapper to ensure we remove the newly attached pprof + // labels when span finishes. The need of this wrapper is questioned: + // as we do not have the original context, we could leave the goroutine + // labels – normally, span is finished at the very end of the goroutine's + // lifetime, so no significant side effects should take place. + w := spanWrapper{ + parentPprofCtx: parentCtx, + currentPprofCtx: ctx, + } + w.Span = span.SetTag(profileIDTagKey, spanID) + return &w +} + +type spanWrapper struct { + parentPprofCtx context.Context + currentPprofCtx context.Context + opentracing.Span +} + +func (s *spanWrapper) Finish() { + s.Span.Finish() + pprof.SetGoroutineLabels(s.parentPprofCtx) + s.currentPprofCtx = s.parentPprofCtx +} + +// sampledSpanID returns the span ID, if the span is sampled, +// otherwise an empty string is returned. 
+func sampledSpanID(spanCtx jaeger.SpanContext) string { + if spanCtx.IsSampled() { + return spanCtx.SpanID().String() + } + return "" +} diff --git a/vendor/github.com/grafana/dskit/spanprofiler/tracer.go b/vendor/github.com/grafana/dskit/spanprofiler/tracer.go new file mode 100644 index 0000000000000..c28b52b11d444 --- /dev/null +++ b/vendor/github.com/grafana/dskit/spanprofiler/tracer.go @@ -0,0 +1,109 @@ +package spanprofiler + +import ( + "context" + "unsafe" + + "github.com/opentracing/opentracing-go" + "github.com/uber/jaeger-client-go" +) + +const ( + profileIDTagKey = "pyroscope.profile.id" + + spanIDLabelName = "span_id" + spanNameLabelName = "span_name" +) + +type tracer struct{ opentracing.Tracer } + +// NewTracer creates a new opentracing.Tracer with the span profiler integrated. +// +// For efficiency, the tracer selectively records profiles for _root_ spans +// — the initial _local_ span in a process — since a trace may encompass +// thousands of spans. All stack trace samples accumulated during the execution +// of their child spans contribute to the root span's profile. In practical +// terms, this signifies that, for instance, an HTTP request results in a +// singular profile, irrespective of the numerous spans within the trace. It's +// important to note that these profiles don't extend beyond the boundaries of +// a single process. +// +// The limitation of this approach is that only spans created within the same +// goroutine, or its children, as the parent are taken into account. +// Consequently, in scenarios involving asynchronous execution, where the parent +// span context is passed to another goroutine, explicit profiling becomes +// necessary using `spanprofiler.StartSpanFromContext`. +func NewTracer(tr opentracing.Tracer) opentracing.Tracer { return &tracer{tr} } + +func (t *tracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span { + span := t.Tracer.StartSpan(operationName, opts...) + spanCtx, ok := span.Context().(jaeger.SpanContext) + if !ok { + return span + } + // pprof labels are attached only once, at the span root level. + if !isRootSpan(opts...) { + return span + } + // The pprof label API assumes that pairs of labels are passed through the + // context. Unfortunately, the opentracing Tracer API doesn't match this + // concept: this makes it impossible to save an existing pprof context and + // all the original pprof labels associated with the goroutine. + ctx := context.Background() + return wrapJaegerSpanWithGoroutineLabels(ctx, span, operationName, sampledSpanID(spanCtx)) +} + +// isRootSpan reports whether the span is a root span. +// +// There are only two valid cases: if the span is the first span in the trace, +// or is the first _local_ span in the trace. +// +// An exception is made for FollowsFrom reference: spans without an explicit +// parent are considered as root ones. +func isRootSpan(opts ...opentracing.StartSpanOption) bool { + parent, ok := parentSpanContextFromRef(opts...) + return !ok || isRemoteSpan(parent) +} + +// parentSpanContextFromRef returns the first parent reference. 
+func parentSpanContextFromRef(options ...opentracing.StartSpanOption) (sc jaeger.SpanContext, ok bool) { + var sso opentracing.StartSpanOptions + for _, option := range options { + option.Apply(&sso) + } + for _, ref := range sso.References { + if ref.Type == opentracing.ChildOfRef && ref.ReferencedContext != nil { + sc, ok = ref.ReferencedContext.(jaeger.SpanContext) + return sc, ok + } + } + return sc, ok +} + +// isRemoteSpan reports whether the span context represents a remote parent. +// +// NOTE(kolesnikovae): this is ugly, but the only reliable method I found. +// The opentracing-go package and Jaeger client are not meant to change as +// both are deprecated. +func isRemoteSpan(c jaeger.SpanContext) bool { + jaegerCtx := *(*jaegerSpanCtx)(unsafe.Pointer(&c)) + return jaegerCtx.remote +} + +// jaegerSpanCtx represents memory layout of the jaeger.SpanContext type. +type jaegerSpanCtx struct { + traceID [16]byte // TraceID + spanID [8]byte // SpanID + parentID [8]byte // SpanID + baggage uintptr // map[string]string + debugID [2]uintptr // string + + // samplingState is a pointer to a struct that has "localRootSpan" member, + // which we could probably use: that would allow omitting quite expensive + // parentSpanContextFromRef call. However, interpreting the pointer and + // the complex struct memory layout is more complicated and dangerous. + samplingState uintptr + + // remote indicates that span context represents a remote parent + remote bool +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 8ce4557f461ba..b69a2f1e5315d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -894,6 +894,7 @@ github.com/grafana/dskit/server github.com/grafana/dskit/services github.com/grafana/dskit/signals github.com/grafana/dskit/spanlogger +github.com/grafana/dskit/spanprofiler github.com/grafana/dskit/tenant github.com/grafana/dskit/test github.com/grafana/dskit/tracing From ef75ba35d6b24d0ad2835789d7f4652f2ec244d0 Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Fri, 12 Jan 2024 13:15:03 +0100 Subject: [PATCH 15/21] Deprecate and flip -legacy-read-mode to false by default (#11665) **What this PR does / why we need it**: This PR addresses a pending TODO to flip the value of the `-legacy-read-mode` to false by default. We also deprecate it as we plan to remove it after Loki 3.0. --- CHANGELOG.md | 1 + docs/sources/setup/upgrade/_index.md | 1 + pkg/loki/loki.go | 7 +++---- pkg/loki/modules_test.go | 3 +++ tools/deprecated-config-checker/checker/checker_test.go | 1 + tools/deprecated-config-checker/deprecated-config.yaml | 1 + tools/deprecated-config-checker/test-fixtures/config.yaml | 1 + 7 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e723e64176c7..57aeda6ad719b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -69,6 +69,7 @@ * [10959](https://github.com/grafana/loki/pull/10959) **slim-bean** introduce a backoff wait on subquery retries. * [11121](https://github.com/grafana/loki/pull/11121) **periklis** Ensure all lifecycler cfgs ref a valid IPv6 addr and port combination * [10650](https://github.com/grafana/loki/pull/10650) **matthewpi** Ensure the frontend uses a valid IPv6 addr and port combination +* [11665](https://github.com/grafana/loki/pull/11665) **salvacorts** Deprecate and flip `-legacy-read-mode` flag to `false` by default. 
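For operators who still depend on the legacy read path, a minimal sketch of how the deprecated behavior can be kept until the flag is removed. This assumes the root-level Loki configuration file; the key name `legacy_read_target` comes from the `Config` struct's YAML tag in this patch, and the value shown is only an example.

```yaml
# Example Loki config override (deprecated; planned for removal after Loki 3.0).
# Equivalent to passing -legacy-read-mode=true on the command line.
legacy_read_target: true
```
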
#### Promtail diff --git a/docs/sources/setup/upgrade/_index.md b/docs/sources/setup/upgrade/_index.md index 663201820e1e6..84fac5835b31f 100644 --- a/docs/sources/setup/upgrade/_index.md +++ b/docs/sources/setup/upgrade/_index.md @@ -165,6 +165,7 @@ This new metric will provide a more clear signal that there is an issue with ing | `querier.tsdb-max-query-parallelism` | 128 | 512 | - | | `query-scheduler.max-outstanding-requests-per-tenant` | 32000 | 100 | - | | `validation.max-label-names-per-series` | 15 | 30 | - | +| `legacy-read-mode` | false | true | Deprecated. It will be removed in the next minor release. | {{% /responsive-table %}} #### Write dedupe cache is deprecated diff --git a/pkg/loki/loki.go b/pkg/loki/loki.go index d4b58fac838f3..6ef0572d6bad0 100644 --- a/pkg/loki/loki.go +++ b/pkg/loki/loki.go @@ -101,7 +101,7 @@ type Config struct { Tracing tracing.Config `yaml:"tracing"` Analytics analytics.Config `yaml:"analytics"` - LegacyReadTarget bool `yaml:"legacy_read_target,omitempty" doc:"hidden"` + LegacyReadTarget bool `yaml:"legacy_read_target,omitempty" doc:"hidden|deprecated"` Common common.Config `yaml:"common,omitempty"` @@ -136,9 +136,8 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) { "It will, however, distort metrics, because it is counted as live memory. ", ) - //TODO(trevorwhitney): flip this to false with Loki 3.0 - f.BoolVar(&c.LegacyReadTarget, "legacy-read-mode", true, "Set to false to disable the legacy read mode and use new scalable mode with 3rd backend target. "+ - "The default will be flipped to false in the next Loki release.") + f.BoolVar(&c.LegacyReadTarget, "legacy-read-mode", false, "Deprecated. Set to true to enable the legacy read mode which includes the components from the backend target. "+ + "This setting is deprecated and will be removed in the next minor release.") f.DurationVar(&c.ShutdownDelay, "shutdown-delay", 0, "How long to wait between SIGTERM and shutdown. After receiving SIGTERM, Loki will report 503 Service Unavailable status via /ready endpoint.") diff --git a/pkg/loki/modules_test.go b/pkg/loki/modules_test.go index 19980e2944120..0d07242b75370 100644 --- a/pkg/loki/modules_test.go +++ b/pkg/loki/modules_test.go @@ -175,6 +175,9 @@ func TestIndexGatewayRingMode_when_TargetIsLegacyReadOrBackend(t *testing.T) { { name: "leagcy read", target: Read, + transformer: func(cfg *Config) { + cfg.LegacyReadTarget = true + }, }, { name: "backend", diff --git a/tools/deprecated-config-checker/checker/checker_test.go b/tools/deprecated-config-checker/checker/checker_test.go index efecefb1700f0..d9fdf4dc607b0 100644 --- a/tools/deprecated-config-checker/checker/checker_test.go +++ b/tools/deprecated-config-checker/checker/checker_test.go @@ -39,6 +39,7 @@ var ( } expectedConfigDeprecates = []string{ + "legacy-read-mode", "ruler.remote_write.client", "index_gateway.ring.replication_factor", "storage_config.bigtable", diff --git a/tools/deprecated-config-checker/deprecated-config.yaml b/tools/deprecated-config-checker/deprecated-config.yaml index ab4c3c073d738..46b89971bdd2e 100644 --- a/tools/deprecated-config-checker/deprecated-config.yaml +++ b/tools/deprecated-config-checker/deprecated-config.yaml @@ -13,6 +13,7 @@ # _msg: "Use tsdb (preferred) or boltdb-shipper instead." # # Note that even though the configs in schema_config takes a list, here we specify the deprecated fields for each item in the list. +legacy-read-mode: "Legacy read SSD mode is deprecated and will be eventually removed. Use the new read and backend targets." 
ruler: remote_write: diff --git a/tools/deprecated-config-checker/test-fixtures/config.yaml b/tools/deprecated-config-checker/test-fixtures/config.yaml index d5a326c8647f3..be875f3ac10f8 100644 --- a/tools/deprecated-config-checker/test-fixtures/config.yaml +++ b/tools/deprecated-config-checker/test-fixtures/config.yaml @@ -1,4 +1,5 @@ auth_enabled: false +legacy-read-mode: true server: http_listen_port: 3100 From 0694d797dec010393567704211638219c1971b46 Mon Sep 17 00:00:00 2001 From: Joao Marcal Date: Fri, 12 Jan 2024 13:40:24 +0100 Subject: [PATCH 16/21] mixins: add route to write Distributor Latency dashboard (#11637) Co-authored-by: Periklis Tsirakidis --- CHANGELOG.md | 1 + .../loki-mixin-compiled-ssd/dashboards/loki-writes.json | 6 +++--- production/loki-mixin-compiled/dashboards/loki-writes.json | 6 +++--- production/loki-mixin/dashboards/loki-writes.libsonnet | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57aeda6ad719b..fbf61c789213b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -98,6 +98,7 @@ #### Mixins * [11087](https://github.com/grafana/loki/pull/11087) **JoaoBraveCoding**: Adds structured metadata panels for ingested data +* [11637](https://github.com/grafana/loki/pull/11637) **JoaoBraveCoding**: Add route to write Distributor Latency dashboard #### Fixes diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json b/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json index bcd620e69e4a9..9d2544082d158 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(cluster_job:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}) / sum(cluster_job:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"})", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", 
route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", diff --git a/production/loki-mixin-compiled/dashboards/loki-writes.json b/production/loki-mixin-compiled/dashboards/loki-writes.json index fdb347f56055f..b7cf83f95f44b 100644 --- a/production/loki-mixin-compiled/dashboards/loki-writes.json +++ b/production/loki-mixin-compiled/dashboards/loki-writes.json @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(cluster_job:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}) / sum(cluster_job:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", diff --git a/production/loki-mixin/dashboards/loki-writes.libsonnet b/production/loki-mixin/dashboards/loki-writes.libsonnet index a12f4f7cea6e0..d5c85337a29db 100644 --- a/production/loki-mixin/dashboards/loki-writes.libsonnet +++ b/production/loki-mixin/dashboards/loki-writes.libsonnet @@ -65,7 +65,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.distributor, + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.distributor + [utils.selector.eq('route', 'api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle')], ) ) ) From c7ad168a33f131f6feeabae4c985874e03e17aca Mon Sep 17 00:00:00 2001 From: Joao Marcal Date: Fri, 12 Jan 2024 14:18:33 +0100 Subject: [PATCH 17/21] operator: updates mixins to fix structured metadata dashboards (#11671) --- operator/CHANGELOG.md | 1 + .../grafana-dashboard-lokistack-reads.json | 24 +++++++++---------- .../grafana-dashboard-lokistack-writes.json | 12 +++++----- operator/jsonnet/config.libsonnet | 1 - operator/jsonnet/jsonnetfile.json | 2 +- 
operator/jsonnet/jsonnetfile.lock.json | 4 ++-- 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/operator/CHANGELOG.md b/operator/CHANGELOG.md index f6cfa9a5cda01..ad9b319b625c9 100644 --- a/operator/CHANGELOG.md +++ b/operator/CHANGELOG.md @@ -1,5 +1,6 @@ ## Main +- [11671](https://github.com/grafana/loki/pull/11671) **JoaoBraveCoding**: Update mixins to fix structured metadata dashboards - [11624](https://github.com/grafana/loki/pull/11624) **xperimental**: React to changes in ConfigMap used for storage CA - [11481](https://github.com/grafana/loki/pull/11481) **JoaoBraveCoding**: Adds AWS STS support - [11533](https://github.com/grafana/loki/pull/11533) **periklis**: Add serviceaccount per LokiStack resource diff --git a/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-reads.json b/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-reads.json index e1adb4dd6cc0a..df5ea66e6d2a5 100644 --- a/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-reads.json +++ b/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-reads.json @@ -217,9 +217,9 @@ "group": "A", "mode": "normal" } - } - }, - "unit": "s" + }, + "unit": "s" + } }, "fill": 1, "id": 3, @@ -493,9 +493,9 @@ "group": "A", "mode": "normal" } - } - }, - "unit": "s" + }, + "unit": "s" + } }, "fill": 1, "id": 6, @@ -769,9 +769,9 @@ "group": "A", "mode": "normal" } - } - }, - "unit": "s" + }, + "unit": "s" + } }, "fill": 1, "id": 9, @@ -1045,9 +1045,9 @@ "group": "A", "mode": "normal" } - } - }, - "unit": "s" + }, + "unit": "s" + } }, "fill": 1, "id": 15, diff --git a/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-writes.json b/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-writes.json index 58107485d370c..8053d353b1135 100644 --- a/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-writes.json +++ b/operator/internal/manifests/openshift/internal/dashboards/static/grafana-dashboard-lokistack-writes.json @@ -66,7 +66,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{namespace=\"$namespace\",job=~\".+-distributor-http\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{status}}", @@ -142,7 +142,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le) (namespace_job_route:loki_request_duration_seconds_bucket:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"loki_api_v1_push\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le) (namespace_job_route:loki_request_duration_seconds_bucket:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", 
route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "99th Percentile", @@ -150,7 +150,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (le) (namespace_job_route:loki_request_duration_seconds_bucket:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"loki_api_v1_push\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le) (namespace_job_route:loki_request_duration_seconds_bucket:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", "format": "time_series", "intervalFactor": 2, "legendFormat": "50th Percentile", @@ -158,7 +158,7 @@ "step": 10 }, { - "expr": "1e3 * sum(namespace_job_route:loki_request_duration_seconds_sum:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"loki_api_v1_push\"}) / sum(namespace_job_route:loki_request_duration_seconds_count:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"loki_api_v1_push\"})", + "expr": "1e3 * sum(namespace_job_route:loki_request_duration_seconds_sum:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}) / sum(namespace_job_route:loki_request_duration_seconds_count:sum_rate{namespace=\"$namespace\", job=~\".+-distributor-http\", route=\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Average", @@ -246,7 +246,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\",}[$__rate_interval])) / sum(rate(loki_distributor_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\",}[$__rate_interval]))", + "expr": "sum (rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",}[$__rate_interval])) / sum(rate(loki_distributor_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "bytes", @@ -322,7 +322,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (tenant) (rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\",}[$__rate_interval])) / ignoring(tenant) group_left sum(rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",route=\"loki_api_v1_push\",}[$__rate_interval]))", + "expr": "sum by (tenant) (rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",}[$__rate_interval])) / ignoring(tenant) group_left sum(rate(loki_distributor_structured_metadata_bytes_received_total{namespace=\"$namespace\",job=~\".+-distributor-http\",}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", diff --git a/operator/jsonnet/config.libsonnet b/operator/jsonnet/config.libsonnet index efdc1c6103d5c..82dc625da2a4f 100644 --- a/operator/jsonnet/config.libsonnet +++ b/operator/jsonnet/config.libsonnet @@ -238,7 +238,6 @@ local utils = (import 'github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonn distributor:: [ utils.selector.eq('namespace', '$namespace'), 
utils.selector.re('job', '.+-distributor-http'), - utils.selector.eq('route', 'loki_api_v1_push'), ], ingester:: [ utils.selector.eq('namespace', '$namespace'), diff --git a/operator/jsonnet/jsonnetfile.json b/operator/jsonnet/jsonnetfile.json index 4b25fb159b3d8..2bc2549a3c600 100644 --- a/operator/jsonnet/jsonnetfile.json +++ b/operator/jsonnet/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "production/loki-mixin" } }, - "version": "bd505f8e2d37172ff35a89f4ac42efec9566a263" + "version": "0694d797dec010393567704211638219c1971b46" } ], "legacyImports": true diff --git a/operator/jsonnet/jsonnetfile.lock.json b/operator/jsonnet/jsonnetfile.lock.json index 27d2e6e8756c6..3a0710db7565f 100644 --- a/operator/jsonnet/jsonnetfile.lock.json +++ b/operator/jsonnet/jsonnetfile.lock.json @@ -38,8 +38,8 @@ "subdir": "production/loki-mixin" } }, - "version": "bd505f8e2d37172ff35a89f4ac42efec9566a263", - "sum": "yiXXBAcWfMkYSJthU2OZSgHHmveWvmRT6aM1V0MaAjs=" + "version": "0694d797dec010393567704211638219c1971b46", + "sum": "Pw/9T/ZRjXLqTivU5xkJnrP5kFdET2FDUjjG1G96GmQ=" }, { "source": { From 6ae46dc6efc4052be2d028262ec919d53f07cb39 Mon Sep 17 00:00:00 2001 From: Dreamy Date: Fri, 12 Jan 2024 15:42:58 +0100 Subject: [PATCH 18/21] helm: added missing namespace to query-scheduler-discovery service (#11648) **What this PR does / why we need it**: This PR adds the namespace attribute to the query-scheduler-service descriptor because it is needed when deploying loki in non-default namespace **Which issue(s) this PR fixes**: Fixes #10048 **Special notes for your reviewer**: **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [x] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [x] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [x] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) I hope I did everything right with the changelog, chat and readme updates since this my first PR, please tell me if something needs fixing. --- production/helm/loki/CHANGELOG.md | 4 ++++ production/helm/loki/Chart.yaml | 2 +- production/helm/loki/README.md | 2 +- .../loki/templates/backend/query-scheduler-discovery.yaml | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/production/helm/loki/CHANGELOG.md b/production/helm/loki/CHANGELOG.md index ca04f5d18ce5d..272aa69428852 100644 --- a/production/helm/loki/CHANGELOG.md +++ b/production/helm/loki/CHANGELOG.md @@ -13,6 +13,10 @@ Entries should include a reference to the pull request that introduced the chang [//]: # ( : do not remove this line. This locator is used by the CI pipeline to automatically create a changelog entry for each new Loki release. Add other chart versions and respective changelog entries bellow this line.) +## 5.41.6 + +- [BUGFIX] Added missing namespace to query-scheduler-discovery service when deploying loki in a specific namespace. + ## 5.41.5 - [BUGFIX] Added "swift" type object storage to resolve Loki HELM Chart error. 
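To illustrate the 5.41.6 namespace fix noted above, here is a rough sketch of how the query-scheduler-discovery Service metadata would render once `namespace: {{ $.Release.Namespace }}` is added to the template; the release namespace `loki-prod` is a placeholder and the chart's selector labels are elided.

```yaml
apiVersion: v1
kind: Service
metadata:
  name: query-scheduler-discovery
  namespace: loki-prod  # rendered from {{ $.Release.Namespace }}; previously missing from the manifest
  labels:
    prometheus.io/service-monitor: "false"
```
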
diff --git a/production/helm/loki/Chart.yaml b/production/helm/loki/Chart.yaml index 1e08c0c8f0d1e..cb43a70c965b7 100644 --- a/production/helm/loki/Chart.yaml +++ b/production/helm/loki/Chart.yaml @@ -3,7 +3,7 @@ name: loki description: Helm chart for Grafana Loki in simple, scalable mode type: application appVersion: 2.9.3 -version: 5.41.5 +version: 5.41.6 home: https://grafana.github.io/helm-charts sources: - https://github.com/grafana/loki diff --git a/production/helm/loki/README.md b/production/helm/loki/README.md index ec3360d378d76..6b4ec081e9bba 100644 --- a/production/helm/loki/README.md +++ b/production/helm/loki/README.md @@ -1,6 +1,6 @@ # loki -![Version: 5.41.5](https://img.shields.io/badge/Version-5.41.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.9.3](https://img.shields.io/badge/AppVersion-2.9.3-informational?style=flat-square) +![Version: 5.41.6](https://img.shields.io/badge/Version-5.41.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.9.3](https://img.shields.io/badge/AppVersion-2.9.3-informational?style=flat-square) Helm chart for Grafana Loki in simple, scalable mode diff --git a/production/helm/loki/templates/backend/query-scheduler-discovery.yaml b/production/helm/loki/templates/backend/query-scheduler-discovery.yaml index 01865863e57ef..a9dedbb545649 100644 --- a/production/helm/loki/templates/backend/query-scheduler-discovery.yaml +++ b/production/helm/loki/templates/backend/query-scheduler-discovery.yaml @@ -5,6 +5,7 @@ apiVersion: v1 kind: Service metadata: name: query-scheduler-discovery + namespace: {{ $.Release.Namespace }} labels: {{- include "loki.backendSelectorLabels" . | nindent 4 }} prometheus.io/service-monitor: "false" From a5aa8b315da22ef94763f9e048b1a77840e724fd Mon Sep 17 00:00:00 2001 From: Vladyslav Diachenko <82767850+vlad-diachenko@users.noreply.github.com> Date: Fri, 12 Jan 2024 19:21:20 +0200 Subject: [PATCH 19/21] [bloom-compactor] downloading chunks in batches (#11649) **What this PR does / why we need it**: Added chunks batches iterator to download chunks in batches instead of downloading all of them at once. Otherwise, when the stream contains a lot of chunks, it can lead to OOM. **Special notes for your reviewer**: **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [x] Documentation added - [x] Tests updated - [ ] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. 
[Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Signed-off-by: Vladyslav Diachenko --- docs/sources/configure/_index.md | 4 + pkg/bloomcompactor/bloomcompactor.go | 5 +- pkg/bloomcompactor/chunkcompactor.go | 53 ++++---- pkg/bloomcompactor/chunkcompactor_test.go | 12 +- pkg/bloomcompactor/chunksbatchesiterator.go | 48 +++++++ .../chunksbatchesiterator_test.go | 96 ++++++++++++++ pkg/bloomcompactor/config.go | 1 + pkg/bloomcompactor/mergecompactor.go | 10 +- pkg/bloomcompactor/sharding_test.go | 15 ++- pkg/storage/bloom/v1/bloom_tokenizer.go | 120 ++++++++++-------- pkg/storage/bloom/v1/bloom_tokenizer_test.go | 4 +- pkg/validation/limits.go | 6 + pkg/validation/limits_test.go | 9 +- 13 files changed, 280 insertions(+), 103 deletions(-) create mode 100644 pkg/bloomcompactor/chunksbatchesiterator.go create mode 100644 pkg/bloomcompactor/chunksbatchesiterator_test.go diff --git a/docs/sources/configure/_index.md b/docs/sources/configure/_index.md index 9bf65788c8a22..51ecb12af62f1 100644 --- a/docs/sources/configure/_index.md +++ b/docs/sources/configure/_index.md @@ -3094,6 +3094,10 @@ shard_streams: # CLI flag: -bloom-compactor.enable-compaction [bloom_compactor_enable_compaction: | default = false] +# The batch size of the chunks the bloom-compactor downloads at once. +# CLI flag: -bloom-compactor.chunks-batch-size +[bloom_compactor_chunks_batch_size: | default = 100] + # Length of the n-grams created when computing blooms from log lines. # CLI flag: -bloom-compactor.ngram-length [bloom_ngram_length: | default = 4] diff --git a/pkg/bloomcompactor/bloomcompactor.go b/pkg/bloomcompactor/bloomcompactor.go index a5f1185f57e84..dbe307ff18822 100644 --- a/pkg/bloomcompactor/bloomcompactor.go +++ b/pkg/bloomcompactor/bloomcompactor.go @@ -535,8 +535,7 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, return err } - fpRate := c.limits.BloomFalsePositiveRate(job.tenantID) - resultingBlock, err = compactNewChunks(ctx, logger, job, fpRate, bt, storeClient.chunk, builder) + resultingBlock, err = compactNewChunks(ctx, logger, job, bt, storeClient.chunk, builder, c.limits) if err != nil { return level.Error(logger).Log("msg", "failed compacting new chunks", "err", err) } @@ -545,7 +544,7 @@ func (c *Compactor) runCompact(ctx context.Context, logger log.Logger, job Job, // When already compacted metas exists, we need to merge all blocks with amending blooms with new series level.Info(logger).Log("msg", "already compacted metas exists, use mergeBlockBuilder") - var populate = createPopulateFunc(ctx, logger, job, storeClient, bt) + var populate = createPopulateFunc(ctx, job, storeClient, bt, c.limits) seriesIter := makeSeriesIterFromSeriesMeta(job) diff --git a/pkg/bloomcompactor/chunkcompactor.go b/pkg/bloomcompactor/chunkcompactor.go index f0f59882b31c4..c4993ccc62a59 100644 --- a/pkg/bloomcompactor/chunkcompactor.go +++ b/pkg/bloomcompactor/chunkcompactor.go @@ -22,7 +22,7 @@ import ( ) type compactorTokenizer interface { - PopulateSeriesWithBloom(bloom *v1.SeriesWithBloom, chunks []chunk.Chunk) error + PopulateSeriesWithBloom(bloom *v1.SeriesWithBloom, chunkBatchesIterator v1.Iterator[[]chunk.Chunk]) error } type chunkClient interface { @@ -86,7 +86,7 @@ func makeChunkRefs(chksMetas []tsdbindex.ChunkMeta, tenant string, fp model.Fing return chunkRefs } -func buildBloomFromSeries(seriesMeta seriesMeta, fpRate float64, tokenizer compactorTokenizer, chunks []chunk.Chunk) (v1.SeriesWithBloom, error) { +func 
buildBloomFromSeries(seriesMeta seriesMeta, fpRate float64, tokenizer compactorTokenizer, chunks v1.Iterator[[]chunk.Chunk]) (v1.SeriesWithBloom, error) { // Create a bloom for this series bloomForChks := v1.SeriesWithBloom{ Series: &v1.Series{ @@ -155,21 +155,20 @@ func createLocalDirName(workingDir string, job Job) string { } // Compacts given list of chunks, uploads them to storage and returns a list of bloomBlocks -func compactNewChunks( - ctx context.Context, +func compactNewChunks(ctx context.Context, logger log.Logger, job Job, - fpRate float64, bt compactorTokenizer, storeClient chunkClient, builder blockBuilder, + limits Limits, ) (bloomshipper.Block, error) { // Ensure the context has not been canceled (ie. compactor shutdown has been triggered). if err := ctx.Err(); err != nil { return bloomshipper.Block{}, err } - bloomIter := newLazyBloomBuilder(ctx, job, storeClient, bt, fpRate, logger) + bloomIter := newLazyBloomBuilder(ctx, job, storeClient, bt, logger, limits) // Build and upload bloomBlock to storage block, err := buildBlockFromBlooms(ctx, logger, builder, bloomIter, job) @@ -182,13 +181,14 @@ func compactNewChunks( } type lazyBloomBuilder struct { - ctx context.Context - metas v1.Iterator[seriesMeta] - tenant string - client chunkClient - bt compactorTokenizer - fpRate float64 - logger log.Logger + ctx context.Context + metas v1.Iterator[seriesMeta] + tenant string + client chunkClient + bt compactorTokenizer + fpRate float64 + logger log.Logger + chunksBatchSize int cur v1.SeriesWithBloom // retured by At() err error // returned by Err() @@ -198,15 +198,16 @@ type lazyBloomBuilder struct { // which are used by the blockBuilder to write a bloom block. // We use an interator to avoid loading all blooms into memory first, before // building the block. 
-func newLazyBloomBuilder(ctx context.Context, job Job, client chunkClient, bt compactorTokenizer, fpRate float64, logger log.Logger) *lazyBloomBuilder { +func newLazyBloomBuilder(ctx context.Context, job Job, client chunkClient, bt compactorTokenizer, logger log.Logger, limits Limits) *lazyBloomBuilder { return &lazyBloomBuilder{ - ctx: ctx, - metas: v1.NewSliceIter(job.seriesMetas), - client: client, - tenant: job.tenantID, - bt: bt, - fpRate: fpRate, - logger: logger, + ctx: ctx, + metas: v1.NewSliceIter(job.seriesMetas), + client: client, + tenant: job.tenantID, + bt: bt, + fpRate: limits.BloomFalsePositiveRate(job.tenantID), + logger: logger, + chunksBatchSize: limits.BloomCompactorChunksBatchSize(job.tenantID), } } @@ -218,20 +219,18 @@ func (it *lazyBloomBuilder) Next() bool { } meta := it.metas.At() - // Get chunks data from list of chunkRefs - chks, err := it.client.GetChunks(it.ctx, makeChunkRefs(meta.chunkRefs, it.tenant, meta.seriesFP)) + batchesIterator, err := newChunkBatchesIterator(it.ctx, it.client, makeChunkRefs(meta.chunkRefs, it.tenant, meta.seriesFP), it.chunksBatchSize) if err != nil { it.err = err it.cur = v1.SeriesWithBloom{} - level.Debug(it.logger).Log("err in getChunks", err) + level.Debug(it.logger).Log("msg", "err creating chunks batches iterator", "err", err) return false } - - it.cur, err = buildBloomFromSeries(meta, it.fpRate, it.bt, chks) + it.cur, err = buildBloomFromSeries(meta, it.fpRate, it.bt, batchesIterator) if err != nil { it.err = err it.cur = v1.SeriesWithBloom{} - level.Debug(it.logger).Log("err in buildBloomFromSeries", err) + level.Debug(it.logger).Log("msg", "err in buildBloomFromSeries", "err", err) return false } return true diff --git a/pkg/bloomcompactor/chunkcompactor_test.go b/pkg/bloomcompactor/chunkcompactor_test.go index 2d31e05f18f83..8bc94fd26537a 100644 --- a/pkg/bloomcompactor/chunkcompactor_test.go +++ b/pkg/bloomcompactor/chunkcompactor_test.go @@ -59,7 +59,7 @@ func TestChunkCompactor_BuildBloomFromSeries(t *testing.T) { chunks := []chunk.Chunk{createTestChunk(fp, label)} mbt := mockBloomTokenizer{} - bloom, err := buildBloomFromSeries(seriesMeta, fpRate, &mbt, chunks) + bloom, err := buildBloomFromSeries(seriesMeta, fpRate, &mbt, v1.NewSliceIter([][]chunk.Chunk{chunks})) require.NoError(t, err) require.Equal(t, seriesMeta.seriesFP, bloom.Series.Fingerprint) require.Equal(t, chunks, mbt.chunks) @@ -110,7 +110,7 @@ func TestChunkCompactor_CompactNewChunks(t *testing.T) { pbb := mockPersistentBlockBuilder{} // Run Compaction - compactedBlock, err := compactNewChunks(context.Background(), logger, job, fpRate, &mbt, &mcc, &pbb) + compactedBlock, err := compactNewChunks(context.Background(), logger, job, &mbt, &mcc, &pbb, mockLimits{fpRate: fpRate}) // Validate Compaction Succeeds require.NoError(t, err) @@ -169,7 +169,7 @@ func TestLazyBloomBuilder(t *testing.T) { mbt := &mockBloomTokenizer{} mcc := &mockChunkClient{} - it := newLazyBloomBuilder(context.Background(), job, mcc, mbt, fpRate, logger) + it := newLazyBloomBuilder(context.Background(), job, mcc, mbt, logger, mockLimits{chunksDownloadingBatchSize: 10, fpRate: fpRate}) // first seriesMeta has 1 chunks require.True(t, it.Next()) @@ -199,8 +199,10 @@ type mockBloomTokenizer struct { chunks []chunk.Chunk } -func (mbt *mockBloomTokenizer) PopulateSeriesWithBloom(_ *v1.SeriesWithBloom, c []chunk.Chunk) error { - mbt.chunks = append(mbt.chunks, c...) 
+func (mbt *mockBloomTokenizer) PopulateSeriesWithBloom(_ *v1.SeriesWithBloom, c v1.Iterator[[]chunk.Chunk]) error { + for c.Next() { + mbt.chunks = append(mbt.chunks, c.At()...) + } return nil } diff --git a/pkg/bloomcompactor/chunksbatchesiterator.go b/pkg/bloomcompactor/chunksbatchesiterator.go new file mode 100644 index 0000000000000..a4494b02b7e47 --- /dev/null +++ b/pkg/bloomcompactor/chunksbatchesiterator.go @@ -0,0 +1,48 @@ +package bloomcompactor + +import ( + "context" + "errors" + + "github.com/grafana/loki/pkg/storage/chunk" +) + +type chunksBatchesIterator struct { + context context.Context + client chunkClient + chunksToDownload []chunk.Chunk + batchSize int + + currentBatch []chunk.Chunk + err error +} + +func newChunkBatchesIterator(context context.Context, client chunkClient, chunksToDownload []chunk.Chunk, batchSize int) (*chunksBatchesIterator, error) { + if batchSize <= 0 { + return nil, errors.New("batchSize must be greater than 0") + } + return &chunksBatchesIterator{context: context, client: client, chunksToDownload: chunksToDownload, batchSize: batchSize}, nil +} + +func (c *chunksBatchesIterator) Next() bool { + if len(c.chunksToDownload) == 0 { + return false + } + batchSize := c.batchSize + chunksToDownloadCount := len(c.chunksToDownload) + if chunksToDownloadCount < batchSize { + batchSize = chunksToDownloadCount + } + chunksToDownload := c.chunksToDownload[:batchSize] + c.chunksToDownload = c.chunksToDownload[batchSize:] + c.currentBatch, c.err = c.client.GetChunks(c.context, chunksToDownload) + return c.err == nil +} + +func (c *chunksBatchesIterator) Err() error { + return c.err +} + +func (c *chunksBatchesIterator) At() []chunk.Chunk { + return c.currentBatch +} diff --git a/pkg/bloomcompactor/chunksbatchesiterator_test.go b/pkg/bloomcompactor/chunksbatchesiterator_test.go new file mode 100644 index 0000000000000..170f2662b508b --- /dev/null +++ b/pkg/bloomcompactor/chunksbatchesiterator_test.go @@ -0,0 +1,96 @@ +package bloomcompactor + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/grafana/loki/pkg/storage/chunk" + tsdbindex "github.com/grafana/loki/pkg/storage/stores/shipper/indexshipper/tsdb/index" +) + +func Test_chunksBatchesIterator(t *testing.T) { + tests := map[string]struct { + batchSize int + chunksToDownload []chunk.Chunk + constructorError error + + hadNextCount int + }{ + "expected error if batch size is set to 0": { + batchSize: 0, + constructorError: errors.New("batchSize must be greater than 0"), + }, + "expected no error if there are no chunks": { + hadNextCount: 0, + batchSize: 10, + }, + "expected 1 call to the client": { + chunksToDownload: createFakeChunks(10), + hadNextCount: 1, + batchSize: 20, + }, + "expected 1 call to the client(2)": { + chunksToDownload: createFakeChunks(10), + hadNextCount: 1, + batchSize: 10, + }, + "expected 2 calls to the client": { + chunksToDownload: createFakeChunks(10), + hadNextCount: 2, + batchSize: 6, + }, + "expected 10 calls to the client": { + chunksToDownload: createFakeChunks(10), + hadNextCount: 10, + batchSize: 1, + }, + } + for name, data := range tests { + t.Run(name, func(t *testing.T) { + client := &fakeClient{} + iterator, err := newChunkBatchesIterator(context.Background(), client, data.chunksToDownload, data.batchSize) + if data.constructorError != nil { + require.Equal(t, err, data.constructorError) + return + } + hadNextCount := 0 + var downloadedChunks []chunk.Chunk + for iterator.Next() { + hadNextCount++ + downloaded := 
iterator.At() + downloadedChunks = append(downloadedChunks, downloaded...) + require.LessOrEqual(t, len(downloaded), data.batchSize) + } + require.NoError(t, iterator.Err()) + require.Equal(t, data.chunksToDownload, downloadedChunks) + require.Equal(t, data.hadNextCount, client.callsCount) + require.Equal(t, data.hadNextCount, hadNextCount) + }) + } +} + +func createFakeChunks(count int) []chunk.Chunk { + metas := make([]tsdbindex.ChunkMeta, 0, count) + for i := 0; i < count; i++ { + metas = append(metas, tsdbindex.ChunkMeta{ + Checksum: uint32(i), + MinTime: int64(i), + MaxTime: int64(i + 100), + KB: uint32(i * 100), + Entries: uint32(i * 10), + }) + } + return makeChunkRefs(metas, "fake", 0xFFFF) +} + +type fakeClient struct { + callsCount int +} + +func (f *fakeClient) GetChunks(_ context.Context, chunks []chunk.Chunk) ([]chunk.Chunk, error) { + f.callsCount++ + return chunks, nil +} diff --git a/pkg/bloomcompactor/config.go b/pkg/bloomcompactor/config.go index c3969ac6af386..884034fdd043d 100644 --- a/pkg/bloomcompactor/config.go +++ b/pkg/bloomcompactor/config.go @@ -41,6 +41,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { type Limits interface { downloads.Limits BloomCompactorShardSize(tenantID string) int + BloomCompactorChunksBatchSize(userID string) int BloomCompactorMaxTableAge(tenantID string) time.Duration BloomCompactorEnabled(tenantID string) bool BloomNGramLength(tenantID string) int diff --git a/pkg/bloomcompactor/mergecompactor.go b/pkg/bloomcompactor/mergecompactor.go index 0cf55cef86a7c..6e2143f75135c 100644 --- a/pkg/bloomcompactor/mergecompactor.go +++ b/pkg/bloomcompactor/mergecompactor.go @@ -2,6 +2,7 @@ package bloomcompactor import ( "context" + "fmt" "github.com/grafana/dskit/concurrency" @@ -74,7 +75,7 @@ func makeBlockIterFromBlocks(ctx context.Context, logger log.Logger, return blockIters, blockPaths, nil } -func createPopulateFunc(ctx context.Context, logger log.Logger, job Job, storeClient storeClient, bt *v1.BloomTokenizer) func(series *v1.Series, bloom *v1.Bloom) error { +func createPopulateFunc(ctx context.Context, job Job, storeClient storeClient, bt *v1.BloomTokenizer, limits Limits) func(series *v1.Series, bloom *v1.Bloom) error { return func(series *v1.Series, bloom *v1.Bloom) error { bloomForChks := v1.SeriesWithBloom{ Series: series, @@ -95,12 +96,11 @@ func createPopulateFunc(ctx context.Context, logger log.Logger, job Job, storeCl } } - chks, err := storeClient.chunk.GetChunks(ctx, chunkRefs) + batchesIterator, err := newChunkBatchesIterator(ctx, storeClient.chunk, chunkRefs, limits.BloomCompactorChunksBatchSize(job.tenantID)) if err != nil { - level.Error(logger).Log("msg", "failed downloading chunks", "err", err) - return err + return fmt.Errorf("error creating chunks batches iterator: %w", err) } - err = bt.PopulateSeriesWithBloom(&bloomForChks, chks) + err = bt.PopulateSeriesWithBloom(&bloomForChks, batchesIterator) if err != nil { return err } diff --git a/pkg/bloomcompactor/sharding_test.go b/pkg/bloomcompactor/sharding_test.go index fc77536f6061f..4e79752279fb9 100644 --- a/pkg/bloomcompactor/sharding_test.go +++ b/pkg/bloomcompactor/sharding_test.go @@ -128,9 +128,22 @@ func TestShuffleSharding(t *testing.T) { type mockLimits struct { *validation.Overrides - bloomCompactorShardSize int + bloomCompactorShardSize int + chunksDownloadingBatchSize int + fpRate float64 +} + +func (m mockLimits) BloomFalsePositiveRate(_ string) float64 { + return m.fpRate } func (m mockLimits) BloomCompactorShardSize(_ string) int { return 
m.bloomCompactorShardSize } + +func (m mockLimits) BloomCompactorChunksBatchSize(_ string) int { + if m.chunksDownloadingBatchSize != 0 { + return m.chunksDownloadingBatchSize + } + return 1 +} diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go index 2eaeb576b318d..946aeaf54495c 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer.go @@ -2,6 +2,7 @@ package v1 import ( "context" + "fmt" "math" "time" @@ -82,75 +83,82 @@ func prefixedToken(ngram int, chk logproto.ChunkRef) ([]byte, int) { } // PopulateSeriesWithBloom is intended to be called on the write path, and is used to populate the bloom filter for a given series. -func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBloom, chunks []chunk.Chunk) error { +func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBloom, chunks Iterator[[]chunk.Chunk]) error { startTime := time.Now().UnixMilli() level.Debug(util_log.Logger).Log("msg", "PopulateSeriesWithBloom") clearCache(bt.cache) chunkTotalUncompressedSize := 0 - for idx := range chunks { - lc := chunks[idx].Data.(*chunkenc.Facade).LokiChunk() - tokenBuf, prefixLn := prefixedToken(bt.lineTokenizer.N, chunks[idx].ChunkRef) - chunkTotalUncompressedSize += lc.UncompressedSize() - - itr, err := lc.Iterator( - context.Background(), - time.Unix(0, 0), // TODO: Parameterize/better handle the timestamps? - time.Unix(0, math.MaxInt64), - logproto.FORWARD, - log.NewNoopPipeline().ForStream(chunks[idx].Metric), - ) - if err != nil { - level.Error(util_log.Logger).Log("msg", "chunk iterator cannot be created", "err", err) - return err - } - - defer itr.Close() - - for itr.Next() && itr.Error() == nil { - chunkTokenizer := NewPrefixedTokenIter(tokenBuf, prefixLn, bt.lineTokenizer.Tokens(itr.Entry().Line)) - for chunkTokenizer.Next() { - tok := chunkTokenizer.At() - if tok != nil { - str := string(tok) - _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters - if !found { - bt.cache[str] = nil - - seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) - - if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other - clearCache(bt.cache) + for chunks.Next() { + chunksBatch := chunks.At() + for idx := range chunksBatch { + lc := chunksBatch[idx].Data.(*chunkenc.Facade).LokiChunk() + tokenBuf, prefixLn := prefixedToken(bt.lineTokenizer.N, chunksBatch[idx].ChunkRef) + chunkTotalUncompressedSize += lc.UncompressedSize() + + itr, err := lc.Iterator( + context.Background(), + time.Unix(0, 0), // TODO: Parameterize/better handle the timestamps? 
+ time.Unix(0, math.MaxInt64), + logproto.FORWARD, + log.NewNoopPipeline().ForStream(chunksBatch[idx].Metric), + ) + if err != nil { + level.Error(util_log.Logger).Log("msg", "chunk iterator cannot be created", "err", err) + return err + } + + defer itr.Close() + + for itr.Next() && itr.Error() == nil { + chunkTokenizer := NewPrefixedTokenIter(tokenBuf, prefixLn, bt.lineTokenizer.Tokens(itr.Entry().Line)) + for chunkTokenizer.Next() { + tok := chunkTokenizer.At() + if tok != nil { + str := string(tok) + _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters + if !found { + bt.cache[str] = nil + + seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) + + if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other + clearCache(bt.cache) + } } } } - } - lineTokenizer := bt.lineTokenizer.Tokens(itr.Entry().Line) - for lineTokenizer.Next() { - tok := lineTokenizer.At() - if tok != nil { - str := string(tok) - _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters - if !found { - bt.cache[str] = nil - - seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) - - if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other - clearCache(bt.cache) + lineTokenizer := bt.lineTokenizer.Tokens(itr.Entry().Line) + for lineTokenizer.Next() { + tok := lineTokenizer.At() + if tok != nil { + str := string(tok) + _, found := bt.cache[str] // A cache is used ahead of the SBF, as it cuts out the costly operations of scaling bloom filters + if !found { + bt.cache[str] = nil + + seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok) + + if len(bt.cache) >= cacheSize { // While crude, this has proven efficient in performance testing. 
This speaks to the similarity in log lines near each other + clearCache(bt.cache) + } } } } - } - } - seriesWithBloom.Series.Chunks = append(seriesWithBloom.Series.Chunks, ChunkRef{ - Start: chunks[idx].From, - End: chunks[idx].Through, - Checksum: chunks[idx].Checksum, - }) - } // for each chunk + } + seriesWithBloom.Series.Chunks = append(seriesWithBloom.Series.Chunks, ChunkRef{ + Start: chunksBatch[idx].From, + End: chunksBatch[idx].Through, + Checksum: chunksBatch[idx].Checksum, + }) + } // for each chunk + } + if err := chunks.Err(); err != nil { + level.Error(util_log.Logger).Log("msg", "error downloading chunks batch", "err", err) + return fmt.Errorf("error downloading chunks batch: %w", err) + } endTime := time.Now().UnixMilli() diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go index f22c741651246..0fad08e78f080 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go @@ -123,7 +123,7 @@ func TestPopulateSeriesWithBloom(t *testing.T) { Series: &series, } - err := bt.PopulateSeriesWithBloom(&swb, chunks) + err := bt.PopulateSeriesWithBloom(&swb, NewSliceIter([][]chunk.Chunk{chunks})) require.NoError(t, err) tokenizer := NewNGramTokenizer(DefaultNGramLength, DefaultNGramSkip) itr := tokenizer.Tokens(testLine) @@ -171,7 +171,7 @@ func BenchmarkPopulateSeriesWithBloom(b *testing.B) { Series: &series, } - err := bt.PopulateSeriesWithBloom(&swb, chunks) + err := bt.PopulateSeriesWithBloom(&swb, NewSliceIter([][]chunk.Chunk{chunks})) require.NoError(b, err) } } diff --git a/pkg/validation/limits.go b/pkg/validation/limits.go index d846cfed51b2e..45dd34f201e8d 100644 --- a/pkg/validation/limits.go +++ b/pkg/validation/limits.go @@ -188,6 +188,7 @@ type Limits struct { BloomCompactorShardSize int `yaml:"bloom_compactor_shard_size" json:"bloom_compactor_shard_size"` BloomCompactorMaxTableAge time.Duration `yaml:"bloom_compactor_max_table_age" json:"bloom_compactor_max_table_age"` BloomCompactorEnabled bool `yaml:"bloom_compactor_enable_compaction" json:"bloom_compactor_enable_compaction"` + BloomCompactorChunksBatchSize int `yaml:"bloom_compactor_chunks_batch_size" json:"bloom_compactor_chunks_batch_size"` BloomNGramLength int `yaml:"bloom_ngram_length" json:"bloom_ngram_length"` BloomNGramSkip int `yaml:"bloom_ngram_skip" json:"bloom_ngram_skip"` BloomFalsePositiveRate float64 `yaml:"bloom_false_positive_rate" json:"bloom_false_positive_rate"` @@ -316,6 +317,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.BloomCompactorShardSize, "bloom-compactor.shard-size", 1, "The shard size defines how many bloom compactors should be used by a tenant when computing blooms. If it's set to 0, shuffle sharding is disabled.") f.DurationVar(&l.BloomCompactorMaxTableAge, "bloom-compactor.max-table-age", 7*24*time.Hour, "The maximum age of a table before it is compacted. Do not compact tables older than the the configured time. Default to 7 days. 
0s means no limit.") f.BoolVar(&l.BloomCompactorEnabled, "bloom-compactor.enable-compaction", false, "Whether to compact chunks into bloom filters.") + f.IntVar(&l.BloomCompactorChunksBatchSize, "bloom-compactor.chunks-batch-size", 100, "The batch size of the chunks the bloom-compactor downloads at once.") f.IntVar(&l.BloomNGramLength, "bloom-compactor.ngram-length", 4, "Length of the n-grams created when computing blooms from log lines.") f.IntVar(&l.BloomNGramSkip, "bloom-compactor.ngram-skip", 0, "Skip factor for the n-grams created when computing blooms from log lines.") f.Float64Var(&l.BloomFalsePositiveRate, "bloom-compactor.false-positive-rate", 0.01, "Scalable Bloom Filter desired false-positive rate.") @@ -838,6 +840,10 @@ func (o *Overrides) BloomGatewayEnabled(userID string) bool { return o.getOverridesForUser(userID).BloomGatewayEnabled } +func (o *Overrides) BloomCompactorChunksBatchSize(userID string) int { + return o.getOverridesForUser(userID).BloomCompactorChunksBatchSize +} + func (o *Overrides) BloomCompactorShardSize(userID string) int { return o.getOverridesForUser(userID).BloomCompactorShardSize } diff --git a/pkg/validation/limits_test.go b/pkg/validation/limits_test.go index 4e449e421c5a7..908531f9858f6 100644 --- a/pkg/validation/limits_test.go +++ b/pkg/validation/limits_test.go @@ -6,7 +6,6 @@ import ( "testing" "time" - "github.com/pkg/errors" "github.com/prometheus/common/model" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -290,7 +289,7 @@ query_timeout: 5m } } -func TestLimitsValidation(t *testing.T) { +func TestLimitsValidation_deletionMode(t *testing.T) { for _, tc := range []struct { mode string expected error @@ -300,7 +299,9 @@ func TestLimitsValidation(t *testing.T) { {mode: "filter-and-delete", expected: nil}, {mode: "something-else", expected: deletionmode.ErrUnknownMode}, } { - limits := Limits{DeletionMode: tc.mode} - require.True(t, errors.Is(limits.Validate(), tc.expected)) + t.Run(tc.mode, func(t *testing.T) { + limits := Limits{DeletionMode: tc.mode} + require.ErrorIs(t, limits.Validate(), tc.expected) + }) } } From edba360e439b7ebe7a2e293383de53290706790a Mon Sep 17 00:00:00 2001 From: Alexey Solodkiy Date: Fri, 12 Jan 2024 22:14:57 +0300 Subject: [PATCH 20/21] Update query_examples.md (#11670) Co-authored-by: J Stickler --- docs/sources/query/query_examples.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sources/query/query_examples.md b/docs/sources/query/query_examples.md index 7eb4145acf745..1298a36f4a915 100644 --- a/docs/sources/query/query_examples.md +++ b/docs/sources/query/query_examples.md @@ -50,7 +50,7 @@ These LogQL query examples have explanations of what the queries accomplish. 
!= "grafana_com" |= "session opened" != "sudo: " - |regexp "(^(?P\\S+ {1,2}){11})" + | regexp "(^(?P\\S+ {1,2}){11})" | line_format "USER = {{.user}}" ``` From 0065fd6e95fc7531abf3d3d8aab33ec0f8aeea8f Mon Sep 17 00:00:00 2001 From: Periklis Tsirakidis Date: Fri, 12 Jan 2024 20:18:30 +0100 Subject: [PATCH 21/21] operator: Refactor CreateOrUpdateLokiStack handler (#11592) Co-authored-by: Robert Jacob --- .../handlers/internal/gateway/base_domain.go | 5 +- .../handlers/internal/gateway/gateway.go | 87 ++ .../handlers/internal/gateway/gateway_test.go | 390 ++++++++ .../handlers/internal/gateway/modes.go | 3 +- .../handlers/internal/gateway/modes_test.go | 38 +- .../internal/gateway/tenant_configsecret.go | 8 +- .../gateway/tenant_configsecret_test.go | 14 +- .../internal/gateway/tenant_secrets.go | 12 +- .../internal/gateway/tenant_secrets_test.go | 10 +- .../handlers/internal/rules/cleanup.go | 39 +- .../handlers/internal/rules/cleanup_test.go | 223 +++++ .../handlers/internal/rules/config.go | 7 +- .../internal/handlers/internal/rules/rules.go | 104 +- .../handlers/internal/rules/rules_test.go | 251 ++++- .../handlers/internal/storage/ca_configmap.go | 33 +- .../internal/storage/ca_configmap_test.go | 8 +- .../handlers/internal/storage/secrets.go | 34 +- .../handlers/internal/storage/secrets_test.go | 10 +- .../handlers/internal/storage/storage.go | 91 ++ .../handlers/internal/storage/storage_test.go | 477 +++++++++ .../handlers/lokistack_create_or_update.go | 231 +---- .../lokistack_create_or_update_test.go | 914 +----------------- 22 files changed, 1781 insertions(+), 1208 deletions(-) create mode 100644 operator/internal/handlers/internal/gateway/gateway.go create mode 100644 operator/internal/handlers/internal/gateway/gateway_test.go create mode 100644 operator/internal/handlers/internal/rules/cleanup_test.go create mode 100644 operator/internal/handlers/internal/storage/storage.go create mode 100644 operator/internal/handlers/internal/storage/storage_test.go diff --git a/operator/internal/handlers/internal/gateway/base_domain.go b/operator/internal/handlers/internal/gateway/base_domain.go index 5bdea31658d1d..893659ca5d29b 100644 --- a/operator/internal/handlers/internal/gateway/base_domain.go +++ b/operator/internal/handlers/internal/gateway/base_domain.go @@ -6,7 +6,6 @@ import ( "github.com/ViaQ/logerr/v2/kverrors" configv1 "github.com/openshift/api/config/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" @@ -14,11 +13,11 @@ import ( "github.com/grafana/loki/operator/internal/status" ) -// GetOpenShiftBaseDomain returns the cluster DNS base domain on OpenShift +// getOpenShiftBaseDomain returns the cluster DNS base domain on OpenShift // clusters to auto-create redirect URLs for OpenShift Auth or an error. // If the config.openshift.io/DNS object is not found the whole lokistack // resoure is set to a degraded state. 
-func GetOpenShiftBaseDomain(ctx context.Context, k k8s.Client, req ctrl.Request) (string, error) { +func getOpenShiftBaseDomain(ctx context.Context, k k8s.Client) (string, error) { var cluster configv1.DNS key := client.ObjectKey{Name: "cluster"} if err := k.Get(ctx, key, &cluster); err != nil { diff --git a/operator/internal/handlers/internal/gateway/gateway.go b/operator/internal/handlers/internal/gateway/gateway.go new file mode 100644 index 0000000000000..0b05801f2e9aa --- /dev/null +++ b/operator/internal/handlers/internal/gateway/gateway.go @@ -0,0 +1,87 @@ +package gateway + +import ( + "context" + "fmt" + + "github.com/go-logr/logr" + + configv1 "github.com/grafana/loki/operator/apis/config/v1" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/handlers/internal/openshift" + "github.com/grafana/loki/operator/internal/manifests" + "github.com/grafana/loki/operator/internal/status" +) + +// BuildOptions returns the options needed to generate Kubernetes resource +// manifests for the lokistack-gateway. +// The returned error can be a status.DegradedError in the following cases: +// - The tenants spec is missing. +// - The tenants spec is invalid. +func BuildOptions(ctx context.Context, log logr.Logger, k k8s.Client, stack *lokiv1.LokiStack, fg configv1.FeatureGates) (string, manifests.Tenants, error) { + var ( + err error + baseDomain string + secrets []*manifests.TenantSecrets + configs map[string]manifests.TenantConfig + tenants manifests.Tenants + ) + + if !fg.LokiStackGateway { + return "", tenants, nil + } + + if stack.Spec.Tenants == nil { + return "", tenants, &status.DegradedError{ + Message: "Invalid tenants configuration: TenantsSpec cannot be nil when gateway flag is enabled", + Reason: lokiv1.ReasonInvalidTenantsConfiguration, + Requeue: false, + } + } + + if err = validateModes(stack); err != nil { + return "", tenants, &status.DegradedError{ + Message: fmt.Sprintf("Invalid tenants configuration: %s", err), + Reason: lokiv1.ReasonInvalidTenantsConfiguration, + Requeue: false, + } + } + + switch stack.Spec.Tenants.Mode { + case lokiv1.OpenshiftLogging, lokiv1.OpenshiftNetwork: + baseDomain, err = getOpenShiftBaseDomain(ctx, k) + if err != nil { + return "", tenants, err + } + + if stack.Spec.Proxy == nil { + // If the LokiStack has no proxy set but there is a cluster-wide proxy setting, + // set the LokiStack proxy to that. + ocpProxy, proxyErr := openshift.GetProxy(ctx, k) + if proxyErr != nil { + return "", tenants, proxyErr + } + + stack.Spec.Proxy = ocpProxy + } + default: + secrets, err = getTenantSecrets(ctx, k, stack) + if err != nil { + return "", tenants, err + } + } + + // extract the existing tenant's id, cookieSecret if exists, otherwise create new. 
+ configs, err = getTenantConfigFromSecret(ctx, k, stack) + if err != nil { + log.Error(err, "error in getting tenant secret data") + } + + tenants = manifests.Tenants{ + Secrets: secrets, + Configs: configs, + } + + return baseDomain, tenants, nil +} diff --git a/operator/internal/handlers/internal/gateway/gateway_test.go b/operator/internal/handlers/internal/gateway/gateway_test.go new file mode 100644 index 0000000000000..2c8f846f55825 --- /dev/null +++ b/operator/internal/handlers/internal/gateway/gateway_test.go @@ -0,0 +1,390 @@ +package gateway + +import ( + "context" + "io" + "testing" + + "github.com/ViaQ/logerr/v2/log" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + configv1 "github.com/grafana/loki/operator/apis/config/v1" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" + "github.com/grafana/loki/operator/internal/status" +) + +var ( + logger = log.NewLogger("testing", log.WithOutput(io.Discard)) + + defaultSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "endpoint": []byte("s3://your-endpoint"), + "region": []byte("a-region"), + "bucketnames": []byte("bucket1,bucket2"), + "access_key_id": []byte("a-secret-id"), + "access_key_secret": []byte("a-secret-key"), + }, + } + + defaultGatewaySecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-gateway-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "clientID": []byte("client-123"), + "clientSecret": []byte("client-secret-xyz"), + "issuerCAPath": []byte("/tmp/test/ca.pem"), + }, + } + + invalidSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{}, + } +) + +func TestBuildOptions_WhenInvalidTenantsConfiguration_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid tenants configuration: mandatory configuration - missing OPA Url", + Reason: lokiv1.ReasonInvalidTenantsConfiguration, + Requeue: false, + } + + fg := configv1.FeatureGates{ + LokiStackGateway: true, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: nil, + }, + }, + } + + k.GetStub = func(_ 
context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, err := BuildOptions(context.TODO(), logger, k, stack, fg) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenMissingGatewaySecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing secrets for tenant test", + Reason: lokiv1.ReasonMissingGatewayTenantSecret, + Requeue: true, + } + + fg := configv1.FeatureGates{ + LokiStackGateway: true, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + o, ok := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && ok { + k.SetClientObject(o, stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, err := BuildOptions(context.TODO(), logger, k, stack, fg) + + // make sure error is returned to re-trigger reconciliation + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenInvalidGatewaySecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid gateway tenant secret contents", + Reason: lokiv1.ReasonInvalidGatewayTenantSecret, + Requeue: true, + } + + fg := configv1.FeatureGates{ + LokiStackGateway: true, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + 
Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: invalidSecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + o, ok := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && ok { + k.SetClientObject(o, stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + if name.Name == invalidSecret.Name { + k.SetClientObject(object, &invalidSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, err := BuildOptions(context.TODO(), logger, k, stack, fg) + + // make sure error is returned to re-trigger reconciliation + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_MissingTenantsSpec_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid tenants configuration: TenantsSpec cannot be nil when gateway flag is enabled", + Reason: lokiv1.ReasonInvalidTenantsConfiguration, + Requeue: false, + } + + fg := configv1.FeatureGates{ + LokiStackGateway: true, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Tenants: nil, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + o, ok := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && ok { + k.SetClientObject(o, stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, err := BuildOptions(context.TODO(), logger, k, stack, fg) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} diff --git a/operator/internal/handlers/internal/gateway/modes.go b/operator/internal/handlers/internal/gateway/modes.go index fd6bf5fae3515..8fd9855b352dc 100644 --- a/operator/internal/handlers/internal/gateway/modes.go +++ 
b/operator/internal/handlers/internal/gateway/modes.go @@ -6,8 +6,7 @@ import ( lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" ) -// ValidateModes validates the tenants mode specification. -func ValidateModes(stack lokiv1.LokiStack) error { +func validateModes(stack *lokiv1.LokiStack) error { if stack.Spec.Tenants.Mode == lokiv1.Static { if stack.Spec.Tenants.Authentication == nil { return kverrors.New("mandatory configuration - missing tenants' authentication configuration") diff --git a/operator/internal/handlers/internal/gateway/modes_test.go b/operator/internal/handlers/internal/gateway/modes_test.go index f54d348f6b25f..f7899c1eae85c 100644 --- a/operator/internal/handlers/internal/gateway/modes_test.go +++ b/operator/internal/handlers/internal/gateway/modes_test.go @@ -13,13 +13,13 @@ func TestValidateModes_StaticMode(t *testing.T) { type test struct { name string wantErr string - stack lokiv1.LokiStack + stack *lokiv1.LokiStack } table := []test{ { name: "missing authentication spec", wantErr: "mandatory configuration - missing tenants' authentication configuration", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -39,7 +39,7 @@ func TestValidateModes_StaticMode(t *testing.T) { { name: "missing roles spec", wantErr: "mandatory configuration - missing roles configuration", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -74,7 +74,7 @@ func TestValidateModes_StaticMode(t *testing.T) { { name: "missing role bindings spec", wantErr: "mandatory configuration - missing role bindings configuration", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -117,7 +117,7 @@ func TestValidateModes_StaticMode(t *testing.T) { { name: "incompatible OPA URL provided", wantErr: "incompatible configuration - OPA URL not required for mode static", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -174,7 +174,7 @@ func TestValidateModes_StaticMode(t *testing.T) { { name: "all set", wantErr: "", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -231,7 +231,7 @@ func TestValidateModes_StaticMode(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - err := ValidateModes(tst.stack) + err := validateModes(tst.stack) if tst.wantErr != "" { require.EqualError(t, err, tst.wantErr) } @@ -243,13 +243,13 @@ func TestValidateModes_DynamicMode(t *testing.T) { type test struct { name string wantErr string - stack lokiv1.LokiStack + stack *lokiv1.LokiStack } table := []test{ { name: "missing authentication spec", wantErr: "mandatory configuration - missing tenants configuration", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -269,7 +269,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { { name: "missing OPA URL spec", wantErr: "mandatory configuration - missing OPA Url", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -304,7 +304,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { { name: "incompatible roles configuration provided", wantErr: "incompatible configuration - static roles not required for mode dynamic", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -349,7 +349,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { { 
name: "incompatible roleBindings configuration provided", wantErr: "incompatible configuration - static roleBindings not required for mode dynamic", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -398,7 +398,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { { name: "all set", wantErr: "", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -438,7 +438,7 @@ func TestValidateModes_DynamicMode(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - err := ValidateModes(tst.stack) + err := validateModes(tst.stack) if tst.wantErr != "" { require.EqualError(t, err, tst.wantErr) } @@ -450,13 +450,13 @@ func TestValidateModes_OpenshiftLoggingMode(t *testing.T) { type test struct { name string wantErr string - stack lokiv1.LokiStack + stack *lokiv1.LokiStack } table := []test{ { name: "incompatible authentication spec provided", wantErr: "incompatible configuration - custom tenants configuration not required", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -488,7 +488,7 @@ func TestValidateModes_OpenshiftLoggingMode(t *testing.T) { { name: "incompatible authorization spec provided", wantErr: "incompatible configuration - custom tenants configuration not required", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -514,7 +514,7 @@ func TestValidateModes_OpenshiftLoggingMode(t *testing.T) { { name: "all set", wantErr: "", - stack: lokiv1.LokiStack{ + stack: &lokiv1.LokiStack{ TypeMeta: metav1.TypeMeta{ Kind: "LokiStack", }, @@ -537,7 +537,7 @@ func TestValidateModes_OpenshiftLoggingMode(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - err := ValidateModes(tst.stack) + err := validateModes(tst.stack) if tst.wantErr != "" { require.EqualError(t, err, tst.wantErr) } diff --git a/operator/internal/handlers/internal/gateway/tenant_configsecret.go b/operator/internal/handlers/internal/gateway/tenant_configsecret.go index c5b06c9c5c87b..f4e6c493bc069 100644 --- a/operator/internal/handlers/internal/gateway/tenant_configsecret.go +++ b/operator/internal/handlers/internal/gateway/tenant_configsecret.go @@ -6,10 +6,10 @@ import ( "github.com/ViaQ/logerr/v2/kverrors" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/json" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/yaml" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/manifests" ) @@ -35,11 +35,11 @@ type openShiftSpec struct { CookieSecret string `json:"cookieSecret"` } -// GetTenantConfigSecretData returns the tenantName, tenantId, cookieSecret +// getTenantConfigFromSecret returns the tenantName, tenantId, cookieSecret // clusters to auto-create redirect URLs for OpenShift Auth or an error. 
-func GetTenantConfigSecretData(ctx context.Context, k k8s.Client, req ctrl.Request) (map[string]manifests.TenantConfig, error) { +func getTenantConfigFromSecret(ctx context.Context, k k8s.Client, stack *lokiv1.LokiStack) (map[string]manifests.TenantConfig, error) { var tenantSecret corev1.Secret - key := client.ObjectKey{Name: manifests.GatewayName(req.Name), Namespace: req.Namespace} + key := client.ObjectKey{Name: manifests.GatewayName(stack.Name), Namespace: stack.Namespace} if err := k.Get(ctx, key, &tenantSecret); err != nil { return nil, kverrors.Wrap(err, "couldn't find tenant secret.") } diff --git a/operator/internal/handlers/internal/gateway/tenant_configsecret_test.go b/operator/internal/handlers/internal/gateway/tenant_configsecret_test.go index f0035a89a16ff..15e85a2295465 100644 --- a/operator/internal/handlers/internal/gateway/tenant_configsecret_test.go +++ b/operator/internal/handlers/internal/gateway/tenant_configsecret_test.go @@ -10,9 +10,9 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" "github.com/grafana/loki/operator/internal/manifests" ) @@ -38,8 +38,8 @@ tenants: func TestGetTenantConfigSecretData_SecretExist(t *testing.T) { k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ + s := &lokiv1.LokiStack{ + ObjectMeta: metav1.ObjectMeta{ Name: "lokistack-dev", Namespace: "some-ns", }, @@ -60,7 +60,7 @@ func TestGetTenantConfigSecretData_SecretExist(t *testing.T) { return nil } - ts, err := GetTenantConfigSecretData(context.TODO(), k, r) + ts, err := getTenantConfigFromSecret(context.TODO(), k, s) require.NotNil(t, ts) require.NoError(t, err) @@ -86,8 +86,8 @@ func TestGetTenantConfigSecretData_SecretExist(t *testing.T) { func TestGetTenantConfigSecretData_SecretNotExist(t *testing.T) { k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ + s := &lokiv1.LokiStack{ + ObjectMeta: metav1.ObjectMeta{ Name: "lokistack-dev", Namespace: "some-ns", }, @@ -97,7 +97,7 @@ func TestGetTenantConfigSecretData_SecretNotExist(t *testing.T) { return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") } - ts, err := GetTenantConfigSecretData(context.TODO(), k, r) + ts, err := getTenantConfigFromSecret(context.TODO(), k, s) require.Nil(t, ts) require.Error(t, err) } diff --git a/operator/internal/handlers/internal/gateway/tenant_secrets.go b/operator/internal/handlers/internal/gateway/tenant_secrets.go index fd2775dfa06ac..6cc39ae05e254 100644 --- a/operator/internal/handlers/internal/gateway/tenant_secrets.go +++ b/operator/internal/handlers/internal/gateway/tenant_secrets.go @@ -7,7 +7,6 @@ import ( "github.com/ViaQ/logerr/v2/kverrors" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" @@ -16,14 +15,13 @@ import ( "github.com/grafana/loki/operator/internal/status" ) -// GetTenantSecrets returns the list to gateway tenant secrets for a tenant mode. +// getTenantSecrets returns the list to gateway tenant secrets for a tenant mode. // For modes static and dynamic the secrets are fetched from external provided // secrets. 
For modes openshift-logging and openshift-network a secret per default tenants are created. // All secrets live in the same namespace as the lokistack request. -func GetTenantSecrets( +func getTenantSecrets( ctx context.Context, k k8s.Client, - req ctrl.Request, stack *lokiv1.LokiStack, ) ([]*manifests.TenantSecrets, error) { var ( @@ -34,7 +32,7 @@ func GetTenantSecrets( for _, tenant := range stack.Spec.Tenants.Authentication { switch { case tenant.OIDC != nil: - key := client.ObjectKey{Name: tenant.OIDC.Secret.Name, Namespace: req.Namespace} + key := client.ObjectKey{Name: tenant.OIDC.Secret.Name, Namespace: stack.Namespace} if err := k.Get(ctx, key, &gatewaySecret); err != nil { if apierrors.IsNotFound(err) { return nil, &status.DegradedError{ @@ -60,7 +58,7 @@ func GetTenantSecrets( OIDCSecret: oidcSecret, } if tenant.OIDC.IssuerCA != nil { - caPath, err := extractCAPath(ctx, k, req.Namespace, tenant.TenantName, tenant.OIDC.IssuerCA) + caPath, err := extractCAPath(ctx, k, stack.Namespace, tenant.TenantName, tenant.OIDC.IssuerCA) if err != nil { return nil, err } @@ -68,7 +66,7 @@ func GetTenantSecrets( } tenantSecrets = append(tenantSecrets, tennantSecret) case tenant.MTLS != nil: - caPath, err := extractCAPath(ctx, k, req.Namespace, tenant.TenantName, tenant.MTLS.CA) + caPath, err := extractCAPath(ctx, k, stack.Namespace, tenant.TenantName, tenant.MTLS.CA) if err != nil { return nil, err } diff --git a/operator/internal/handlers/internal/gateway/tenant_secrets_test.go b/operator/internal/handlers/internal/gateway/tenant_secrets_test.go index d0292108d8290..d0ccc0962e0b9 100644 --- a/operator/internal/handlers/internal/gateway/tenant_secrets_test.go +++ b/operator/internal/handlers/internal/gateway/tenant_secrets_test.go @@ -9,7 +9,6 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" @@ -93,13 +92,6 @@ func TestGetTenantSecrets(t *testing.T) { } { t.Run(strings.Join([]string{string(mode), tc.name}, "_"), func(t *testing.T) { k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - s := &lokiv1.LokiStack{ ObjectMeta: metav1.ObjectMeta{ Name: "mystack", @@ -119,7 +111,7 @@ func TestGetTenantSecrets(t *testing.T) { } return nil } - ts, err := GetTenantSecrets(context.TODO(), k, r, s) + ts, err := getTenantSecrets(context.TODO(), k, s) require.NoError(t, err) require.ElementsMatch(t, ts, tc.expected) }) diff --git a/operator/internal/handlers/internal/rules/cleanup.go b/operator/internal/handlers/internal/rules/cleanup.go index 81805947efddf..abd5bacd5c032 100644 --- a/operator/internal/handlers/internal/rules/cleanup.go +++ b/operator/internal/handlers/internal/rules/cleanup.go @@ -4,25 +4,49 @@ import ( "context" "github.com/ViaQ/logerr/v2/kverrors" + "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + v1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/manifests" ) -// RemoveRulesConfigMap removes the rules configmaps if any exists. 
-func RemoveRulesConfigMap(ctx context.Context, req ctrl.Request, c client.Client) error { +// Cleanup removes the ruler component's statefulset and configmaps if available, or +// else it returns an error to retry the reconciliation loop. +func Cleanup(ctx context.Context, log logr.Logger, k k8s.Client, stack *v1.LokiStack) error { + if stack.Spec.Rules != nil && stack.Spec.Rules.Enabled { + return nil + } + + stackKey := client.ObjectKeyFromObject(stack) + + // Clean up ruler resources + if err := removeRulesConfigMap(ctx, k, stackKey); err != nil { + log.Error(err, "failed to remove rules ConfigMap") + return err + } + + if err := removeRuler(ctx, k, stackKey); err != nil { + log.Error(err, "failed to remove ruler StatefulSet") + return err + } + + return nil +} + +func removeRulesConfigMap(ctx context.Context, c client.Client, key client.ObjectKey) error { var rulesCmList corev1.ConfigMapList err := c.List(ctx, &rulesCmList, &client.ListOptions{ - Namespace: req.Namespace, + Namespace: key.Namespace, LabelSelector: labels.SelectorFromSet(labels.Set{ "app.kubernetes.io/component": manifests.LabelRulerComponent, - "app.kubernetes.io/instance": req.Name, + "app.kubernetes.io/instance": key.Name, }), }) if err != nil { @@ -41,10 +65,9 @@ func RemoveRulesConfigMap(ctx context.Context, req ctrl.Request, c client.Client return nil } -// RemoveRuler removes the ruler statefulset if it exists. -func RemoveRuler(ctx context.Context, req ctrl.Request, c client.Client) error { +func removeRuler(ctx context.Context, c client.Client, stack client.ObjectKey) error { // Check if the Statefulset exists before proceeding. - key := client.ObjectKey{Name: manifests.RulerName(req.Name), Namespace: req.Namespace} + key := client.ObjectKey{Name: manifests.RulerName(stack.Name), Namespace: stack.Namespace} var ruler appsv1.StatefulSet if err := c.Get(ctx, key, &ruler); err != nil { diff --git a/operator/internal/handlers/internal/rules/cleanup_test.go b/operator/internal/handlers/internal/rules/cleanup_test.go new file mode 100644 index 0000000000000..5ecd9f69c345f --- /dev/null +++ b/operator/internal/handlers/internal/rules/cleanup_test.go @@ -0,0 +1,223 @@ +package rules + +import ( + "context" + "io" + "testing" + + "github.com/ViaQ/logerr/v2/log" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" +) + +var ( + logger = log.NewLogger("testing", log.WithOutput(io.Discard)) + + defaultSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "endpoint": []byte("s3://your-endpoint"), + "region": []byte("a-region"), + "bucketnames": []byte("bucket1,bucket2"), + "access_key_id": []byte("a-secret-id"), + "access_key_secret": []byte("a-secret-key"), + }, + } + + defaultGatewaySecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-gateway-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "clientID": []byte("client-123"), + "clientSecret": []byte("client-secret-xyz"), + "issuerCAPath": []byte("/tmp/test/ca.pem"), + }, + } + + 
rulesCM = corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack-rules-0", + Namespace: "some-ns", + }, + } + + rulerSS = appsv1.StatefulSet{ + TypeMeta: metav1.TypeMeta{ + Kind: "StatefulSet", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack-ruler", + Namespace: "some-ns", + }, + } +) + +func TestCleanup_RemovesRulerResourcesWhenDisabled(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + stack := lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Rules: &lokiv1.RulesSpec{ + Enabled: true, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { + _, ok := out.(*lokiv1.RulerConfig) + if ok { + return apierrors.NewNotFound(schema.GroupResource{}, "no ruler config") + } + + _, isLokiStack := out.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(out, &stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(out, &defaultSecret) + return nil + } + if defaultGatewaySecret.Name == name.Name { + k.SetClientObject(out, &defaultGatewaySecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") + } + + k.CreateStub = func(_ context.Context, o client.Object, _ ...client.CreateOption) error { + assert.Equal(t, r.Namespace, o.GetNamespace()) + return nil + } + + k.StatusStub = func() client.StatusWriter { return sw } + + k.DeleteStub = func(_ context.Context, o client.Object, _ ...client.DeleteOption) error { + assert.Equal(t, r.Namespace, o.GetNamespace()) + return nil + } + + k.ListStub = func(_ context.Context, list client.ObjectList, options ...client.ListOption) error { + switch list.(type) { + case *corev1.ConfigMapList: + k.SetClientObjectList(list, &corev1.ConfigMapList{ + Items: []corev1.ConfigMap{ + rulesCM, + }, + }) + } + return nil + } + + err := Cleanup(context.TODO(), logger, k, &stack) + require.NoError(t, err) + + // make sure delete not called + require.Zero(t, k.DeleteCallCount()) + + // Disable the ruler + stack.Spec.Rules.Enabled = false + + // Get should return ruler resources + k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { + _, ok := out.(*lokiv1.RulerConfig) + if ok { + return apierrors.NewNotFound(schema.GroupResource{}, "no ruler config") + } + if rulesCM.Name == name.Name { + k.SetClientObject(out, &rulesCM) + return nil + } + if 
rulerSS.Name == name.Name { + k.SetClientObject(out, &rulerSS) + return nil + } + + _, isLokiStack := out.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(out, &stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(out, &defaultSecret) + return nil + } + if defaultGatewaySecret.Name == name.Name { + k.SetClientObject(out, &defaultGatewaySecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") + } + + err = Cleanup(context.TODO(), logger, k, &stack) + require.NoError(t, err) + + // make sure delete was called twice (delete rules configmap and ruler statefulset) + require.Equal(t, 2, k.DeleteCallCount()) +} diff --git a/operator/internal/handlers/internal/rules/config.go b/operator/internal/handlers/internal/rules/config.go index f66b92ee06c11..ec4413fc49ecd 100644 --- a/operator/internal/handlers/internal/rules/config.go +++ b/operator/internal/handlers/internal/rules/config.go @@ -5,19 +5,16 @@ import ( "github.com/ViaQ/logerr/v2/kverrors" apierrors "k8s.io/apimachinery/pkg/api/errors" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s" ) -// GetRulerConfig returns the ruler config spec for a lokistack resource or an error. +// getRulerConfig returns the ruler config spec for a lokistack resource or an error. // If the config is not found, we skip without an error. -func GetRulerConfig(ctx context.Context, k k8s.Client, req ctrl.Request) (*lokiv1.RulerConfigSpec, error) { +func getRulerConfig(ctx context.Context, k k8s.Client, key client.ObjectKey) (*lokiv1.RulerConfigSpec, error) { var rc lokiv1.RulerConfig - - key := client.ObjectKey{Name: req.Name, Namespace: req.Namespace} if err := k.Get(ctx, key, &rc); err != nil { if apierrors.IsNotFound(err) { return nil, nil diff --git a/operator/internal/handlers/internal/rules/rules.go b/operator/internal/handlers/internal/rules/rules.go index ac4a6d78f0306..e21335e98c095 100644 --- a/operator/internal/handlers/internal/rules/rules.go +++ b/operator/internal/handlers/internal/rules/rules.go @@ -4,20 +4,114 @@ import ( "context" "github.com/ViaQ/logerr/v2/kverrors" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/handlers/internal/openshift" + "github.com/grafana/loki/operator/internal/manifests" + manifestsocp "github.com/grafana/loki/operator/internal/manifests/openshift" + "github.com/grafana/loki/operator/internal/status" ) -// List returns a slice of AlertingRules and a slice of RecordingRules for the given spec or an error. Three cases apply: -// - Return only matching rules in the stack namespace if no namespace selector given. -// - Return only matching rules in the stack namespace and in namespaces matching the namespace selector. -// - Return no rules if rules selector does not apply at all. -func List(ctx context.Context, k k8s.Client, stackNs string, rs *lokiv1.RulesSpec) ([]lokiv1.AlertingRule, []lokiv1.RecordingRule, error) { +// BuildOptions returns the ruler options needed to generate Kubernetes resource manifests. 
+// The returned error can be a status.DegradedError in the following cases: +// - When remote write is enabled and the authorization Secret is missing. +// - When remote write is enabled and the authorization Secret data is invalid. +func BuildOptions( + ctx context.Context, + log logr.Logger, + k k8s.Client, + stack *lokiv1.LokiStack, +) ([]lokiv1.AlertingRule, []lokiv1.RecordingRule, manifests.Ruler, manifestsocp.Options, error) { + if stack.Spec.Rules == nil || !stack.Spec.Rules.Enabled { + return nil, nil, manifests.Ruler{}, manifestsocp.Options{}, nil + } + + var ( + err error + alertingRules []lokiv1.AlertingRule + recordingRules []lokiv1.RecordingRule + rulerConfig *lokiv1.RulerConfigSpec + rulerSecret *manifests.RulerSecret + ruler manifests.Ruler + ocpOpts manifestsocp.Options + + stackKey = client.ObjectKeyFromObject(stack) + ) + + alertingRules, recordingRules, err = listRules(ctx, k, stack.Namespace, stack.Spec.Rules) + if err != nil { + log.Error(err, "failed to lookup rules", "spec", stack.Spec.Rules) + } + + rulerConfig, err = getRulerConfig(ctx, k, stackKey) + if err != nil { + log.Error(err, "failed to lookup ruler config", "key", stackKey) + } + + if rulerConfig != nil && rulerConfig.RemoteWriteSpec != nil && rulerConfig.RemoteWriteSpec.ClientSpec != nil { + var rs corev1.Secret + key := client.ObjectKey{Name: rulerConfig.RemoteWriteSpec.ClientSpec.AuthorizationSecretName, Namespace: stack.Namespace} + if err = k.Get(ctx, key, &rs); err != nil { + if apierrors.IsNotFound(err) { + return nil, nil, ruler, ocpOpts, &status.DegradedError{ + Message: "Missing ruler remote write authorization secret", + Reason: lokiv1.ReasonMissingRulerSecret, + Requeue: false, + } + } + return nil, nil, ruler, ocpOpts, kverrors.Wrap(err, "failed to lookup lokistack ruler secret", "name", key) + } + + rulerSecret, err = ExtractRulerSecret(&rs, rulerConfig.RemoteWriteSpec.ClientSpec.AuthorizationType) + if err != nil { + return nil, nil, ruler, ocpOpts, &status.DegradedError{ + Message: "Invalid ruler remote write authorization secret contents", + Reason: lokiv1.ReasonInvalidRulerSecret, + Requeue: false, + } + } + } + + ocpAmEnabled, err := openshift.AlertManagerSVCExists(ctx, stack.Spec, k) + if err != nil { + log.Error(err, "failed to check OCP AlertManager") + return nil, nil, ruler, ocpOpts, err + } + + ocpUWAmEnabled, err := openshift.UserWorkloadAlertManagerSVCExists(ctx, stack.Spec, k) + if err != nil { + log.Error(err, "failed to check OCP User Workload AlertManager") + return nil, nil, ruler, ocpOpts, err + } + + ruler = manifests.Ruler{ + Spec: rulerConfig, + Secret: rulerSecret, + } + + ocpOpts = manifestsocp.Options{ + BuildOpts: manifestsocp.BuildOptions{ + AlertManagerEnabled: ocpAmEnabled, + UserWorkloadAlertManagerEnabled: ocpUWAmEnabled, + }, + } + + return alertingRules, recordingRules, ruler, ocpOpts, nil +} + +// listRules returns a slice of AlertingRules and a slice of RecordingRules for the given spec or an error. +// Three cases apply: +// - Return only matching rules in the stack namespace if no namespace selector is given. +// - Return only matching rules in the stack namespace and in namespaces matching the namespace selector. +// - Return no rules if rules selector does not apply at all. 
+func listRules(ctx context.Context, k k8s.Client, stackNs string, rs *lokiv1.RulesSpec) ([]lokiv1.AlertingRule, []lokiv1.RecordingRule, error) { nsl, err := selectRulesNamespaces(ctx, k, stackNs, rs) if err != nil { return nil, nil, err diff --git a/operator/internal/handlers/internal/rules/rules_test.go b/operator/internal/handlers/internal/rules/rules_test.go index 8bc52afb6a9a4..e33a2ac928a6a 100644 --- a/operator/internal/handlers/internal/rules/rules_test.go +++ b/operator/internal/handlers/internal/rules/rules_test.go @@ -1,4 +1,4 @@ -package rules_test +package rules import ( "context" @@ -11,13 +11,252 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" - "github.com/grafana/loki/operator/internal/handlers/internal/rules" + "github.com/grafana/loki/operator/internal/status" ) +func TestBuildOptions_WhenMissingRemoteWriteSecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + stack := lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Rules: &lokiv1.RulesSpec{ + Enabled: true, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + rulerCfg := &lokiv1.RulerConfig{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.RulerConfigSpec{ + RemoteWriteSpec: &lokiv1.RemoteWriteSpec{ + Enabled: true, + ClientSpec: &lokiv1.RemoteWriteClientSpec{ + AuthorizationType: lokiv1.BasicAuthorization, + AuthorizationSecretName: "test", + }, + }, + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing ruler remote write authorization secret", + Reason: lokiv1.ReasonMissingRulerSecret, + Requeue: false, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { + _, isRulerConfig := out.(*lokiv1.RulerConfig) + if r.Name == name.Name && r.Namespace == name.Namespace && isRulerConfig { + k.SetClientObject(out, rulerCfg) + return nil + } + + _, isLokiStack := out.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(out, &stack) + return nil + } + if defaultSecret.Name == name.Name { + k.SetClientObject(out, &defaultSecret) + return nil + } + if defaultGatewaySecret.Name == name.Name { + 
k.SetClientObject(out, &defaultGatewaySecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, _, _, err := BuildOptions(context.TODO(), logger, k, &stack) + + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenInvalidRemoteWriteSecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + stack := lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + Rules: &lokiv1.RulesSpec{ + Enabled: true, + }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "dynamic", + Authentication: []lokiv1.AuthenticationSpec{ + { + TenantName: "test", + TenantID: "1234", + OIDC: &lokiv1.OIDCSpec{ + Secret: &lokiv1.TenantSecretSpec{ + Name: defaultGatewaySecret.Name, + }, + }, + }, + }, + Authorization: &lokiv1.AuthorizationSpec{ + OPA: &lokiv1.OPASpec{ + URL: "some-url", + }, + }, + }, + }, + } + + rulerCfg := &lokiv1.RulerConfig{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.RulerConfigSpec{ + RemoteWriteSpec: &lokiv1.RemoteWriteSpec{ + Enabled: true, + ClientSpec: &lokiv1.RemoteWriteClientSpec{ + AuthorizationType: lokiv1.BasicAuthorization, + AuthorizationSecretName: "some-client-secret", + }, + }, + }, + } + + invalidSecret := corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-client-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{}, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid ruler remote write authorization secret contents", + Reason: lokiv1.ReasonInvalidRulerSecret, + Requeue: false, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error { + _, isRulerConfig := out.(*lokiv1.RulerConfig) + if r.Name == name.Name && r.Namespace == name.Namespace && isRulerConfig { + k.SetClientObject(out, rulerCfg) + return nil + } + + _, isLokiStack := out.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(out, &stack) + return nil + } + if invalidSecret.Name == name.Name { + k.SetClientObject(out, &invalidSecret) + return nil + } + if defaultGatewaySecret.Name == name.Name { + k.SetClientObject(out, &defaultGatewaySecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, _, _, _, err := BuildOptions(context.TODO(), logger, k, &stack) + + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + func TestList_AlertingRulesMatchSelector_WithDefaultStackNamespaceRules(t *testing.T) { const stackNs = "some-ns" @@ -83,7 +322,7 @@ func 
TestList_AlertingRulesMatchSelector_WithDefaultStackNamespaceRules(t *testi return nil } - rules, _, err := rules.List(context.TODO(), k, stackNs, rs) + rules, _, err := listRules(context.TODO(), k, stackNs, rs) require.NoError(t, err) require.NotEmpty(t, rules) @@ -185,7 +424,7 @@ func TestList_AlertingRulesMatchSelector_FilteredByNamespaceSelector(t *testing. return nil } - rules, _, err := rules.List(context.TODO(), k, stackNs, rs) + rules, _, err := listRules(context.TODO(), k, stackNs, rs) require.NoError(t, err) require.NotEmpty(t, rules) @@ -257,7 +496,7 @@ func TestList_RecordingRulesMatchSelector_WithDefaultStackNamespaceRules(t *test return nil } - _, rules, err := rules.List(context.TODO(), k, stackNs, rs) + _, rules, err := listRules(context.TODO(), k, stackNs, rs) require.NoError(t, err) require.NotEmpty(t, rules) @@ -358,7 +597,7 @@ func TestList_RecordingRulesMatchSelector_FilteredByNamespaceSelector(t *testing return nil } - _, rules, err := rules.List(context.TODO(), k, stackNs, rs) + _, rules, err := listRules(context.TODO(), k, stackNs, rs) require.NoError(t, err) require.NotEmpty(t, rules) diff --git a/operator/internal/handlers/internal/storage/ca_configmap.go b/operator/internal/handlers/internal/storage/ca_configmap.go index ce70591e55cfa..904e63373a207 100644 --- a/operator/internal/handlers/internal/storage/ca_configmap.go +++ b/operator/internal/handlers/internal/storage/ca_configmap.go @@ -1,10 +1,22 @@ package storage import ( + "context" "crypto/sha1" "fmt" + "github.com/ViaQ/logerr/v2/kverrors" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" + + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/status" +) + +const ( + defaultCAKey = "service-ca.crt" ) type caKeyError string @@ -13,9 +25,26 @@ func (e caKeyError) Error() string { return fmt.Sprintf("key not present or data empty: %s", string(e)) } -// CheckCAConfigMap checks if the given CA configMap has an non-empty entry for the key used as CA certificate. +func getCAConfigMap(ctx context.Context, k k8s.Client, stack *lokiv1.LokiStack, name string) (*corev1.ConfigMap, error) { + var cm corev1.ConfigMap + key := client.ObjectKey{Name: name, Namespace: stack.Namespace} + if err := k.Get(ctx, key, &cm); err != nil { + if apierrors.IsNotFound(err) { + return nil, &status.DegradedError{ + Message: "Missing object storage CA config map", + Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, + Requeue: false, + } + } + return nil, kverrors.Wrap(err, "failed to lookup lokistack object storage CA config map", "name", key) + } + + return &cm, nil +} + +// checkCAConfigMap checks if the given CA configMap has an non-empty entry for the key used as CA certificate. // If the key is present it will return a hash of the current key name and contents. 
-func CheckCAConfigMap(cm *corev1.ConfigMap, key string) (string, error) { +func checkCAConfigMap(cm *corev1.ConfigMap, key string) (string, error) { data := cm.Data[key] if data == "" { return "", caKeyError(key) diff --git a/operator/internal/handlers/internal/storage/ca_configmap_test.go b/operator/internal/handlers/internal/storage/ca_configmap_test.go index bd3d4d56a690a..33d5156defe95 100644 --- a/operator/internal/handlers/internal/storage/ca_configmap_test.go +++ b/operator/internal/handlers/internal/storage/ca_configmap_test.go @@ -1,15 +1,13 @@ -package storage_test +package storage import ( "testing" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" - - "github.com/grafana/loki/operator/internal/handlers/internal/storage" ) -func TestIsValidConfigMap(t *testing.T) { +func TestCheckValidConfigMap(t *testing.T) { type test struct { name string cm *corev1.ConfigMap @@ -47,7 +45,7 @@ func TestIsValidConfigMap(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - hash, err := storage.CheckCAConfigMap(tst.cm, "service-ca.crt") + hash, err := checkCAConfigMap(tst.cm, "service-ca.crt") require.Equal(t, tst.wantHash, hash) if tst.wantErrorMsg == "" { diff --git a/operator/internal/handlers/internal/storage/secrets.go b/operator/internal/handlers/internal/storage/secrets.go index 1341728e7cecf..0ef5f197a625e 100644 --- a/operator/internal/handlers/internal/storage/secrets.go +++ b/operator/internal/handlers/internal/storage/secrets.go @@ -1,24 +1,46 @@ package storage import ( + "context" "crypto/sha1" "fmt" "sort" "github.com/ViaQ/logerr/v2/kverrors" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/manifests/storage" + "github.com/grafana/loki/operator/internal/status" ) var hashSeparator = []byte(",") -// ExtractSecret reads a k8s secret into a manifest object storage struct if valid. -func ExtractSecret(s *corev1.Secret, secretType lokiv1.ObjectStorageSecretType) (*storage.Options, error) { +func getSecret(ctx context.Context, k k8s.Client, stack *lokiv1.LokiStack) (*corev1.Secret, error) { + var storageSecret corev1.Secret + key := client.ObjectKey{Name: stack.Spec.Storage.Secret.Name, Namespace: stack.Namespace} + if err := k.Get(ctx, key, &storageSecret); err != nil { + if apierrors.IsNotFound(err) { + return nil, &status.DegradedError{ + Message: "Missing object storage secret", + Reason: lokiv1.ReasonMissingObjectStorageSecret, + Requeue: false, + } + } + return nil, kverrors.Wrap(err, "failed to lookup lokistack storage secret", "name", key) + } + + return &storageSecret, nil +} + +// extractSecret reads a k8s secret into a manifest object storage struct if valid. 
+func extractSecret(s *corev1.Secret, secretType lokiv1.ObjectStorageSecretType) (storage.Options, error) { hash, err := hashSecretData(s) if err != nil { - return nil, kverrors.Wrap(err, "error calculating hash for secret", "type", secretType) + return storage.Options{}, kverrors.Wrap(err, "error calculating hash for secret", "type", secretType) } storageOpts := storage.Options{ @@ -39,13 +61,13 @@ func ExtractSecret(s *corev1.Secret, secretType lokiv1.ObjectStorageSecretType) case lokiv1.ObjectStorageSecretAlibabaCloud: storageOpts.AlibabaCloud, err = extractAlibabaCloudConfigSecret(s) default: - return nil, kverrors.New("unknown secret type", "type", secretType) + return storage.Options{}, kverrors.New("unknown secret type", "type", secretType) } if err != nil { - return nil, err + return storage.Options{}, err } - return &storageOpts, nil + return storageOpts, nil } func hashSecretData(s *corev1.Secret) (string, error) { diff --git a/operator/internal/handlers/internal/storage/secrets_test.go b/operator/internal/handlers/internal/storage/secrets_test.go index 46ddc133f9f42..c72c63ea1ee12 100644 --- a/operator/internal/handlers/internal/storage/secrets_test.go +++ b/operator/internal/handlers/internal/storage/secrets_test.go @@ -135,7 +135,7 @@ func TestAzureExtract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - opts, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretAzure) + opts, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretAzure) if !tst.wantErr { require.NoError(t, err) require.NotEmpty(t, opts.SecretName) @@ -186,7 +186,7 @@ func TestGCSExtract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - _, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretGCS) + _, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretGCS) if !tst.wantErr { require.NoError(t, err) } @@ -360,7 +360,7 @@ func TestS3Extract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - opts, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretS3) + opts, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretS3) if !tst.wantErr { require.NoError(t, err) require.NotEmpty(t, opts.SecretName) @@ -509,7 +509,7 @@ func TestSwiftExtract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - opts, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretSwift) + opts, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretSwift) if !tst.wantErr { require.NoError(t, err) require.NotEmpty(t, opts.SecretName) @@ -583,7 +583,7 @@ func TestAlibabaCloudExtract(t *testing.T) { t.Run(tst.name, func(t *testing.T) { t.Parallel() - opts, err := ExtractSecret(tst.secret, lokiv1.ObjectStorageSecretAlibabaCloud) + opts, err := extractSecret(tst.secret, lokiv1.ObjectStorageSecretAlibabaCloud) if !tst.wantErr { require.NoError(t, err) require.NotEmpty(t, opts.SecretName) diff --git a/operator/internal/handlers/internal/storage/storage.go b/operator/internal/handlers/internal/storage/storage.go new file mode 100644 index 0000000000000..e1657121ccd6d --- /dev/null +++ b/operator/internal/handlers/internal/storage/storage.go @@ -0,0 +1,91 @@ +package storage + +import ( + "context" + "fmt" + "time" + + configv1 "github.com/grafana/loki/operator/apis/config/v1" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s" + "github.com/grafana/loki/operator/internal/manifests/storage" + "github.com/grafana/loki/operator/internal/status" +) + +// BuildOptions returns the 
object storage options to generate Kubernetes resource manifests +// which require access to object storage buckets. +// The returned error can be a status.DegradedError in the following cases: +// - The user-provided object storage secret is missing. +// - The object storage Secret data is invalid. +// - The object storage schema config is invalid. +// - The object storage CA ConfigMap is missing if one referenced. +// - The object storage CA ConfigMap data is invalid. +func BuildOptions(ctx context.Context, k k8s.Client, stack *lokiv1.LokiStack, fg configv1.FeatureGates) (storage.Options, error) { + storageSecret, err := getSecret(ctx, k, stack) + if err != nil { + return storage.Options{}, err + } + + objStore, err := extractSecret(storageSecret, stack.Spec.Storage.Secret.Type) + if err != nil { + return storage.Options{}, &status.DegradedError{ + Message: fmt.Sprintf("Invalid object storage secret contents: %s", err), + Reason: lokiv1.ReasonInvalidObjectStorageSecret, + Requeue: false, + } + } + objStore.OpenShiftEnabled = fg.OpenShift.Enabled + + storageSchemas, err := storage.BuildSchemaConfig( + time.Now().UTC(), + stack.Spec.Storage, + stack.Status.Storage, + ) + if err != nil { + return storage.Options{}, &status.DegradedError{ + Message: fmt.Sprintf("Invalid object storage schema contents: %s", err), + Reason: lokiv1.ReasonInvalidObjectStorageSchema, + Requeue: false, + } + } + + objStore.Schemas = storageSchemas + + if stack.Spec.Storage.TLS == nil { + return objStore, nil + } + + tlsConfig := stack.Spec.Storage.TLS + if tlsConfig.CA == "" { + return storage.Options{}, &status.DegradedError{ + Message: "Missing object storage CA config map", + Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, + Requeue: false, + } + } + + cm, err := getCAConfigMap(ctx, k, stack, tlsConfig.CA) + if err != nil { + return storage.Options{}, err + } + + caKey := defaultCAKey + if tlsConfig.CAKey != "" { + caKey = tlsConfig.CAKey + } + + var caHash string + caHash, err = checkCAConfigMap(cm, caKey) + if err != nil { + return storage.Options{}, &status.DegradedError{ + Message: fmt.Sprintf("Invalid object storage CA configmap contents: %s", err), + Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, + Requeue: false, + } + } + + objStore.SecretSHA1 = fmt.Sprintf("%s;%s", objStore.SecretSHA1, caHash) + objStore.TLS = &storage.TLSConfig{CA: cm.Name, Key: caKey} + + return objStore, nil +} diff --git a/operator/internal/handlers/internal/storage/storage_test.go b/operator/internal/handlers/internal/storage/storage_test.go new file mode 100644 index 0000000000000..f56e446d6da8f --- /dev/null +++ b/operator/internal/handlers/internal/storage/storage_test.go @@ -0,0 +1,477 @@ +package storage + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + configv1 "github.com/grafana/loki/operator/apis/config/v1" + lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" + "github.com/grafana/loki/operator/internal/external/k8s/k8sfakes" + "github.com/grafana/loki/operator/internal/status" +) + +var ( + featureGates = configv1.FeatureGates{ + ServiceMonitors: false, + ServiceMonitorTLSEndpoints: false, + BuiltInCertManagement: configv1.BuiltInCertManagement{ + Enabled: true, + CACertValidity: "10m", + 
CACertRefresh: "5m", + CertValidity: "2m", + CertRefresh: "1m", + }, + } + + defaultSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{ + "endpoint": []byte("s3://your-endpoint"), + "region": []byte("a-region"), + "bucketnames": []byte("bucket1,bucket2"), + "access_key_id": []byte("a-secret-id"), + "access_key_secret": []byte("a-secret-key"), + }, + } + + invalidSecret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-secret", + Namespace: "some-ns", + }, + Data: map[string][]byte{}, + } + + invalidCAConfigMap = corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-stack-ca-configmap", + Namespace: "some-ns", + }, + Data: map[string]string{}, + } +) + +func TestBuildOptions_WhenMissingSecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing object storage secret", + Reason: lokiv1.ReasonMissingObjectStorageSecret, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenInvalidSecret_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid object storage secret contents: missing secret field", + Reason: lokiv1.ReasonInvalidObjectStorageSecret, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: invalidSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if 
r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + if name.Name == invalidSecret.Name { + k.SetClientObject(object, &invalidSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WithInvalidStorageSchema_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid object storage schema contents: spec does not contain any schemas", + Reason: lokiv1.ReasonInvalidObjectStorageSchema, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{}, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + }, + }, + Status: lokiv1.LokiStackStatus{ + Storage: lokiv1.LokiStackStorageStatus{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + { + Version: lokiv1.ObjectStorageSchemaV12, + EffectiveDate: "2021-10-11", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + if name.Name == defaultSecret.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenMissingCAConfigMap_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing object storage CA config map", + Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + TLS: &lokiv1.ObjectStorageTLSSpec{ + CASpec: lokiv1.CASpec{ + CA: "not-existing", 
+ }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + + if name.Name == defaultSecret.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenEmptyCAConfigMapName_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Missing object storage CA config map", + Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: lokiv1.ObjectStorageSecretS3, + }, + TLS: &lokiv1.ObjectStorageTLSSpec{ + CASpec: lokiv1.CASpec{ + CA: "", + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + + if name.Name == defaultSecret.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} + +func TestBuildOptions_WhenInvalidCAConfigMap_SetDegraded(t *testing.T) { + sw := &k8sfakes.FakeStatusWriter{} + k := &k8sfakes.FakeClient{} + r := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "my-stack", + Namespace: "some-ns", + }, + } + + degradedErr := &status.DegradedError{ + Message: "Invalid object storage CA configmap contents: key not present or data empty: service-ca.crt", + Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, + Requeue: false, + } + + stack := &lokiv1.LokiStack{ + TypeMeta: metav1.TypeMeta{ + Kind: "LokiStack", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-stack", + Namespace: "some-ns", + UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", + }, + Spec: lokiv1.LokiStackSpec{ + Size: lokiv1.SizeOneXExtraSmall, + Storage: lokiv1.ObjectStorageSpec{ + Schemas: []lokiv1.ObjectStorageSchema{ + { + Version: lokiv1.ObjectStorageSchemaV11, + EffectiveDate: "2020-10-11", + }, + }, + Secret: lokiv1.ObjectStorageSecretSpec{ + Name: defaultSecret.Name, + Type: 
lokiv1.ObjectStorageSecretS3, + }, + TLS: &lokiv1.ObjectStorageTLSSpec{ + CASpec: lokiv1.CASpec{ + CA: invalidCAConfigMap.Name, + }, + }, + }, + }, + } + + k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { + _, isLokiStack := object.(*lokiv1.LokiStack) + if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { + k.SetClientObject(object, stack) + return nil + } + if name.Name == defaultSecret.Name { + k.SetClientObject(object, &defaultSecret) + return nil + } + + if name.Name == invalidCAConfigMap.Name { + k.SetClientObject(object, &invalidCAConfigMap) + return nil + } + return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") + } + + k.StatusStub = func() client.StatusWriter { return sw } + + _, err := BuildOptions(context.TODO(), k, stack, featureGates) + + // make sure error is returned + require.Error(t, err) + require.Equal(t, degradedErr, err) +} diff --git a/operator/internal/handlers/lokistack_create_or_update.go b/operator/internal/handlers/lokistack_create_or_update.go index a6963f7574321..b64713f2d0fda 100644 --- a/operator/internal/handlers/lokistack_create_or_update.go +++ b/operator/internal/handlers/lokistack_create_or_update.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "os" - "time" "github.com/ViaQ/logerr/v2/kverrors" "github.com/go-logr/logr" @@ -20,22 +19,15 @@ import ( lokiv1 "github.com/grafana/loki/operator/apis/loki/v1" "github.com/grafana/loki/operator/internal/external/k8s" "github.com/grafana/loki/operator/internal/handlers/internal/gateway" - "github.com/grafana/loki/operator/internal/handlers/internal/openshift" "github.com/grafana/loki/operator/internal/handlers/internal/rules" "github.com/grafana/loki/operator/internal/handlers/internal/serviceaccounts" "github.com/grafana/loki/operator/internal/handlers/internal/storage" "github.com/grafana/loki/operator/internal/handlers/internal/tlsprofile" "github.com/grafana/loki/operator/internal/manifests" - manifests_openshift "github.com/grafana/loki/operator/internal/manifests/openshift" - storageoptions "github.com/grafana/loki/operator/internal/manifests/storage" "github.com/grafana/loki/operator/internal/metrics" "github.com/grafana/loki/operator/internal/status" ) -const ( - defaultCAKey = "service-ca.crt" -) - // CreateOrUpdateLokiStack handles LokiStack create and update events. 
func CreateOrUpdateLokiStack( ctx context.Context, @@ -67,205 +59,23 @@ func CreateOrUpdateLokiStack( gwImg = manifests.DefaultLokiStackGatewayImage } - var storageSecret corev1.Secret - key := client.ObjectKey{Name: stack.Spec.Storage.Secret.Name, Namespace: stack.Namespace} - if err := k.Get(ctx, key, &storageSecret); err != nil { - if apierrors.IsNotFound(err) { - return &status.DegradedError{ - Message: "Missing object storage secret", - Reason: lokiv1.ReasonMissingObjectStorageSecret, - Requeue: false, - } - } - return kverrors.Wrap(err, "failed to lookup lokistack storage secret", "name", key) - } - - objStore, err := storage.ExtractSecret(&storageSecret, stack.Spec.Storage.Secret.Type) + objStore, err := storage.BuildOptions(ctx, k, &stack, fg) if err != nil { - return &status.DegradedError{ - Message: fmt.Sprintf("Invalid object storage secret contents: %s", err), - Reason: lokiv1.ReasonInvalidObjectStorageSecret, - Requeue: false, - } + return err } - objStore.OpenShiftEnabled = fg.OpenShift.Enabled - storageSchemas, err := storageoptions.BuildSchemaConfig( - time.Now().UTC(), - stack.Spec.Storage, - stack.Status.Storage, - ) + baseDomain, tenants, err := gateway.BuildOptions(ctx, ll, k, &stack, fg) if err != nil { - return &status.DegradedError{ - Message: fmt.Sprintf("Invalid object storage schema contents: %s", err), - Reason: lokiv1.ReasonInvalidObjectStorageSchema, - Requeue: false, - } - } - - objStore.Schemas = storageSchemas - - if stack.Spec.Storage.TLS != nil { - tlsConfig := stack.Spec.Storage.TLS - - if tlsConfig.CA == "" { - return &status.DegradedError{ - Message: "Missing object storage CA config map", - Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, - Requeue: false, - } - } - - var cm corev1.ConfigMap - key := client.ObjectKey{Name: tlsConfig.CA, Namespace: stack.Namespace} - if err = k.Get(ctx, key, &cm); err != nil { - if apierrors.IsNotFound(err) { - return &status.DegradedError{ - Message: "Missing object storage CA config map", - Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, - Requeue: false, - } - } - return kverrors.Wrap(err, "failed to lookup lokistack object storage CA config map", "name", key) - } - - caKey := defaultCAKey - if tlsConfig.CAKey != "" { - caKey = tlsConfig.CAKey - } - - var caHash string - caHash, err = storage.CheckCAConfigMap(&cm, caKey) - if err != nil { - return &status.DegradedError{ - Message: fmt.Sprintf("Invalid object storage CA configmap contents: %s", err), - Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap, - Requeue: false, - } - } - - objStore.SecretSHA1 = fmt.Sprintf("%s;%s", objStore.SecretSHA1, caHash) - objStore.TLS = &storageoptions.TLSConfig{CA: cm.Name, Key: caKey} + return err } - var ( - baseDomain string - tenantSecrets []*manifests.TenantSecrets - tenantConfigs map[string]manifests.TenantConfig - ) - if fg.LokiStackGateway && stack.Spec.Tenants == nil { - return &status.DegradedError{ - Message: "Invalid tenants configuration - TenantsSpec cannot be nil when gateway flag is enabled", - Reason: lokiv1.ReasonInvalidTenantsConfiguration, - Requeue: false, - } - } else if fg.LokiStackGateway && stack.Spec.Tenants != nil { - if err = gateway.ValidateModes(stack); err != nil { - return &status.DegradedError{ - Message: fmt.Sprintf("Invalid tenants configuration: %s", err), - Reason: lokiv1.ReasonInvalidTenantsConfiguration, - Requeue: false, - } - } - - switch stack.Spec.Tenants.Mode { - case lokiv1.OpenshiftLogging, lokiv1.OpenshiftNetwork: - baseDomain, err = gateway.GetOpenShiftBaseDomain(ctx, k, 
req) - if err != nil { - return err - } - - if stack.Spec.Proxy == nil { - // If the LokiStack has no proxy set but there is a cluster-wide proxy setting, - // set the LokiStack proxy to that. - ocpProxy, proxyErr := openshift.GetProxy(ctx, k) - if proxyErr != nil { - return proxyErr - } - - stack.Spec.Proxy = ocpProxy - } - default: - tenantSecrets, err = gateway.GetTenantSecrets(ctx, k, req, &stack) - if err != nil { - return err - } - } - - // extract the existing tenant's id, cookieSecret if exists, otherwise create new. - tenantConfigs, err = gateway.GetTenantConfigSecretData(ctx, k, req) - if err != nil { - ll.Error(err, "error in getting tenant secret data") - } + if err = rules.Cleanup(ctx, ll, k, &stack); err != nil { + return err } - var ( - alertingRules []lokiv1.AlertingRule - recordingRules []lokiv1.RecordingRule - rulerConfig *lokiv1.RulerConfigSpec - rulerSecret *manifests.RulerSecret - ocpAmEnabled bool - ocpUWAmEnabled bool - ) - if stack.Spec.Rules != nil && stack.Spec.Rules.Enabled { - alertingRules, recordingRules, err = rules.List(ctx, k, req.Namespace, stack.Spec.Rules) - if err != nil { - ll.Error(err, "failed to lookup rules", "spec", stack.Spec.Rules) - } - - rulerConfig, err = rules.GetRulerConfig(ctx, k, req) - if err != nil { - ll.Error(err, "failed to lookup ruler config", "key", req.NamespacedName) - } - - if rulerConfig != nil && rulerConfig.RemoteWriteSpec != nil && rulerConfig.RemoteWriteSpec.ClientSpec != nil { - var rs corev1.Secret - key := client.ObjectKey{Name: rulerConfig.RemoteWriteSpec.ClientSpec.AuthorizationSecretName, Namespace: stack.Namespace} - if err = k.Get(ctx, key, &rs); err != nil { - if apierrors.IsNotFound(err) { - return &status.DegradedError{ - Message: "Missing ruler remote write authorization secret", - Reason: lokiv1.ReasonMissingRulerSecret, - Requeue: false, - } - } - return kverrors.Wrap(err, "failed to lookup lokistack ruler secret", "name", key) - } - - rulerSecret, err = rules.ExtractRulerSecret(&rs, rulerConfig.RemoteWriteSpec.ClientSpec.AuthorizationType) - if err != nil { - return &status.DegradedError{ - Message: "Invalid ruler remote write authorization secret contents", - Reason: lokiv1.ReasonInvalidRulerSecret, - Requeue: false, - } - } - } - - ocpAmEnabled, err = openshift.AlertManagerSVCExists(ctx, stack.Spec, k) - if err != nil { - ll.Error(err, "failed to check OCP AlertManager") - return err - } - - ocpUWAmEnabled, err = openshift.UserWorkloadAlertManagerSVCExists(ctx, stack.Spec, k) - if err != nil { - ll.Error(err, "failed to check OCP User Workload AlertManager") - return err - } - } else { - // Clean up ruler resources - err = rules.RemoveRulesConfigMap(ctx, req, k) - if err != nil { - ll.Error(err, "failed to remove rules ConfigMap") - return err - } - - err = rules.RemoveRuler(ctx, req, k) - if err != nil { - ll.Error(err, "failed to remove ruler StatefulSet") - return err - } + alertingRules, recordingRules, ruler, ocpOptions, err := rules.BuildOptions(ctx, ll, k, &stack) + if err != nil { + return err } certRotationRequiredAt := "" @@ -292,25 +102,14 @@ func CreateOrUpdateLokiStack( GatewayBaseDomain: baseDomain, Stack: stack.Spec, Gates: fg, - ObjectStorage: *objStore, + ObjectStorage: objStore, CertRotationRequiredAt: certRotationRequiredAt, AlertingRules: alertingRules, RecordingRules: recordingRules, - Ruler: manifests.Ruler{ - Spec: rulerConfig, - Secret: rulerSecret, - }, - Timeouts: timeoutConfig, - Tenants: manifests.Tenants{ - Secrets: tenantSecrets, - Configs: tenantConfigs, - }, - 
OpenShiftOptions: manifests_openshift.Options{ - BuildOpts: manifests_openshift.BuildOptions{ - AlertManagerEnabled: ocpAmEnabled, - UserWorkloadAlertManagerEnabled: ocpUWAmEnabled, - }, - }, + Ruler: ruler, + Timeouts: timeoutConfig, + Tenants: tenants, + OpenShiftOptions: ocpOptions, } ll.Info("begin building manifests") @@ -357,7 +156,7 @@ func CreateOrUpdateLokiStack( // updated and another resource is not. This would cause the status to // be possibly misaligned with the configmap, which could lead to // a user possibly being unable to read logs. - if err := status.SetStorageSchemaStatus(ctx, k, req, storageSchemas); err != nil { + if err := status.SetStorageSchemaStatus(ctx, k, req, objStore.Schemas); err != nil { ll.Error(err, "failed to set storage schema status") return err } diff --git a/operator/internal/handlers/lokistack_create_or_update_test.go b/operator/internal/handlers/lokistack_create_or_update_test.go index b2158fe4d2ba2..ad80f45b817a1 100644 --- a/operator/internal/handlers/lokistack_create_or_update_test.go +++ b/operator/internal/handlers/lokistack_create_or_update_test.go @@ -13,7 +13,6 @@ import ( routev1 "github.com/openshift/api/route/v1" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -73,42 +72,6 @@ var ( "issuerCAPath": []byte("/tmp/test/ca.pem"), }, } - - rulesCM = corev1.ConfigMap{ - TypeMeta: metav1.TypeMeta{ - Kind: "ConfigMap", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack-rules-0", - Namespace: "some-ns", - }, - } - - rulerSS = appsv1.StatefulSet{ - TypeMeta: metav1.TypeMeta{ - Kind: "StatefulSet", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack-ruler", - Namespace: "some-ns", - }, - } - - invalidSecret = corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: "some-stack-secret", - Namespace: "some-ns", - }, - Data: map[string][]byte{}, - } - - invalidCAConfigMap = corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "some-stack-ca-configmap", - Namespace: "some-ns", - }, - Data: map[string]string{}, - } ) func TestMain(m *testing.M) { @@ -573,8 +536,6 @@ func TestCreateOrUpdateLokiStack_WhenCreateReturnsError_ContinueWithOtherObjects }, } - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { _, isLokiStack := object.(*lokiv1.LokiStack) if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { @@ -681,8 +642,6 @@ func TestCreateOrUpdateLokiStack_WhenUpdateReturnsError_ContinueWithOtherObjects }, } - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { _, isLokiStack := object.(*lokiv1.LokiStack) if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { @@ -710,69 +669,7 @@ func TestCreateOrUpdateLokiStack_WhenUpdateReturnsError_ContinueWithOtherObjects require.Error(t, err) } -func TestCreateOrUpdateLokiStack_WhenMissingSecret_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Missing object storage secret", - Reason: lokiv1.ReasonMissingObjectStorageSecret, - Requeue: false, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. - k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(object, stack) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WhenInvalidSecret_SetDegraded(t *testing.T) { +func TestCreateOrUpdateLokiStack_WhenInvalidQueryTimeout_SetDegraded(t *testing.T) { sw := &k8sfakes.FakeStatusWriter{} k := &k8sfakes.FakeClient{} r := ctrl.Request{ @@ -783,8 +680,8 @@ func TestCreateOrUpdateLokiStack_WhenInvalidSecret_SetDegraded(t *testing.T) { } degradedErr := &status.DegradedError{ - Message: "Invalid object storage secret contents: missing secret field", - Reason: lokiv1.ReasonInvalidObjectStorageSecret, + Message: `Error parsing query timeout: time: invalid duration "invalid"`, + Reason: lokiv1.ReasonQueryTimeoutInvalid, Requeue: false, } @@ -802,179 +699,37 @@ func TestCreateOrUpdateLokiStack_WhenInvalidSecret_SetDegraded(t *testing.T) { Storage: lokiv1.ObjectStorageSpec{ Schemas: []lokiv1.ObjectStorageSchema{ { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", + Version: lokiv1.ObjectStorageSchemaV12, + EffectiveDate: "2023-05-22", }, }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: invalidSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(object, stack) - return nil - } - if name.Name == invalidSecret.Name { - k.SetClientObject(object, &invalidSecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WithInvalidStorageSchema_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Invalid object storage schema contents: spec does not contain any schemas", - Reason: lokiv1.ReasonInvalidObjectStorageSchema, - Requeue: false, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{}, Secret: lokiv1.ObjectStorageSecretSpec{ Name: defaultSecret.Name, Type: lokiv1.ObjectStorageSecretS3, }, }, - }, - Status: lokiv1.LokiStackStatus{ - Storage: lokiv1.LokiStackStorageStatus{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - { - Version: lokiv1.ObjectStorageSchemaV12, - EffectiveDate: "2021-10-11", - }, - }, + Tenants: &lokiv1.TenantsSpec{ + Mode: "openshift", }, - }, - } - - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
- k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error { - _, isLokiStack := object.(*lokiv1.LokiStack) - if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack { - k.SetClientObject(object, stack) - return nil - } - if name.Name == defaultSecret.Name { - k.SetClientObject(object, &defaultSecret) - return nil - } - return apierrors.NewNotFound(schema.GroupResource{}, "something is not found") - } - - k.StatusStub = func() client.StatusWriter { return sw } - - err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates) - - // make sure error is returned - require.Error(t, err) - require.Equal(t, degradedErr, err) -} - -func TestCreateOrUpdateLokiStack_WhenMissingCAConfigMap_SetDegraded(t *testing.T) { - sw := &k8sfakes.FakeStatusWriter{} - k := &k8sfakes.FakeClient{} - r := ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: "my-stack", - Namespace: "some-ns", - }, - } - - degradedErr := &status.DegradedError{ - Message: "Missing object storage CA config map", - Reason: lokiv1.ReasonMissingObjectStorageCAConfigMap, - Requeue: false, - } - - stack := &lokiv1.LokiStack{ - TypeMeta: metav1.TypeMeta{ - Kind: "LokiStack", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "my-stack", - Namespace: "some-ns", - UID: "b23f9a38-9672-499f-8c29-15ede74d3ece", - }, - Spec: lokiv1.LokiStackSpec{ - Size: lokiv1.SizeOneXExtraSmall, - Storage: lokiv1.ObjectStorageSpec{ - Schemas: []lokiv1.ObjectStorageSchema{ - { - Version: lokiv1.ObjectStorageSchemaV11, - EffectiveDate: "2020-10-11", - }, - }, - Secret: lokiv1.ObjectStorageSecretSpec{ - Name: defaultSecret.Name, - Type: lokiv1.ObjectStorageSecretS3, - }, - TLS: &lokiv1.ObjectStorageTLSSpec{ - CASpec: lokiv1.CASpec{ - CA: "not-existing", + Limits: &lokiv1.LimitsSpec{ + Global: &lokiv1.LimitsTemplateSpec{ + QueryLimits: &lokiv1.QueryLimitSpec{ + QueryTimeout: "invalid", }, }, }, }, } - // GetStub looks up the CR first, so we need to return our fake stack - // return NotFound for everything else to trigger create. 
+	// Create looks up the CR first, so we need to return our fake stack
 	k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error {
-		_, isLokiStack := object.(*lokiv1.LokiStack)
-		if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack {
+		if r.Name == name.Name && r.Namespace == name.Namespace {
 			k.SetClientObject(object, stack)
-			return nil
 		}
-
-		if name.Name == defaultSecret.Name {
+		if defaultSecret.Name == name.Name {
 			k.SetClientObject(object, &defaultSecret)
-			return nil
 		}
-
-		return apierrors.NewNotFound(schema.GroupResource{}, "something is not found")
+		return nil
 	}
 
 	k.StatusStub = func() client.StatusWriter { return sw }
@@ -985,642 +740,3 @@ func TestCreateOrUpdateLokiStack_WhenMissingCAConfigMap_SetDegraded(t *testing.T
 	require.Error(t, err)
 	require.Equal(t, degradedErr, err)
 }
-
-func TestCreateOrUpdateLokiStack_WhenInvalidCAConfigMap_SetDegraded(t *testing.T) {
-	sw := &k8sfakes.FakeStatusWriter{}
-	k := &k8sfakes.FakeClient{}
-	r := ctrl.Request{
-		NamespacedName: types.NamespacedName{
-			Name: "my-stack",
-			Namespace: "some-ns",
-		},
-	}
-
-	degradedErr := &status.DegradedError{
-		Message: "Invalid object storage CA configmap contents: key not present or data empty: service-ca.crt",
-		Reason: lokiv1.ReasonInvalidObjectStorageCAConfigMap,
-		Requeue: false,
-	}
-
-	stack := &lokiv1.LokiStack{
-		TypeMeta: metav1.TypeMeta{
-			Kind: "LokiStack",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "my-stack",
-			Namespace: "some-ns",
-			UID: "b23f9a38-9672-499f-8c29-15ede74d3ece",
-		},
-		Spec: lokiv1.LokiStackSpec{
-			Size: lokiv1.SizeOneXExtraSmall,
-			Storage: lokiv1.ObjectStorageSpec{
-				Schemas: []lokiv1.ObjectStorageSchema{
-					{
-						Version: lokiv1.ObjectStorageSchemaV11,
-						EffectiveDate: "2020-10-11",
-					},
-				},
-				Secret: lokiv1.ObjectStorageSecretSpec{
-					Name: defaultSecret.Name,
-					Type: lokiv1.ObjectStorageSecretS3,
-				},
-				TLS: &lokiv1.ObjectStorageTLSSpec{
-					CASpec: lokiv1.CASpec{
-						CA: invalidCAConfigMap.Name,
-					},
-				},
-			},
-		},
-	}
-
-	// GetStub looks up the CR first, so we need to return our fake stack
-	// return NotFound for everything else to trigger create.
-	k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error {
-		_, isLokiStack := object.(*lokiv1.LokiStack)
-		if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack {
-			k.SetClientObject(object, stack)
-			return nil
-		}
-		if name.Name == defaultSecret.Name {
-			k.SetClientObject(object, &defaultSecret)
-			return nil
-		}
-
-		if name.Name == invalidCAConfigMap.Name {
-			k.SetClientObject(object, &invalidCAConfigMap)
-			return nil
-		}
-		return apierrors.NewNotFound(schema.GroupResource{}, "something is not found")
-	}
-
-	k.StatusStub = func() client.StatusWriter { return sw }
-
-	err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates)
-
-	// make sure error is returned
-	require.Error(t, err)
-	require.Equal(t, degradedErr, err)
-}
-
-func TestCreateOrUpdateLokiStack_WhenInvalidTenantsConfiguration_SetDegraded(t *testing.T) {
-	sw := &k8sfakes.FakeStatusWriter{}
-	k := &k8sfakes.FakeClient{}
-	r := ctrl.Request{
-		NamespacedName: types.NamespacedName{
-			Name: "my-stack",
-			Namespace: "some-ns",
-		},
-	}
-
-	degradedErr := &status.DegradedError{
-		Message: "Invalid tenants configuration: mandatory configuration - missing OPA Url",
-		Reason: lokiv1.ReasonInvalidTenantsConfiguration,
-		Requeue: false,
-	}
-
-	ff := configv1.FeatureGates{
-		LokiStackGateway: true,
-	}
-
-	stack := &lokiv1.LokiStack{
-		TypeMeta: metav1.TypeMeta{
-			Kind: "LokiStack",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "my-stack",
-			Namespace: "some-ns",
-			UID: "b23f9a38-9672-499f-8c29-15ede74d3ece",
-		},
-		Spec: lokiv1.LokiStackSpec{
-			Size: lokiv1.SizeOneXExtraSmall,
-			Storage: lokiv1.ObjectStorageSpec{
-				Schemas: []lokiv1.ObjectStorageSchema{
-					{
-						Version: lokiv1.ObjectStorageSchemaV11,
-						EffectiveDate: "2020-10-11",
-					},
-				},
-				Secret: lokiv1.ObjectStorageSecretSpec{
-					Name: defaultSecret.Name,
-					Type: lokiv1.ObjectStorageSecretS3,
-				},
-			},
-			Tenants: &lokiv1.TenantsSpec{
-				Mode: "dynamic",
-				Authentication: []lokiv1.AuthenticationSpec{
-					{
-						TenantName: "test",
-						TenantID: "1234",
-						OIDC: &lokiv1.OIDCSpec{
-							Secret: &lokiv1.TenantSecretSpec{
-								Name: defaultGatewaySecret.Name,
-							},
-						},
-					},
-				},
-				Authorization: nil,
-			},
-		},
-	}
-
-	// GetStub looks up the CR first, so we need to return our fake stack
-	// return NotFound for everything else to trigger create.
-	k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error {
-		_, isLokiStack := object.(*lokiv1.LokiStack)
-		if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack {
-			k.SetClientObject(object, stack)
-			return nil
-		}
-		if defaultSecret.Name == name.Name {
-			k.SetClientObject(object, &defaultSecret)
-			return nil
-		}
-		return apierrors.NewNotFound(schema.GroupResource{}, "something is not found")
-	}
-
-	k.StatusStub = func() client.StatusWriter { return sw }
-
-	err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, ff)
-
-	// make sure error is returned
-	require.Error(t, err)
-	require.Equal(t, degradedErr, err)
-}
-
-func TestCreateOrUpdateLokiStack_WhenMissingGatewaySecret_SetDegraded(t *testing.T) {
-	sw := &k8sfakes.FakeStatusWriter{}
-	k := &k8sfakes.FakeClient{}
-	r := ctrl.Request{
-		NamespacedName: types.NamespacedName{
-			Name: "my-stack",
-			Namespace: "some-ns",
-		},
-	}
-
-	degradedErr := &status.DegradedError{
-		Message: "Missing secrets for tenant test",
-		Reason: lokiv1.ReasonMissingGatewayTenantSecret,
-		Requeue: true,
-	}
-
-	ff := configv1.FeatureGates{
-		LokiStackGateway: true,
-	}
-
-	stack := &lokiv1.LokiStack{
-		TypeMeta: metav1.TypeMeta{
-			Kind: "LokiStack",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "my-stack",
-			Namespace: "some-ns",
-			UID: "b23f9a38-9672-499f-8c29-15ede74d3ece",
-		},
-		Spec: lokiv1.LokiStackSpec{
-			Size: lokiv1.SizeOneXExtraSmall,
-			Storage: lokiv1.ObjectStorageSpec{
-				Schemas: []lokiv1.ObjectStorageSchema{
-					{
-						Version: lokiv1.ObjectStorageSchemaV11,
-						EffectiveDate: "2020-10-11",
-					},
-				},
-				Secret: lokiv1.ObjectStorageSecretSpec{
-					Name: defaultSecret.Name,
-					Type: lokiv1.ObjectStorageSecretS3,
-				},
-			},
-			Tenants: &lokiv1.TenantsSpec{
-				Mode: "dynamic",
-				Authentication: []lokiv1.AuthenticationSpec{
-					{
-						TenantName: "test",
-						TenantID: "1234",
-						OIDC: &lokiv1.OIDCSpec{
-							Secret: &lokiv1.TenantSecretSpec{
-								Name: defaultGatewaySecret.Name,
-							},
-						},
-					},
-				},
-				Authorization: &lokiv1.AuthorizationSpec{
-					OPA: &lokiv1.OPASpec{
-						URL: "some-url",
-					},
-				},
-			},
-		},
-	}
-
-	// GetStub looks up the CR first, so we need to return our fake stack
-	// return NotFound for everything else to trigger create.
-	k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error {
-		o, ok := object.(*lokiv1.LokiStack)
-		if r.Name == name.Name && r.Namespace == name.Namespace && ok {
-			k.SetClientObject(o, stack)
-			return nil
-		}
-		if defaultSecret.Name == name.Name {
-			k.SetClientObject(object, &defaultSecret)
-			return nil
-		}
-		return apierrors.NewNotFound(schema.GroupResource{}, "something is not found")
-	}
-
-	k.StatusStub = func() client.StatusWriter { return sw }
-
-	err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, ff)
-
-	// make sure error is returned to re-trigger reconciliation
-	require.Error(t, err)
-	require.Equal(t, degradedErr, err)
-}
-
-func TestCreateOrUpdateLokiStack_WhenInvalidGatewaySecret_SetDegraded(t *testing.T) {
-	sw := &k8sfakes.FakeStatusWriter{}
-	k := &k8sfakes.FakeClient{}
-	r := ctrl.Request{
-		NamespacedName: types.NamespacedName{
-			Name: "my-stack",
-			Namespace: "some-ns",
-		},
-	}
-
-	degradedErr := &status.DegradedError{
-		Message: "Invalid gateway tenant secret contents",
-		Reason: lokiv1.ReasonInvalidGatewayTenantSecret,
-		Requeue: true,
-	}
-
-	ff := configv1.FeatureGates{
-		LokiStackGateway: true,
-	}
-
-	stack := &lokiv1.LokiStack{
-		TypeMeta: metav1.TypeMeta{
-			Kind: "LokiStack",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "my-stack",
-			Namespace: "some-ns",
-			UID: "b23f9a38-9672-499f-8c29-15ede74d3ece",
-		},
-		Spec: lokiv1.LokiStackSpec{
-			Size: lokiv1.SizeOneXExtraSmall,
-			Storage: lokiv1.ObjectStorageSpec{
-				Schemas: []lokiv1.ObjectStorageSchema{
-					{
-						Version: lokiv1.ObjectStorageSchemaV11,
-						EffectiveDate: "2020-10-11",
-					},
-				},
-				Secret: lokiv1.ObjectStorageSecretSpec{
-					Name: defaultSecret.Name,
-					Type: lokiv1.ObjectStorageSecretS3,
-				},
-			},
-			Tenants: &lokiv1.TenantsSpec{
-				Mode: "dynamic",
-				Authentication: []lokiv1.AuthenticationSpec{
-					{
-						TenantName: "test",
-						TenantID: "1234",
-						OIDC: &lokiv1.OIDCSpec{
-							Secret: &lokiv1.TenantSecretSpec{
-								Name: invalidSecret.Name,
-							},
-						},
-					},
-				},
-				Authorization: &lokiv1.AuthorizationSpec{
-					OPA: &lokiv1.OPASpec{
-						URL: "some-url",
-					},
-				},
-			},
-		},
-	}
-
-	// GetStub looks up the CR first, so we need to return our fake stack
-	// return NotFound for everything else to trigger create.
-	k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error {
-		o, ok := object.(*lokiv1.LokiStack)
-		if r.Name == name.Name && r.Namespace == name.Namespace && ok {
-			k.SetClientObject(o, stack)
-			return nil
-		}
-		if defaultSecret.Name == name.Name {
-			k.SetClientObject(object, &defaultSecret)
-			return nil
-		}
-		if name.Name == invalidSecret.Name {
-			k.SetClientObject(object, &invalidSecret)
-			return nil
-		}
-		return apierrors.NewNotFound(schema.GroupResource{}, "something is not found")
-	}
-
-	k.StatusStub = func() client.StatusWriter { return sw }
-
-	err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, ff)
-
-	// make sure error is returned to re-trigger reconciliation
-	require.Error(t, err)
-	require.Equal(t, degradedErr, err)
-}
-
-func TestCreateOrUpdateLokiStack_MissingTenantsSpec_SetDegraded(t *testing.T) {
-	sw := &k8sfakes.FakeStatusWriter{}
-	k := &k8sfakes.FakeClient{}
-	r := ctrl.Request{
-		NamespacedName: types.NamespacedName{
-			Name: "my-stack",
-			Namespace: "some-ns",
-		},
-	}
-
-	degradedErr := &status.DegradedError{
-		Message: "Invalid tenants configuration - TenantsSpec cannot be nil when gateway flag is enabled",
-		Reason: lokiv1.ReasonInvalidTenantsConfiguration,
-		Requeue: false,
-	}
-
-	ff := configv1.FeatureGates{
-		LokiStackGateway: true,
-	}
-
-	stack := &lokiv1.LokiStack{
-		TypeMeta: metav1.TypeMeta{
-			Kind: "LokiStack",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "my-stack",
-			Namespace: "some-ns",
-			UID: "b23f9a38-9672-499f-8c29-15ede74d3ece",
-		},
-		Spec: lokiv1.LokiStackSpec{
-			Size: lokiv1.SizeOneXExtraSmall,
-			Storage: lokiv1.ObjectStorageSpec{
-				Schemas: []lokiv1.ObjectStorageSchema{
-					{
-						Version: lokiv1.ObjectStorageSchemaV11,
-						EffectiveDate: "2020-10-11",
-					},
-				},
-				Secret: lokiv1.ObjectStorageSecretSpec{
-					Name: defaultSecret.Name,
-					Type: lokiv1.ObjectStorageSecretS3,
-				},
-			},
-			Tenants: nil,
-		},
-	}
-
-	// GetStub looks up the CR first, so we need to return our fake stack
-	// return NotFound for everything else to trigger create.
-	k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error {
-		o, ok := object.(*lokiv1.LokiStack)
-		if r.Name == name.Name && r.Namespace == name.Namespace && ok {
-			k.SetClientObject(o, stack)
-			return nil
-		}
-		if defaultSecret.Name == name.Name {
-			k.SetClientObject(object, &defaultSecret)
-			return nil
-		}
-		return apierrors.NewNotFound(schema.GroupResource{}, "something is not found")
-	}
-
-	k.StatusStub = func() client.StatusWriter { return sw }
-
-	err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, ff)
-
-	// make sure error is returned
-	require.Error(t, err)
-	require.Equal(t, degradedErr, err)
-}
-
-func TestCreateOrUpdateLokiStack_WhenInvalidQueryTimeout_SetDegraded(t *testing.T) {
-	sw := &k8sfakes.FakeStatusWriter{}
-	k := &k8sfakes.FakeClient{}
-	r := ctrl.Request{
-		NamespacedName: types.NamespacedName{
-			Name: "my-stack",
-			Namespace: "some-ns",
-		},
-	}
-
-	degradedErr := &status.DegradedError{
-		Message: `Error parsing query timeout: time: invalid duration "invalid"`,
-		Reason: lokiv1.ReasonQueryTimeoutInvalid,
-		Requeue: false,
-	}
-
-	stack := &lokiv1.LokiStack{
-		TypeMeta: metav1.TypeMeta{
-			Kind: "LokiStack",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "my-stack",
-			Namespace: "some-ns",
-			UID: "b23f9a38-9672-499f-8c29-15ede74d3ece",
-		},
-		Spec: lokiv1.LokiStackSpec{
-			Size: lokiv1.SizeOneXExtraSmall,
-			Storage: lokiv1.ObjectStorageSpec{
-				Schemas: []lokiv1.ObjectStorageSchema{
-					{
-						Version: lokiv1.ObjectStorageSchemaV12,
-						EffectiveDate: "2023-05-22",
-					},
-				},
-				Secret: lokiv1.ObjectStorageSecretSpec{
-					Name: defaultSecret.Name,
-					Type: lokiv1.ObjectStorageSecretS3,
-				},
-			},
-			Tenants: &lokiv1.TenantsSpec{
-				Mode: "openshift",
-			},
-			Limits: &lokiv1.LimitsSpec{
-				Global: &lokiv1.LimitsTemplateSpec{
-					QueryLimits: &lokiv1.QueryLimitSpec{
-						QueryTimeout: "invalid",
-					},
-				},
-			},
-		},
-	}
-
-	// Create looks up the CR first, so we need to return our fake stack
-	k.GetStub = func(_ context.Context, name types.NamespacedName, object client.Object, _ ...client.GetOption) error {
-		if r.Name == name.Name && r.Namespace == name.Namespace {
-			k.SetClientObject(object, stack)
-		}
-		if defaultSecret.Name == name.Name {
-			k.SetClientObject(object, &defaultSecret)
-		}
-		return nil
-	}
-
-	k.StatusStub = func() client.StatusWriter { return sw }
-
-	err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates)
-
-	// make sure error is returned
-	require.Error(t, err)
-	require.Equal(t, degradedErr, err)
-}
-
-func TestCreateOrUpdateLokiStack_RemovesRulerResourcesWhenDisabled(t *testing.T) {
-	sw := &k8sfakes.FakeStatusWriter{}
-	k := &k8sfakes.FakeClient{}
-	r := ctrl.Request{
-		NamespacedName: types.NamespacedName{
-			Name: "my-stack",
-			Namespace: "some-ns",
-		},
-	}
-
-	stack := lokiv1.LokiStack{
-		TypeMeta: metav1.TypeMeta{
-			Kind: "LokiStack",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "my-stack",
-			Namespace: "some-ns",
-			UID: "b23f9a38-9672-499f-8c29-15ede74d3ece",
-		},
-		Spec: lokiv1.LokiStackSpec{
-			Size: lokiv1.SizeOneXExtraSmall,
-			Storage: lokiv1.ObjectStorageSpec{
-				Schemas: []lokiv1.ObjectStorageSchema{
-					{
-						Version: lokiv1.ObjectStorageSchemaV11,
-						EffectiveDate: "2020-10-11",
-					},
-				},
-				Secret: lokiv1.ObjectStorageSecretSpec{
-					Name: defaultSecret.Name,
-					Type: lokiv1.ObjectStorageSecretS3,
-				},
-			},
-			Rules: &lokiv1.RulesSpec{
-				Enabled: true,
-			},
-			Tenants: &lokiv1.TenantsSpec{
-				Mode: "dynamic",
-				Authentication: []lokiv1.AuthenticationSpec{
-					{
-						TenantName: "test",
-						TenantID: "1234",
-						OIDC: &lokiv1.OIDCSpec{
-							Secret: &lokiv1.TenantSecretSpec{
-								Name: defaultGatewaySecret.Name,
-							},
-						},
-					},
-				},
-				Authorization: &lokiv1.AuthorizationSpec{
-					OPA: &lokiv1.OPASpec{
-						URL: "some-url",
-					},
-				},
-			},
-		},
-	}
-
-	k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error {
-		_, ok := out.(*lokiv1.RulerConfig)
-		if ok {
-			return apierrors.NewNotFound(schema.GroupResource{}, "no ruler config")
-		}
-
-		_, isLokiStack := out.(*lokiv1.LokiStack)
-		if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack {
-			k.SetClientObject(out, &stack)
-			return nil
-		}
-		if defaultSecret.Name == name.Name {
-			k.SetClientObject(out, &defaultSecret)
-			return nil
-		}
-		if defaultGatewaySecret.Name == name.Name {
-			k.SetClientObject(out, &defaultGatewaySecret)
-			return nil
-		}
-		return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found")
-	}
-
-	k.CreateStub = func(_ context.Context, o client.Object, _ ...client.CreateOption) error {
-		assert.Equal(t, r.Namespace, o.GetNamespace())
-		return nil
-	}
-
-	k.StatusStub = func() client.StatusWriter { return sw }
-
-	k.DeleteStub = func(_ context.Context, o client.Object, _ ...client.DeleteOption) error {
-		assert.Equal(t, r.Namespace, o.GetNamespace())
-		return nil
-	}
-
-	k.ListStub = func(_ context.Context, list client.ObjectList, options ...client.ListOption) error {
-		switch list.(type) {
-		case *corev1.ConfigMapList:
-			k.SetClientObjectList(list, &corev1.ConfigMapList{
-				Items: []corev1.ConfigMap{
-					rulesCM,
-				},
-			})
-		}
-		return nil
-	}
-
-	err := CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates)
-	require.NoError(t, err)
-
-	// make sure create was called
-	require.NotZero(t, k.CreateCallCount())
-
-	// make sure delete not called
-	require.Zero(t, k.DeleteCallCount())
-
-	// Disable the ruler
-	stack.Spec.Rules.Enabled = false
-
-	// Get should return ruler resources
-	k.GetStub = func(_ context.Context, name types.NamespacedName, out client.Object, _ ...client.GetOption) error {
-		_, ok := out.(*lokiv1.RulerConfig)
-		if ok {
-			return apierrors.NewNotFound(schema.GroupResource{}, "no ruler config")
-		}
-		if rulesCM.Name == name.Name {
-			k.SetClientObject(out, &rulesCM)
-			return nil
-		}
-		if rulerSS.Name == name.Name {
-			k.SetClientObject(out, &rulerSS)
-			return nil
-		}
-
-		_, isLokiStack := out.(*lokiv1.LokiStack)
-		if r.Name == name.Name && r.Namespace == name.Namespace && isLokiStack {
-			k.SetClientObject(out, &stack)
-			return nil
-		}
-		if defaultSecret.Name == name.Name {
-			k.SetClientObject(out, &defaultSecret)
-			return nil
-		}
-		if defaultGatewaySecret.Name == name.Name {
-			k.SetClientObject(out, &defaultGatewaySecret)
-			return nil
-		}
-		return apierrors.NewNotFound(schema.GroupResource{}, "something wasn't found")
-	}
-	err = CreateOrUpdateLokiStack(context.TODO(), logger, r, k, scheme, featureGates)
-	require.NoError(t, err)
-
-	// make sure delete was called twice (delete rules configmap and ruler statefulset)
-	require.Equal(t, 2, k.DeleteCallCount())
-}