Skip to content

Commit

Permalink
refactor: prevent collector pod restarts on startup
Browse files Browse the repository at this point in the history
  • Loading branch information
TheSpiritXIII committed Jul 3, 2024
1 parent a1ad2b0 commit 0c59776
Show file tree
Hide file tree
Showing 14 changed files with 523 additions and 59 deletions.
23 changes: 23 additions & 0 deletions charts/operator/templates/mutatingwebhookconfiguration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,26 @@ webhooks:
- CREATE
- UPDATE
sideEffects: None
- name: default.operatorconfigs.gmp-operator.gmp-system.monitoring.googleapis.com
admissionReviewVersions:
- v1
clientConfig:
# caBundle populated by operator.
service:
name: gmp-operator
namespace: {{.Values.namespace.system}}
port: 443
path: /default/monitoring.googleapis.com/v1/operatorconfigs
# Since this is re-applied at runtime by the operator's controllers
# we can safely ignore any transient issues with the webhook server.
failurePolicy: Ignore
rules:
- resources:
- operatorconfigs
apiGroups:
- monitoring.googleapis.com
apiVersions:
- v1
operations:
- UPDATE
sideEffects: None
2 changes: 1 addition & 1 deletion charts/operator/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ rules:
- resources:
- operatorconfigs
apiGroups: ["monitoring.googleapis.com"]
verbs: ["get", "list", "watch"]
verbs: ["get", "update", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
Expand Down
4 changes: 2 additions & 2 deletions charts/values.global.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ images:
image: gke.gcr.io/prometheus-engine/alertmanager
tag: v0.25.1-gmp.2-gke.0
prometheus:
# TODO(bwplotka): Change to "v2.45.3-gmp.4-gke.0" once tags are cloned.
# TODO(bwplotka): Change to "v2.45.3-gmp.6-gke.0" once tags are cloned.
image: gke.gcr.io/prometheus-engine/prometheus@sha256
tag: 7473d52f4a3e563e6377f8a6183091f25192b1e0705dd0933903e800bd69b7b2
tag: 01fd523f6dfa54b229b04974195632c8268fbd3d51e66ad076b5d31366210c54
configReloader:
image: gke.gcr.io/prometheus-engine/config-reloader
tag: v0.9.0-gke.1
Expand Down
3 changes: 0 additions & 3 deletions e2e/collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,9 +249,6 @@ func testCollectorOperatorConfig(ctx context.Context, kubeClient client.Client)
// We're mainly interested in the dynamic flags but checking the entire set including
// the static ones is ultimately simpler.
wantArgs := []string{
fmt.Sprintf("--export.label.project-id=%q", projectID),
fmt.Sprintf("--export.label.location=%q", location),
fmt.Sprintf("--export.label.cluster=%q", cluster),
fmt.Sprintf("--export.match=%q", projectFilter),
fmt.Sprintf("--export.match=%q", locationFilter),
fmt.Sprintf("--export.match=%q", kubeletFilter)}
Expand Down
9 changes: 6 additions & 3 deletions e2e/ruler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,6 @@ func testRuleEvaluatorOperatorConfig(ctx context.Context, kubeClient client.Clie
// We're mainly interested in the dynamic flags but checking the entire set including
// the static ones is ultimately simpler.
wantArgs := []string{
fmt.Sprintf("--export.label.project-id=%q", projectID),
fmt.Sprintf("--export.label.location=%q", location),
fmt.Sprintf("--export.label.cluster=%q", cluster),
fmt.Sprintf("--query.project-id=%q", projectID),
fmt.Sprintf("--query.generator-url=%q", "http://example.com/"),
}
Expand Down Expand Up @@ -278,13 +275,19 @@ func testRuleEvaluatorConfiguration(ctx context.Context, kubeClient client.Clien
return strings.NewReplacer(
"{namespace}", operator.DefaultOperatorNamespace,
"{pubNamespace}", operator.DefaultPublicNamespace,
"{projectID}", projectID,
"{location}", location,
"{cluster}", cluster,
).Replace(s)
}

want := map[string]string{
"config.yaml": replace(`global:
external_labels:
cluster: {cluster}
external_key: external_val
location: {location}
project_id: {projectID}
alerting:
alertmanagers:
- follow_redirects: true
Expand Down
2 changes: 1 addition & 1 deletion examples/collector-max-throughput.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ spec:
privileged: false
readOnlyRootFilesystem: true
- name: prometheus
image: gke.gcr.io/prometheus-engine/prometheus@sha256:7473d52f4a3e563e6377f8a6183091f25192b1e0705dd0933903e800bd69b7b2
image: gke.gcr.io/prometheus-engine/prometheus@sha256:01fd523f6dfa54b229b04974195632c8268fbd3d51e66ad076b5d31366210c54
args:
- --config.file=/prometheus/config_out/config.yaml
- --enable-feature=exemplar-storage
Expand Down
27 changes: 25 additions & 2 deletions manifests/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ rules:
- resources:
- operatorconfigs
apiGroups: ["monitoring.googleapis.com"]
verbs: ["get", "list", "watch"]
verbs: ["get", "update", "list", "watch"]
---
# Source: operator/templates/rolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
Expand Down Expand Up @@ -403,7 +403,7 @@ spec:
privileged: false
readOnlyRootFilesystem: true
- name: prometheus
image: gke.gcr.io/prometheus-engine/prometheus@sha256:7473d52f4a3e563e6377f8a6183091f25192b1e0705dd0933903e800bd69b7b2
image: gke.gcr.io/prometheus-engine/prometheus@sha256:01fd523f6dfa54b229b04974195632c8268fbd3d51e66ad076b5d31366210c54
args:
- --config.file=/prometheus/config_out/config.yaml
- --enable-feature=exemplar-storage
Expand Down Expand Up @@ -997,6 +997,29 @@ webhooks:
- CREATE
- UPDATE
sideEffects: None
- name: default.operatorconfigs.gmp-operator.gmp-system.monitoring.googleapis.com
admissionReviewVersions:
- v1
clientConfig:
# caBundle populated by operator.
service:
name: gmp-operator
namespace: gmp-system
port: 443
path: /default/monitoring.googleapis.com/v1/operatorconfigs
# Since this is re-applied at runtime by the operator's controllers
# we can safely ignore any transient issues with the webhook server.
failurePolicy: Ignore
rules:
- resources:
- operatorconfigs
apiGroups:
- monitoring.googleapis.com
apiVersions:
- v1
operations:
- UPDATE
sideEffects: None
---
# Source: operator/templates/operatorconfig.yaml
apiVersion: monitoring.googleapis.com/v1
Expand Down
35 changes: 3 additions & 32 deletions pkg/operator/collection.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/GoogleCloudPlatform/prometheus-engine/pkg/export"
monitoringv1 "github.com/GoogleCloudPlatform/prometheus-engine/pkg/operator/apis/monitoring/v1"
"github.com/GoogleCloudPlatform/prometheus-engine/pkg/secrets"
)
Expand Down Expand Up @@ -234,14 +233,8 @@ func (r *collectionReconciler) ensureCollectorDaemonSet(ctx context.Context, spe
return err
}

var projectID, location, cluster = resolveLabels(r.opts, spec.ExternalLabels)

flags := []string{
fmt.Sprintf("--export.label.project-id=%q", projectID),
fmt.Sprintf("--export.label.location=%q", location),
fmt.Sprintf("--export.label.cluster=%q", cluster),
}
// Populate export filtering from OperatorConfig.
var flags []string
// Populate export flags for collector if necessary.
for _, matcher := range spec.Filter.MatchOneOf {
flags = append(flags, fmt.Sprintf("--export.match=%q", matcher))
}
Expand All @@ -258,28 +251,6 @@ func (r *collectionReconciler) ensureCollectorDaemonSet(ctx context.Context, spe
return r.client.Update(ctx, &ds)
}

func resolveLabels(opts Options, externalLabels map[string]string) (projectID string, location string, cluster string) {
// Prioritize OperatorConfig's external labels over operator's flags
// to be consistent with our export layer's priorities.
// This is to avoid confusion if users specify a project_id, location, and
// cluster in the OperatorConfig's external labels but not in flags passed
// to the operator - since on GKE environnments, these values are autopopulated
// without user intervention.
projectID = opts.ProjectID
if p, ok := externalLabels[export.KeyProjectID]; ok {
projectID = p
}
location = opts.Location
if l, ok := externalLabels[export.KeyLocation]; ok {
location = l
}
cluster = opts.Cluster
if c, ok := externalLabels[export.KeyCluster]; ok {
cluster = c
}
return
}

func gzipData(data []byte) ([]byte, error) {
var b bytes.Buffer
gz := gzip.NewWriter(&b)
Expand Down Expand Up @@ -415,7 +386,7 @@ func (r *collectionReconciler) makeCollectorConfig(ctx context.Context, spec *mo
}

usedSecrets := monitoringv1.PrometheusSecretConfigs{}
var projectID, location, cluster = resolveLabels(r.opts, spec.ExternalLabels)
projectID, location, cluster := resolveLabels(r.opts.ProjectID, r.opts.Location, r.opts.Cluster, spec.ExternalLabels)
var updates []update

// Mark status updates in batch with single timestamp.
Expand Down
43 changes: 43 additions & 0 deletions pkg/operator/labels.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package operator

import "github.com/GoogleCloudPlatform/prometheus-engine/pkg/export"

// resolveLabels compares the project, location, and cluster labels used by the operator
// against those in externalLabels. If any are found in the latter, they take precedence.
// The higher-precedence labels are then returned.
//
// This is to be consistent with our export layer's priorities and avoid confusion if users
// specify a project_id, location, and cluster in the OperatorConfig's external labels but
// not in flags passed to the operator - since on GKE environments, these values are
// auto-populated without user intervention.
func resolveLabels(defaultProjectID, defaultLocation, defaultCluster string, externalLabels map[string]string) (projectID, location, cluster string) {
if externalLabels == nil {
return defaultProjectID, defaultLocation, defaultCluster
}

var ok bool
if projectID, ok = externalLabels[export.KeyProjectID]; !ok {
projectID = defaultProjectID
}
if location, ok = externalLabels[export.KeyLocation]; !ok {
location = defaultLocation
}
if cluster, ok = externalLabels[export.KeyCluster]; !ok {
cluster = defaultCluster
}
return
}
106 changes: 106 additions & 0 deletions pkg/operator/labels_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package operator

import (
"testing"
)

func TestResolveLabels(t *testing.T) {
for _, tc := range []struct {
desc string
projectID, location, cluster string
expProjectID, expLocation, expCluster string
externalLabels map[string]string
expExternalLabels map[string]string
}{
{
desc: "no overwrite",
projectID: "proj-a",
location: "loc-a",
cluster: "clu-a",
externalLabels: map[string]string{
"project_id": "proj-b",
"location": "loc-b",
"cluster": "clu-b",
},
expProjectID: "proj-b",
expLocation: "loc-b",
expCluster: "clu-b",
},
{
desc: "overwrite projectID",
projectID: "proj-a",
location: "loc-a",
cluster: "clu-a",
externalLabels: map[string]string{
"location": "loc-b",
"cluster": "clu-b",
},
expProjectID: "proj-a",
expLocation: "loc-b",
expCluster: "clu-b",
},
{
desc: "overwrite location",
projectID: "proj-a",
location: "loc-a",
cluster: "clu-a",
externalLabels: map[string]string{
"project_id": "proj-b",
"cluster": "clu-b",
},
expProjectID: "proj-b",
expLocation: "loc-a",
expCluster: "clu-b",
},
{
desc: "overwrite cluster",
projectID: "proj-a",
location: "loc-a",
cluster: "clu-a",
externalLabels: map[string]string{
"project_id": "proj-b",
"location": "loc-b",
},
expProjectID: "proj-b",
expLocation: "loc-b",
expCluster: "clu-a",
},
{
desc: "overwrite all",
projectID: "proj-a",
location: "loc-a",
cluster: "clu-a",
externalLabels: map[string]string{},
expProjectID: "proj-a",
expLocation: "loc-a",
expCluster: "clu-a",
},
} {
t.Run(tc.desc, func(t *testing.T) {
projectID, location, cluster := resolveLabels(tc.projectID, tc.location, tc.cluster, tc.externalLabels)
if projectID != tc.expProjectID {
t.Error("projectIDs do not match")
}
if location != tc.expLocation {
t.Error("locations do not match")
}
if cluster != tc.expCluster {
t.Error("clusters do not match")
}
})
}
}
Loading

0 comments on commit 0c59776

Please sign in to comment.