From 107cdc258e0fe2dc79df45b8c58b086ed905f04d Mon Sep 17 00:00:00 2001 From: Jaromir Wysoglad Date: Tue, 11 Jun 2024 16:23:50 -0400 Subject: [PATCH] feat: Thanos Querier to Thanos sidecar mTLS --- go.mod | 2 + go.sum | 4 + pkg/assets/certificate_generator.go | 148 ++++++++++++++++++ .../monitoring/monitoring-stack/components.go | 22 +++ .../monitoring/monitoring-stack/controller.go | 39 +++++ .../monitoring/thanos-querier/components.go | 22 +++ 6 files changed, 237 insertions(+) create mode 100644 pkg/assets/certificate_generator.go diff --git a/go.mod b/go.mod index c110940c..f8fe20f0 100644 --- a/go.mod +++ b/go.mod @@ -84,6 +84,7 @@ require ( github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/oklog/ulid v1.3.1 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/openshift/library-go v0.0.0-20240216151214-738f3fa4ccf8 // indirect github.com/opentracing/opentracing-go v1.2.0 // indirect github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect @@ -119,6 +120,7 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apiserver v0.29.3 // indirect k8s.io/klog/v2 v2.110.1 // indirect k8s.io/kube-openapi v0.0.0-20231129212854-f0671cc7e66a // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect diff --git a/go.sum b/go.sum index 99dfec59..ecb609d2 100644 --- a/go.sum +++ b/go.sum @@ -396,6 +396,8 @@ github.com/opencontainers/image-spec v1.0.2 h1:9yCKha/T5XdGtO0q9Q9a6T5NUCsTn/DrB github.com/opencontainers/image-spec v1.0.2/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= github.com/openshift/api v0.0.0-20240301093301-ce10821dc999 h1:+S998xHiJApsJZjRAO8wyedU9GfqFd8mtwWly6LqHDo= github.com/openshift/api v0.0.0-20240301093301-ce10821dc999/go.mod h1:CxgbWAlvu2iQB0UmKTtRu1YfepRg1/vJ64n2DlIEVz4= +github.com/openshift/library-go 
v0.0.0-20240216151214-738f3fa4ccf8 h1:dKtHGYiOwl0DKZEWBW4MFWFS6IYW02AVD1WSuUAVwEo= +github.com/openshift/library-go v0.0.0-20240216151214-738f3fa4ccf8/go.mod h1:ePlaOqUiPplRc++6aYdMe+2FmXb2xTNS9Nz5laG2YmI= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/ovh/go-ovh v1.4.3 h1:Gs3V823zwTFpzgGLZNI6ILS4rmxZgJwJCz54Er9LwD0= @@ -864,6 +866,8 @@ k8s.io/apiextensions-apiserver v0.29.3 h1:9HF+EtZaVpFjStakF4yVufnXGPRppWFEQ87qnO k8s.io/apiextensions-apiserver v0.29.3/go.mod h1:po0XiY5scnpJfFizNGo6puNU6Fq6D70UJY2Cb2KwAVc= k8s.io/apimachinery v0.29.3 h1:2tbx+5L7RNvqJjn7RIuIKu9XTsIZ9Z5wX2G22XAa5EU= k8s.io/apimachinery v0.29.3/go.mod h1:hx/S4V2PNW4OMg3WizRrHutyB5la0iCUbZym+W0EQIU= +k8s.io/apiserver v0.29.3 h1:xR7ELlJ/BZSr2n4CnD3lfA4gzFivh0wwfNfz9L0WZcE= +k8s.io/apiserver v0.29.3/go.mod h1:hrvXlwfRulbMbBgmWRQlFru2b/JySDpmzvQwwk4GUOs= k8s.io/client-go v0.29.3 h1:R/zaZbEAxqComZ9FHeQwOh3Y1ZUs7FaHKZdQtIc2WZg= k8s.io/client-go v0.29.3/go.mod h1:tkDisCvgPfiRpxGnOORfkljmS+UrW+WtXAy2fTvXJB0= k8s.io/component-base v0.29.3 h1:Oq9/nddUxlnrCuuR2K/jp6aflVvc0uDvxMzAWxnGzAo= diff --git a/pkg/assets/certificate_generator.go b/pkg/assets/certificate_generator.go new file mode 100644 index 00000000..16f8cf20 --- /dev/null +++ b/pkg/assets/certificate_generator.go @@ -0,0 +1,148 @@ +package assets + +import ( + "crypto/rand" + "crypto/x509" + "fmt" + "math/big" + "time" + + "github.com/go-logr/logr" + + "github.com/openshift/library-go/pkg/crypto" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apiserver/pkg/authentication/user" +) + +const certificateLifetime = time.Duration(crypto.DefaultCertificateLifetimeInDays) * 24 * time.Hour +const GRPCSecretName = "thanos-grpc-secret" + +// Taken from +// 
https://github.com/openshift/library-go/blob/08c2fd1b452520da35ad210930ea9d100545589a/pkg/operator/certrotation/signer.go#L68-L86
+// without refresh time handling.
+//
+// needsNewCert reports whether a certificate valid from notBefore to notAfter
+// has entered the last fifth of its validity period. now supplies the current
+// time.
+func needsNewCert(notBefore, notAfter time.Time, now func() time.Time) bool {
+	maxWait := notAfter.Sub(notBefore) / 5
+	latestTime := notAfter.Add(-maxWait)
+	return now().After(latestTime)
+}
+
+// RotateGRPCSecret makes sure s carries a CA ("ca.crt"/"ca.key") plus a Thanos
+// Querier client certificate and a Prometheus gRPC server certificate signed by
+// that CA.
+//
+// Nothing is changed, and (false, nil) is returned, when both CA entries are
+// present, parse correctly and the CA is not yet in the last fifth of its
+// validity period. Otherwise a CA is generated (or the current CA certificate is
+// re-issued with the same key), both leaf certificates are re-created, s.Data is
+// updated in place and true is returned so the caller can persist the secret.
+// On failure the returned bool is true together with a non-nil error.
+//
+// Taken from
+// https://github.com/openshift/cluster-monitoring-operator/blob/765d0b0369b176a5997d787b6710783437172879/pkg/manifests/tls.go#L113
+func RotateGRPCSecret(s *v1.Secret, logger logr.Logger) (bool, error) {
+	var (
+		curCA, newCA              *crypto.CA
+		curCABytes, crtPresent    = s.Data["ca.crt"]
+		curCAKeyBytes, keyPresent = s.Data["ca.key"]
+		rotate                    = !crtPresent || !keyPresent
+	)
+
+	if crtPresent && keyPresent {
+		var err error
+		curCA, err = crypto.GetCAFromBytes(curCABytes, curCAKeyBytes)
+		if err != nil {
+			logger.Info(fmt.Sprintf("generating a new CA due to error reading CA: %v", err))
+			rotate = true
+		} else if needsNewCert(curCA.Config.Certs[0].NotBefore, curCA.Config.Certs[0].NotAfter, time.Now) {
+			logger.Info("generating new CA, because the current one has less than 1/5 of its validity period left")
+			rotate = true
+		}
+	}
+
+	if !rotate {
+		return false, nil
+	}
+
+	// Reading from a nil map above is fine, but the writes below would panic.
+	if s.Data == nil {
+		s.Data = map[string][]byte{}
+	}
+
+	if curCA == nil {
+		// No usable CA: start from a fresh self-signed one.
+		newCAConfig, err := crypto.MakeSelfSignedCAConfig(
+			fmt.Sprintf("%s@%d", "openshift-cluster-monitoring", time.Now().Unix()),
+			crypto.DefaultCertificateLifetimeInDays,
+		)
+		if err != nil {
+			return rotate, fmt.Errorf("error generating self signed CA: %w", err)
+		}
+
+		newCA = &crypto.CA{
+			SerialGenerator: &crypto.RandomSerialGenerator{},
+			Config:          newCAConfig,
+		}
+	} else {
+		// Re-issue the current CA certificate with the same key, a new validity
+		// window and the next serial number. Work on a copy (and a fresh
+		// big.Int) so the parsed current certificate is left untouched.
+		template := *curCA.Config.Certs[0]
+		now := time.Now()
+		template.NotBefore = now.Add(-1 * time.Second)
+		template.NotAfter = now.Add(certificateLifetime)
+		template.SerialNumber = new(big.Int).Add(template.SerialNumber, big.NewInt(1))
+
+		newCACert, err := createCertificate(&template, &template, template.PublicKey, curCA.Config.Key)
+		if err != nil {
+			return rotate, fmt.Errorf("error rotating CA: %w", err)
+		}
+
+		newCA = &crypto.CA{
+			SerialGenerator: &crypto.RandomSerialGenerator{},
+			Config: &crypto.TLSCertificateConfig{
+				Certs: []*x509.Certificate{newCACert},
+				Key:   curCA.Config.Key,
+			},
+		}
+	}
+
+	newCABytes, newCAKeyBytes, err := newCA.Config.GetPEMBytes()
+	if err != nil {
+		return rotate, fmt.Errorf("error getting PEM bytes from CA: %w", err)
+	}
+
+	s.Data["ca.crt"] = newCABytes
+	s.Data["ca.key"] = newCAKeyBytes
+
+	// Client certificate presented by Thanos Querier.
+	{
+		cfg, err := newCA.MakeClientCertificateForDuration(
+			&user.DefaultInfo{
+				Name: "thanos-querier",
+			},
+			certificateLifetime,
+		)
+		if err != nil {
+			return rotate, fmt.Errorf("error making client certificate: %w", err)
+		}
+
+		crt, key, err := cfg.GetPEMBytes()
+		if err != nil {
+			return rotate, fmt.Errorf("error getting PEM bytes for thanos querier client certificate: %w", err)
+		}
+		s.Data["thanos-querier-client.crt"] = crt
+		s.Data["thanos-querier-client.key"] = key
+	}
+
+	// Server certificate, issued for the host name "prometheus-grpc".
+	{
+		cfg, err := newCA.MakeServerCert(
+			sets.NewString("prometheus-grpc"),
+			crypto.DefaultCertificateLifetimeInDays,
+		)
+		if err != nil {
+			return rotate, fmt.Errorf("error making server certificate: %w", err)
+		}
+
+		crt, key, err := cfg.GetPEMBytes()
+		if err != nil {
+			return rotate, fmt.Errorf("error getting PEM bytes for prometheus-k8s server certificate: %w", err)
+		}
+		s.Data["prometheus-server.crt"] = crt
+		s.Data["prometheus-server.key"] = key
+	}
+
+	return rotate, nil
+}
+
+// createCertificate creates a new certificate and returns it in x509.Certificate form.
+func createCertificate(template, parent *x509.Certificate, pub, priv interface{}) (*x509.Certificate, error) {
+	// x509.CreateCertificate returns the DER encoding of a single certificate.
+	rawCert, err := x509.CreateCertificate(rand.Reader, template, parent, pub, priv)
+	if err != nil {
+		return nil, fmt.Errorf("error creating certificate: %w", err)
+	}
+	// ParseCertificate yields that one certificate directly, so there is no
+	// unchecked index into a slice of parsed certificates.
+	cert, err := x509.ParseCertificate(rawCert)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing certificate: %w", err)
+	}
+	return cert, nil
+}
diff --git a/pkg/controllers/monitoring/monitoring-stack/components.go b/pkg/controllers/monitoring/monitoring-stack/components.go index cfd83ed1..966e706b 100644 --- a/pkg/controllers/monitoring/monitoring-stack/components.go +++ b/pkg/controllers/monitoring/monitoring-stack/components.go @@ -13,6 +13,7 @@ import ( stack "github.com/rhobs/observability-operator/pkg/apis/monitoring/v1alpha1" "github.com/rhobs/observability-operator/pkg/reconciler" + "github.com/rhobs/observability-operator/pkg/assets" ) const AdditionalScrapeConfigsSelfScrapeKey = "self-scrape-config" @@ -189,12 +190,33 @@ func newPrometheus( } return []string{} }(), + Volumes: []corev1.Volume{ + { + Name: "thanos-tls-assets", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: assets.GRPCSecretName, + }, + }, + }, + }, }, Retention: ms.Spec.Retention, RuleSelector: prometheusSelector, RuleNamespaceSelector: ms.Spec.NamespaceSelector, Thanos: &monv1.ThanosSpec{ Image: ptr.To(thanosCfg.Image), + GRPCServerTLSConfig: &monv1.TLSConfig{ + CAFile: "/etc/thanos/tls-assets/ca.crt", + CertFile: "/etc/thanos/tls-assets/prometheus-server.crt", + KeyFile: "/etc/thanos/tls-assets/prometheus-server.key", + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "thanos-tls-assets", + MountPath: "/etc/thanos/tls-assets", + }, + }, }, }, } diff --git a/pkg/controllers/monitoring/monitoring-stack/controller.go b/pkg/controllers/monitoring/monitoring-stack/controller.go index 52068406..0ad9e0fc 100644 --- 
a/pkg/controllers/monitoring/monitoring-stack/controller.go +++ b/pkg/controllers/monitoring/monitoring-stack/controller.go @@ -28,7 +28,9 @@ import ( policyv1 "k8s.io/api/policy/v1" rbacv1 "k8s.io/api/rbac/v1" "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -36,6 +38,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" stack "github.com/rhobs/observability-operator/pkg/apis/monitoring/v1alpha1" + "github.com/rhobs/observability-operator/pkg/assets" ) type resourceManager struct { @@ -133,6 +136,42 @@ func RegisterWithManager(mgr ctrl.Manager, opts Options) error { func (rm resourceManager) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := rm.logger.WithValues("stack", req.NamespacedName) logger.Info("Reconciling monitoring stack") + + gRPCSecret := v1.Secret{ + TypeMeta: metav1.TypeMeta{ + APIVersion: v1.SchemeGroupVersion.String(), + Kind: "Secret", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: assets.GRPCSecretName, + Namespace: req.Namespace, + }, + Data: map[string][]byte{}, + } + err := rm.k8sClient.Get(ctx, + types.NamespacedName{ + Name: assets.GRPCSecretName, + Namespace: req.Namespace, + }, + &gRPCSecret) + if client.IgnoreNotFound(err) != nil { + return ctrl.Result{}, err + } + + rotate, err := assets.RotateGRPCSecret(&gRPCSecret, logger) + if err != nil { + return ctrl.Result{}, err + } + if rotate { + err = rm.k8sClient.Update(ctx, &gRPCSecret) + if errors.IsNotFound(err) { + err = rm.k8sClient.Create(ctx, &gRPCSecret) + } + if err != nil { + return ctrl.Result{}, err + } + } + ms, err := rm.getStack(ctx, req) if err != nil { // retry since some error has occured diff --git a/pkg/controllers/monitoring/thanos-querier/components.go 
b/pkg/controllers/monitoring/thanos-querier/components.go index 90553995..753af6f5 100644 --- a/pkg/controllers/monitoring/thanos-querier/components.go +++ b/pkg/controllers/monitoring/thanos-querier/components.go @@ -10,6 +10,7 @@ import ( "k8s.io/utils/ptr" msoapi "github.com/rhobs/observability-operator/pkg/apis/monitoring/v1alpha1" + "github.com/rhobs/observability-operator/pkg/assets" "github.com/rhobs/observability-operator/pkg/reconciler" ) @@ -29,6 +30,11 @@ func newThanosQuerierDeployment(name string, spec *msoapi.ThanosQuerier, sidecar "--log.format=logfmt", "--query.replica-label=prometheus_replica", "--query.auto-downsampling", + "--grpc-client-tls-secure", + "--grpc-client-server-name=prometheus-grpc", + "--grpc-client-tls-ca=/etc/thanos/tls-sidecar-assets/ca.crt", + "--grpc-client-tls-key=/etc/thanos/tls-sidecar-assets/thanos-querier-client.key", + "--grpc-client-tls-cert=/etc/thanos/tls-sidecar-assets/thanos-querier-client.crt", } for _, endpoint := range sidecarUrls { args = append(args, fmt.Sprintf("--endpoint=%s", endpoint)) @@ -86,6 +92,12 @@ func newThanosQuerierDeployment(name string, spec *msoapi.ThanosQuerier, sidecar Type: corev1.SeccompProfileTypeRuntimeDefault, }, }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "thanos-sidecar-tls-assets", + MountPath: "/etc/thanos/tls-sidecar-assets", + }, + }, }, }, NodeSelector: map[string]string{ @@ -97,6 +109,16 @@ func newThanosQuerierDeployment(name string, spec *msoapi.ThanosQuerier, sidecar Type: corev1.SeccompProfileTypeRuntimeDefault, }, }, + Volumes: []corev1.Volume{ + { + Name: "thanos-sidecar-tls-assets", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: assets.GRPCSecretName, + }, + }, + }, + }, }, }, ProgressDeadlineSeconds: ptr.To(int32(300)),