diff --git a/internalshared/configutil/telemetry.go b/internalshared/configutil/telemetry.go index d00b922b75a9..7c49fce00917 100644 --- a/internalshared/configutil/telemetry.go +++ b/internalshared/configutil/telemetry.go @@ -22,6 +22,7 @@ import ( "github.com/hashicorp/hcl" "github.com/hashicorp/hcl/hcl/ast" "github.com/hashicorp/vault/helper/metricsutil" + "github.com/hashicorp/vault/sdk/helper/metricregistry" "google.golang.org/api/option" ) @@ -289,6 +290,10 @@ func SetupTelemetry(opts *SetupTelemetryOpts) (*metrics.InmemSink, *metricsutil. Expiration: opts.Config.PrometheusRetentionTime, } + // Merge in explicit metric definitions so Prometheus always reports those + // metrics. + metricregistry.MergeDefinitions(&prometheusOpts) + sink, err := prometheus.NewPrometheusSinkFrom(prometheusOpts) if err != nil { return nil, nil, false, err diff --git a/sdk/helper/metricregistry/metricregistry.go b/sdk/helper/metricregistry/metricregistry.go new file mode 100644 index 000000000000..bd513136dfd8 --- /dev/null +++ b/sdk/helper/metricregistry/metricregistry.go @@ -0,0 +1,107 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +// Package metricregistry is a helper that allows Vault code or plugins that are +// compiled into Vault to pre-define any metrics they will emit to go-metrics at +// init time. Metrics registered this way will always be reported by the +// go-metrics PrometheusSink if it is used, so infrequently updated metrics are +// always present. It is not required to pre-register metrics to use go-metrics +// with Prometheus, but it's preferable as it makes them behave more like the +// Prometheus ecosystem expects, being always present and with a helpful +// description in the output which some systems use to help operators explore +// metrics. +// +// Note that this will not work for external Vault plugins since they are in a +// separate process and only started after Vault's metrics sink is already +// configured. 
+package metricregistry + +import promsink "github.com/armon/go-metrics/prometheus" + +var Registry definitionRegistry + +// Re-export these types so that we don't have the whole of Vault depending +// directly on go-metrics prometheus sink and can buffer changes if needed +type ( + // GaugeDefinition provides the name and help text of a gauge metric that will + // be exported via go-metrics' Prometheus sink if enabled. + GaugeDefinition promsink.GaugeDefinition + + // CounterDefinition provides the name and help text of a counter metric that + // will be exported via go-metrics' Prometheus sink if enabled. + CounterDefinition promsink.CounterDefinition + + // SummaryDefinition provides the name and help text of a summary metric that + // will be exported via go-metrics' Prometheus sink if enabled. + SummaryDefinition promsink.SummaryDefinition +) + +// definitionRegistry is a central place for packages to register their metrics +// definitions during init so that we can correctly report metrics to Prometheus +// even before they are observed. Typically there is one global instance. +type definitionRegistry struct { + gauges []GaugeDefinition + counters []CounterDefinition + summaries []SummaryDefinition +} + +// RegisterGauges is intended to be called during init. It accesses global state +// without synchronization. Statically defined definitions should be registered +// during `init` of a package ready to be configured if the prometheus sink is +// enabled in configuration. Registering metrics is not mandatory but it is +// strongly preferred as it ensures they are always output even before they are +// observed which makes dashboards much easier to work with, provides helpful +// descriptions and matches Prometheus ecosystem expectations. It also prevents +// the metrics ever being expired which means users don't need to work around +// that quirk of go-metrics by setting long prometheus retention times. 
All +// registered metrics will report 0 until an actual observation is made. +func RegisterGauges(defs []GaugeDefinition) { + Registry.gauges = append(Registry.gauges, defs...) +} + +// RegisterCounters is intended to be called during init. It accesses global +// state without synchronization. Statically defined definitions should be +// registered during `init` of a package ready to be configured if the prometheus +// sink is enabled in configuration. Registering metrics is not mandatory but it +// is strongly preferred as it ensures they are always output even before they +// are observed which makes dashboards much easier to work with, provides +// helpful descriptions and matches Prometheus ecosystem expectations. It also +// prevents the metrics ever being expired which means users don't need to work +// around that quirk of go-metrics by setting long prometheus retention times. +// All registered metrics will report 0 until an actual observation is made. +func RegisterCounters(defs []CounterDefinition) { + Registry.counters = append(Registry.counters, defs...) +} + +// RegisterSummaries is intended to be called during init. It accesses global +// state without synchronization. Statically defined definitions should be +// registered during `init` of a package ready to be configured if the prometheus +// sink is enabled in configuration. Registering metrics is not mandatory but it +// is strongly preferred as it ensures they are always output even before they +// are observed which makes dashboards much easier to work with, provides +// helpful descriptions and matches Prometheus ecosystem expectations. It also +// prevents the metrics ever being expired which means users don't need to work +// around that quirk of go-metrics by setting long prometheus retention times. +// All registered metrics will report 0 until an actual observation is made. +func RegisterSummaries(defs []SummaryDefinition) { + Registry.summaries = append(Registry.summaries, defs...) 
+} + +// MergeDefinitions adds all registered metrics to any already present in `cfg` +// ready to be passed to the go-metrics prometheus sink. Note it is not safe to +// call this concurrently with registrations or other calls; it's intended this +// is called once only after all registrations (which should be in init +// functions) just before the PrometheusSink is created. Calling more than once +// could result in duplicate metrics definitions being passed unless the cfg is +// different each time for different Prometheus sinks. +func MergeDefinitions(cfg *promsink.PrometheusOpts) { + for _, g := range Registry.gauges { + cfg.GaugeDefinitions = append(cfg.GaugeDefinitions, promsink.GaugeDefinition(g)) + } + for _, c := range Registry.counters { + cfg.CounterDefinitions = append(cfg.CounterDefinitions, promsink.CounterDefinition(c)) + } + for _, s := range Registry.summaries { + cfg.SummaryDefinitions = append(cfg.SummaryDefinitions, promsink.SummaryDefinition(s)) + } +} diff --git a/sdk/helper/metricregistry/metricregistry_test.go b/sdk/helper/metricregistry/metricregistry_test.go new file mode 100644 index 000000000000..1c1d0d039cfe --- /dev/null +++ b/sdk/helper/metricregistry/metricregistry_test.go @@ -0,0 +1,119 @@ +package metricregistry + +import ( + "testing" + + promsink "github.com/armon/go-metrics/prometheus" + "github.com/stretchr/testify/require" +) + +var testGauges = []GaugeDefinition{ + { + Name: []string{"test_gauge"}, + Help: "A test gauge", + }, + { + Name: []string{"test_gauge2"}, + Help: "Another test gauge", + }, +} + +var testCounters = []CounterDefinition{ + { + Name: []string{"test_counter"}, + Help: "A test counter", + }, + { + Name: []string{"test_counter2"}, + Help: "Another test counter", + }, +} + +var testSummaries = []SummaryDefinition{ + { + Name: []string{"test_summary"}, + Help: "A test summary", + }, + { + Name: []string{"test_summary2"}, + Help: "Another test summary", + }, +} + +func TestMetricRegistry(t *testing.T) { + // 
Register some metrics + RegisterGauges(testGauges) + RegisterCounters(testCounters) + RegisterSummaries(testSummaries) + + var opts promsink.PrometheusOpts + + // Add some pre-existing metrics to ensure merge is really a merge + opts.GaugeDefinitions = []promsink.GaugeDefinition{ + { + Name: []string{"preexisting_gauge"}, + Help: "A pre-existing gauge", + }, + } + opts.CounterDefinitions = []promsink.CounterDefinition{ + { + Name: []string{"preexisting_counter"}, + Help: "A pre-existing counter", + }, + } + opts.SummaryDefinitions = []promsink.SummaryDefinition{ + { + Name: []string{"preexisting_summary"}, + Help: "A pre-existing summary", + }, + } + + MergeDefinitions(&opts) + + require.Len(t, opts.GaugeDefinitions, 3) + require.Len(t, opts.CounterDefinitions, 3) + require.Len(t, opts.SummaryDefinitions, 3) + + wantGauges := []string{"test_gauge", "test_gauge2", "preexisting_gauge"} + wantGaugeHelp := []string{"A test gauge", "Another test gauge", "A pre-existing gauge"} + gotGauges := reduce(opts.GaugeDefinitions, nil, func(r []string, d promsink.GaugeDefinition) []string { + return append(r, d.Name[0]) + }) + gotGaugeHelp := reduce(opts.GaugeDefinitions, nil, func(r []string, d promsink.GaugeDefinition) []string { + return append(r, d.Help) + }) + + require.ElementsMatch(t, wantGauges, gotGauges) + require.ElementsMatch(t, wantGaugeHelp, gotGaugeHelp) + + wantCounters := []string{"test_counter", "test_counter2", "preexisting_counter"} + wantCounterHelp := []string{"A test counter", "Another test counter", "A pre-existing counter"} + gotCounters := reduce(opts.CounterDefinitions, nil, func(r []string, d promsink.CounterDefinition) []string { + return append(r, d.Name[0]) + }) + gotCounterHelp := reduce(opts.CounterDefinitions, nil, func(r []string, d promsink.CounterDefinition) []string { + return append(r, d.Help) + }) + + require.ElementsMatch(t, wantCounters, gotCounters) + require.ElementsMatch(t, wantCounterHelp, gotCounterHelp) + + wantSummaries := 
[]string{"test_summary", "test_summary2", "preexisting_summary"} + wantSummaryHelp := []string{"A test summary", "Another test summary", "A pre-existing summary"} + gotSummaries := reduce(opts.SummaryDefinitions, nil, func(r []string, d promsink.SummaryDefinition) []string { + return append(r, d.Name[0]) + }) + gotSummaryHelp := reduce(opts.SummaryDefinitions, nil, func(r []string, d promsink.SummaryDefinition) []string { + return append(r, d.Help) + }) + + require.ElementsMatch(t, wantSummaries, gotSummaries) + require.ElementsMatch(t, wantSummaryHelp, gotSummaryHelp) +} + +func reduce[T, R any](s []T, r R, f func(R, T) R) R { + for _, v := range s { + r = f(r, v) + } + return r +} diff --git a/vault/ha.go b/vault/ha.go index cb3ce2de2c71..d596ff812e8d 100644 --- a/vault/ha.go +++ b/vault/ha.go @@ -24,6 +24,7 @@ import ( "github.com/hashicorp/vault/sdk/helper/certutil" "github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/helper/jsonutil" + "github.com/hashicorp/vault/sdk/helper/metricregistry" "github.com/hashicorp/vault/sdk/logical" "github.com/hashicorp/vault/sdk/physical" "github.com/hashicorp/vault/vault/seal" @@ -47,6 +48,26 @@ const ( leaderPrefixCleanDelay = 200 * time.Millisecond ) +func init() { + // Register metrics that we should always consistently report whether or not + // they've been hit recently. The help texts are taken verbatim from our + // telemetry reference docs so if updated should probably stay in sync. 
+ metricregistry.RegisterSummaries([]metricregistry.SummaryDefinition{ + { + Name: []string{"core", "step_down"}, + Help: "Time required to step down cluster leadership", + }, + { + Name: []string{"core", "leadership_setup_failed"}, + Help: "Time taken by the most recent leadership setup failure", + }, + { + Name: []string{"core", "leadership_lost"}, + Help: "Total time that a high-availability cluster node last maintained leadership", + }, + }) +} + var ( addEnterpriseHaActors func(*Core, *run.Group) chan func() = addEnterpriseHaActorsNoop interruptPerfStandby func(chan func(), chan struct{}) chan struct{} = interruptPerfStandbyNoop