Skip to content

Commit

Permalink
Register ha timing metrics. Fixes #11732
Browse files Browse the repository at this point in the history
  • Loading branch information
banks committed Aug 5, 2024
1 parent 37513e0 commit baa77c0
Show file tree
Hide file tree
Showing 4 changed files with 252 additions and 0 deletions.
5 changes: 5 additions & 0 deletions internalshared/configutil/telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/hashicorp/hcl"
"github.com/hashicorp/hcl/hcl/ast"
"github.com/hashicorp/vault/helper/metricsutil"
"github.com/hashicorp/vault/sdk/helper/metricregistry"
"google.golang.org/api/option"
)

Expand Down Expand Up @@ -289,6 +290,10 @@ func SetupTelemetry(opts *SetupTelemetryOpts) (*metrics.InmemSink, *metricsutil.
Expiration: opts.Config.PrometheusRetentionTime,
}

// Merge in explicit metric definitions so Prometheus always reports those
// metrics.
metricregistry.MergeDefinitions(&prometheusOpts)

sink, err := prometheus.NewPrometheusSinkFrom(prometheusOpts)
if err != nil {
return nil, nil, false, err
Expand Down
107 changes: 107 additions & 0 deletions sdk/helper/metricregistry/metricregistry.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

// Package metricregistry is a helper that allows Vault code or plugins that are
// compiled into Vault to pre-define any metrics they will emit to go-metrics at
// init time. Metrics registered this way will always be reported by the
// go-metrics PrometheusSink, if it is used, so infrequently updated metrics are
// always present. It is not required to pre-register metrics to use go-metrics
// with Prometheus, but it's preferable as it makes them behave more like the
// Prometheus ecosystem expects, being always present and with a helpful
// description in the output which some systems use to help operators explore
// metrics.
//
// Note that this will not work for external Vault plugins since they are in a
// separate process and only started after Vault's metrics sink is already
// configured.
package metricregistry

import promsink "github.com/armon/go-metrics/prometheus"

// Registry is the package-level definitionRegistry that RegisterGauges,
// RegisterCounters and RegisterSummaries append to and that MergeDefinitions
// reads from. Those functions access it without synchronization.
var Registry definitionRegistry

// The definition types below are declared here, on top of the go-metrics
// Prometheus sink types, so that the whole of Vault does not depend directly
// on the go-metrics Prometheus sink and changes to it can be buffered if
// needed.

// GaugeDefinition carries the name and help text of a gauge metric that will
// be exported via the go-metrics Prometheus sink if it is enabled.
type GaugeDefinition promsink.GaugeDefinition

// CounterDefinition carries the name and help text of a counter metric that
// will be exported via the go-metrics Prometheus sink if it is enabled.
type CounterDefinition promsink.CounterDefinition

// SummaryDefinition carries the name and help text of a summary metric that
// will be exported via the go-metrics Prometheus sink if it is enabled.
type SummaryDefinition promsink.SummaryDefinition

// definitionRegistry is a central place for packages to register their metrics
// definitions during init so that we can correctly report metrics to Prometheus
// even before they are observed. Typically there is one global instance.
type definitionRegistry struct {
// gauges accumulates the definitions passed to RegisterGauges.
gauges []GaugeDefinition
// counters accumulates the definitions passed to RegisterCounters.
counters []CounterDefinition
// summaries accumulates the definitions passed to RegisterSummaries.
summaries []SummaryDefinition
}

// RegisterGauges appends defs to the gauge definitions held in the global
// Registry. It is intended to be called during init and accesses that global
// state without synchronization.
//
// Statically known definitions should be registered during a package's init
// so that they are ready to be picked up when the Prometheus sink is
// configured, if it is enabled in configuration. Registering metrics is not
// mandatory but it is strongly preferred: it ensures they are always output,
// even before they are observed, which makes dashboards much easier to work
// with, provides helpful descriptions and matches Prometheus ecosystem
// expectations. It also prevents the metrics from ever being expired, which
// means users don't need to work around that quirk of go-metrics by setting
// long Prometheus retention times. All registered metrics will report 0 until
// an actual observation is made.
func RegisterGauges(defs []GaugeDefinition) {
	reg := &Registry
	reg.gauges = append(reg.gauges, defs...)
}

// RegisterCounters appends defs to the counter definitions held in the global
// Registry. It is intended to be called during init and accesses that global
// state without synchronization.
//
// Statically known definitions should be registered during a package's init
// so that they are ready to be picked up when the Prometheus sink is
// configured, if it is enabled in configuration. Registering metrics is not
// mandatory but it is strongly preferred: it ensures they are always output,
// even before they are observed, which makes dashboards much easier to work
// with, provides helpful descriptions and matches Prometheus ecosystem
// expectations. It also prevents the metrics from ever being expired, which
// means users don't need to work around that quirk of go-metrics by setting
// long Prometheus retention times. All registered metrics will report 0 until
// an actual observation is made.
func RegisterCounters(defs []CounterDefinition) {
	reg := &Registry
	reg.counters = append(reg.counters, defs...)
}

// RegisterSummaries appends defs to the summary definitions held in the
// global Registry. It is intended to be called during init and accesses that
// global state without synchronization.
//
// Statically known definitions should be registered during a package's init
// so that they are ready to be picked up when the Prometheus sink is
// configured, if it is enabled in configuration. Registering metrics is not
// mandatory but it is strongly preferred: it ensures they are always output,
// even before they are observed, which makes dashboards much easier to work
// with, provides helpful descriptions and matches Prometheus ecosystem
// expectations. It also prevents the metrics from ever being expired, which
// means users don't need to work around that quirk of go-metrics by setting
// long Prometheus retention times. All registered metrics will report 0 until
// an actual observation is made.
func RegisterSummaries(defs []SummaryDefinition) {
	reg := &Registry
	reg.summaries = append(reg.summaries, defs...)
}

// MergeDefinitions appends every definition held in the global Registry to
// the ones already present in cfg, leaving cfg ready to be handed to the
// go-metrics Prometheus sink.
//
// It is not safe to call this concurrently with registrations or with other
// calls. The intent is that it is called exactly once, after all
// registrations (which should happen in init functions) and just before the
// PrometheusSink is created. Calling it more than once could result in
// duplicate metric definitions being passed, unless a different cfg is used
// each time for different Prometheus sinks.
func MergeDefinitions(cfg *promsink.PrometheusOpts) {
	reg := &Registry

	// The registry stores this package's own definition types, so each entry
	// is converted back to the sink's type as it is appended.
	for i := range reg.gauges {
		cfg.GaugeDefinitions = append(cfg.GaugeDefinitions, promsink.GaugeDefinition(reg.gauges[i]))
	}
	for i := range reg.counters {
		cfg.CounterDefinitions = append(cfg.CounterDefinitions, promsink.CounterDefinition(reg.counters[i]))
	}
	for i := range reg.summaries {
		cfg.SummaryDefinitions = append(cfg.SummaryDefinitions, promsink.SummaryDefinition(reg.summaries[i]))
	}
}
119 changes: 119 additions & 0 deletions sdk/helper/metricregistry/metricregistry_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package metricregistry

import (
"testing"

promsink "github.com/armon/go-metrics/prometheus"
"github.com/stretchr/testify/require"
)

// Fixture definitions registered by TestMetricRegistry: two of each metric
// kind, each with a single-element name and its own help text.
var (
	testGauges = []GaugeDefinition{
		{Name: []string{"test_gauge"}, Help: "A test gauge"},
		{Name: []string{"test_gauge2"}, Help: "Another test gauge"},
	}

	testCounters = []CounterDefinition{
		{Name: []string{"test_counter"}, Help: "A test counter"},
		{Name: []string{"test_counter2"}, Help: "Another test counter"},
	}

	testSummaries = []SummaryDefinition{
		{Name: []string{"test_summary"}, Help: "A test summary"},
		{Name: []string{"test_summary2"}, Help: "Another test summary"},
	}
)

func TestMetricRegistry(t *testing.T) {
// Register some metrics
RegisterGauges(testGauges)
RegisterCounters(testCounters)
RegisterSummaries(testSummaries)

var opts promsink.PrometheusOpts

// Add some pre-existing metrics to ensure merge is really a merge
opts.GaugeDefinitions = []promsink.GaugeDefinition{
{
Name: []string{"preexisting_gauge"},
Help: "A pre-existing gauge",
},
}
opts.CounterDefinitions = []promsink.CounterDefinition{
{
Name: []string{"preexisting_counter"},
Help: "A pre-existing counter",
},
}
opts.SummaryDefinitions = []promsink.SummaryDefinition{
{
Name: []string{"preexisting_summary"},
Help: "A pre-existing summary",
},
}

MergeDefinitions(&opts)

require.Len(t, opts.GaugeDefinitions, 3)
require.Len(t, opts.CounterDefinitions, 3)
require.Len(t, opts.SummaryDefinitions, 3)

wantGauges := []string{"test_gauge", "test_gauge2", "preexisting_gauge"}
wantGaugeHelp := []string{"A test gauge", "Another test gauge", "A pre-existing gauge"}
gotGauges := reduce(opts.GaugeDefinitions, nil, func(r []string, d promsink.GaugeDefinition) []string {
return append(r, d.Name[0])
})
gotGaugeHelp := reduce(opts.GaugeDefinitions, nil, func(r []string, d promsink.GaugeDefinition) []string {
return append(r, d.Help)
})

require.ElementsMatch(t, wantGauges, gotGauges)
require.ElementsMatch(t, wantGaugeHelp, gotGaugeHelp)

wantCounters := []string{"test_counter", "test_counter2", "preexisting_counter"}
wantCounterHelp := []string{"A test counter", "Another test counter", "A pre-existing counter"}
gotCounters := reduce(opts.CounterDefinitions, nil, func(r []string, d promsink.CounterDefinition) []string {
return append(r, d.Name[0])
})
gotCounterHelp := reduce(opts.CounterDefinitions, nil, func(r []string, d promsink.CounterDefinition) []string {
return append(r, d.Help)
})

require.ElementsMatch(t, wantCounters, gotCounters)
require.ElementsMatch(t, wantCounterHelp, gotCounterHelp)

wantSummaries := []string{"test_summary", "test_summary2", "preexisting_summary"}
wantSummaryHelp := []string{"A test summary", "Another test summary", "A pre-existing summary"}
gotSummaries := reduce(opts.SummaryDefinitions, nil, func(r []string, d promsink.SummaryDefinition) []string {
return append(r, d.Name[0])
})
gotSummaryHelp := reduce(opts.SummaryDefinitions, nil, func(r []string, d promsink.SummaryDefinition) []string {
return append(r, d.Help)
})

require.ElementsMatch(t, wantSummaries, gotSummaries)
require.ElementsMatch(t, wantSummaryHelp, gotSummaryHelp)
}

// reduce folds s into a single value. It starts from r and calls f once per
// element, in slice order, passing the value accumulated so far together with
// the element; whatever f returns becomes the new accumulated value. The
// final accumulated value is returned, which is r itself when s is empty.
func reduce[T, R any](s []T, r R, f func(R, T) R) R {
	acc := r
	for i := range s {
		acc = f(acc, s[i])
	}
	return acc
}
21 changes: 21 additions & 0 deletions vault/ha.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/hashicorp/vault/sdk/helper/certutil"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/helper/jsonutil"
"github.com/hashicorp/vault/sdk/helper/metricregistry"
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/sdk/physical"
"github.com/hashicorp/vault/vault/seal"
Expand All @@ -47,6 +48,26 @@ const (
leaderPrefixCleanDelay = 200 * time.Millisecond
)

func init() {
	// These are metrics that we should always report consistently, whether or
	// not they've been hit recently. The help texts are taken verbatim from
	// our telemetry reference docs, so if they are updated they should
	// probably stay in sync.
	haSummaries := []metricregistry.SummaryDefinition{
		{Name: []string{"core", "step_down"}, Help: "Time required to step down cluster leadership"},
		{Name: []string{"core", "leadership_setup_failed"}, Help: "Time taken by the most recent leadership setup failure"},
		{Name: []string{"core", "leadership_lost"}, Help: "Total time that a high-availability cluster node last maintained leadership"},
	}
	metricregistry.RegisterSummaries(haSummaries)
}

var (
addEnterpriseHaActors func(*Core, *run.Group) chan func() = addEnterpriseHaActorsNoop
interruptPerfStandby func(chan func(), chan struct{}) chan struct{} = interruptPerfStandbyNoop
Expand Down

0 comments on commit baa77c0

Please sign in to comment.