Skip to content

Commit

Permalink
feat: prometheus metrics for counting passes and failures
Browse files Browse the repository at this point in the history
  • Loading branch information
polsar88 committed Oct 16, 2024
1 parent 7d3c6c2 commit 1bca518
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 15 deletions.
72 changes: 61 additions & 11 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ const (

// Metric labels

// General health check error types
// HTTPInit and HTTPRequest are the general health check error types.
HTTPInit = "httpInit"
HTTPRequest = "httpReq"

// BlockHeightCheck-specific errors
// WSSubscribe and WSError are the BlockHeightCheck-specific error types.
WSSubscribe = "wsSubscribe"
WSError = "wsError"
)
Expand Down Expand Up @@ -239,6 +239,7 @@ var (
[]string{"chain_name", "upstream_id", "url", "errorType"},
)

// Enhanced routing control metrics
errorLatencyStatus = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricsNamespace,
Expand All @@ -253,8 +254,8 @@ var (
prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: "healthcheck",
Name: "latency_check_errors",
Help: "Errors of upstream requests.",
Name: "error_check_has_errors",
Help: "Number of errors of upstream requests.",
},
[]string{"chain_name", "upstream_id", "url", "errorType", "method"},
)
Expand All @@ -263,12 +264,32 @@ var (
prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: "healthcheck",
Name: "latency_check_no_errors",
Help: "No errors of upstream requests.",
Name: "error_check_no_errors",
Help: "Number of no errors of upstream requests.",
},
[]string{"chain_name", "upstream_id", "url", "errorType", "method"},
)

// errorLatencyStatusCheckErrorsIsPassing is a counter in the "healthcheck"
// subsystem that counts error checks that passed. It is partitioned by
// chain_name, upstream_id, url and errorType; unlike the latency counterparts
// it carries no "method" label.
errorLatencyStatusCheckErrorsIsPassing = promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: "healthcheck",
Name: "error_check_is_passing",
Help: "Number of passing error checks.",
},
[]string{"chain_name", "upstream_id", "url", "errorType"},
)

// errorLatencyStatusCheckErrorsIsFailing is a counter in the "healthcheck"
// subsystem that counts error checks that failed. It is partitioned by
// chain_name, upstream_id, url and errorType; unlike the latency counterparts
// it carries no "method" label.
errorLatencyStatusCheckErrorsIsFailing = promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: "healthcheck",
Name: "error_check_is_failing",
Help: "Number of failing error checks.",
},
[]string{"chain_name", "upstream_id", "url", "errorType"},
)

errorLatencyStatusHighLatencies = promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricsNamespace,
Expand All @@ -289,6 +310,26 @@ var (
[]string{"chain_name", "upstream_id", "url", "errorType", "method"},
)

// errorLatencyStatusCheckLatencyIsPassing is a counter in the "healthcheck"
// subsystem that counts latency checks that passed. It is partitioned by
// chain_name, upstream_id, url, errorType and method.
errorLatencyStatusCheckLatencyIsPassing = promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: "healthcheck",
Name: "latency_check_is_passing",
Help: "Number of passing latency checks.",
},
[]string{"chain_name", "upstream_id", "url", "errorType", "method"},
)

// errorLatencyStatusCheckLatencyIsFailing is a counter in the "healthcheck"
// subsystem that counts latency checks that failed. It is partitioned by
// chain_name, upstream_id, url, errorType and method.
errorLatencyStatusCheckLatencyIsFailing = promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: "healthcheck",
Name: "latency_check_is_failing",
Help: "Number of failing latency checks.",
},
[]string{"chain_name", "upstream_id", "url", "errorType", "method"},
)

// System metrics
fileDescriptorsUsed = promauto.NewGauge(
prometheus.GaugeOpts{
Expand Down Expand Up @@ -324,11 +365,16 @@ type Container struct {
SyncStatusCheckDuration prometheus.ObserverVec
SyncStatusCheckErrors *prometheus.CounterVec

ErrorLatency *prometheus.GaugeVec
ErrorLatencyCheckNoErrors *prometheus.CounterVec
ErrorLatencyCheckErrors *prometheus.CounterVec
ErrorLatencyCheckOkLatencies *prometheus.CounterVec
ErrorLatencyCheckHighLatencies *prometheus.CounterVec
// Enhanced routing control metrics
ErrorLatency *prometheus.GaugeVec
ErrorLatencyCheckNoErrors *prometheus.CounterVec
ErrorLatencyCheckErrors *prometheus.CounterVec
ErrorLatencyCheckErrorsIsPassing *prometheus.CounterVec
ErrorLatencyCheckErrorsIsFailing *prometheus.CounterVec
ErrorLatencyCheckOkLatencies *prometheus.CounterVec
ErrorLatencyCheckHighLatencies *prometheus.CounterVec
ErrorLatencyCheckLatencyIsPassing *prometheus.CounterVec
ErrorLatencyCheckLatencyIsFailing *prometheus.CounterVec
}

func NewContainer(chainName string) *Container {
Expand Down Expand Up @@ -364,8 +410,12 @@ func NewContainer(chainName string) *Container {
result.ErrorLatency = errorLatencyStatus.MustCurryWith(presetLabels)
result.ErrorLatencyCheckErrors = errorLatencyStatusCheckErrors.MustCurryWith(presetLabels)
result.ErrorLatencyCheckNoErrors = errorLatencyStatusCheckNoErrors.MustCurryWith(presetLabels)
result.ErrorLatencyCheckErrorsIsPassing = errorLatencyStatusCheckErrorsIsPassing.MustCurryWith(presetLabels)
result.ErrorLatencyCheckErrorsIsFailing = errorLatencyStatusCheckErrorsIsFailing.MustCurryWith(presetLabels)
result.ErrorLatencyCheckHighLatencies = errorLatencyStatusHighLatencies.MustCurryWith(presetLabels)
result.ErrorLatencyCheckOkLatencies = errorLatencyStatusOkLatencies.MustCurryWith(presetLabels)
result.ErrorLatencyCheckLatencyIsPassing = errorLatencyStatusCheckLatencyIsPassing.MustCurryWith(presetLabels)
result.ErrorLatencyCheckLatencyIsFailing = errorLatencyStatusCheckLatencyIsFailing.MustCurryWith(presetLabels)

return result
}
Expand Down
53 changes: 51 additions & 2 deletions internal/route/node_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/satsuma-data/node-gateway/internal/checks"
"github.com/satsuma-data/node-gateway/internal/config"
"github.com/satsuma-data/node-gateway/internal/metadata"
"github.com/satsuma-data/node-gateway/internal/metrics"
"go.uber.org/zap"
)

Expand Down Expand Up @@ -124,20 +125,68 @@ func (f *IsDoneSyncing) Apply(_ metadata.RequestMetadata, upstreamConfig *config

// IsErrorRateAcceptable decides, through its Apply method, whether an
// upstream's error check is passing for the methods of a request.
type IsErrorRateAcceptable struct {
// HealthCheckManager supplies the per-upstream status that Apply inspects.
HealthCheckManager checks.HealthCheckManager
// MetricsContainer holds the counters Apply increments for each outcome.
MetricsContainer *metrics.Container
}

// Apply reports whether the error check of the given upstream is passing for
// the methods in requestMetadata, and records the outcome in the
// error_check_is_passing / error_check_is_failing counters.
//
// The result is computed once and returned only after the counter has been
// updated, so the metric is always recorded. Recording is skipped when
// MetricsContainer is nil, so a filter constructed with only a
// HealthCheckManager keeps working instead of panicking.
func (f *IsErrorRateAcceptable) Apply(requestMetadata metadata.RequestMetadata, upstreamConfig *config.UpstreamConfig, _ int) bool {
	upstreamStatus := f.HealthCheckManager.GetUpstreamStatus(upstreamConfig.ID)
	isPassing := upstreamStatus.ErrorCheck.IsPassing(requestMetadata.Methods)

	if f.MetricsContainer == nil {
		return isPassing
	}

	// Pick the counter first so the label values are spelled out only once.
	counter := f.MetricsContainer.ErrorLatencyCheckErrorsIsFailing
	if isPassing {
		counter = f.MetricsContainer.ErrorLatencyCheckErrorsIsPassing
	}

	counter.WithLabelValues(
		upstreamConfig.ID,
		upstreamConfig.HTTPURL,
		metrics.HTTPRequest,
	).Inc()

	return isPassing
}

// IsLatencyAcceptable decides, through its Apply method, whether an
// upstream's latency check is passing for the methods of a request.
type IsLatencyAcceptable struct {
// HealthCheckManager supplies the per-upstream status that Apply inspects.
HealthCheckManager checks.HealthCheckManager
// MetricsContainer holds the counters Apply increments for each outcome.
MetricsContainer *metrics.Container
}

// Apply reports whether the latency check of the given upstream is passing
// for the methods in requestMetadata, and records the outcome in the
// latency_check_is_passing / latency_check_is_failing counters.
//
// The "method" label is the request's method when exactly one is present,
// "unknown" when there are none and "multiple" when there are several.
//
// The result is computed once and returned only after the counter has been
// updated, so the metric is always recorded. Recording is skipped when
// MetricsContainer is nil, so a filter constructed with only a
// HealthCheckManager keeps working instead of panicking.
func (f *IsLatencyAcceptable) Apply(requestMetadata metadata.RequestMetadata, upstreamConfig *config.UpstreamConfig, _ int) bool {
	upstreamStatus := f.HealthCheckManager.GetUpstreamStatus(upstreamConfig.ID)
	isPassing := upstreamStatus.LatencyCheck.IsPassing(requestMetadata.Methods)

	if f.MetricsContainer == nil {
		return isPassing
	}

	// We can only meaningfully record latency metric for a method if exactly one method is being called.
	var method string

	switch len(requestMetadata.Methods) {
	case 0:
		method = "unknown"
	case 1:
		method = requestMetadata.Methods[0]
	default:
		method = "multiple"
	}

	// Pick the counter first so the label values are spelled out only once.
	counter := f.MetricsContainer.ErrorLatencyCheckLatencyIsFailing
	if isPassing {
		counter = f.MetricsContainer.ErrorLatencyCheckLatencyIsPassing
	}

	counter.WithLabelValues(
		upstreamConfig.ID,
		upstreamConfig.HTTPURL,
		metrics.HTTPRequest,
		method,
	).Inc()

	return isPassing
}

type IsCloseToGlobalMaxHeight struct {
Expand Down
10 changes: 8 additions & 2 deletions internal/server/object_graph.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,14 @@ func wireSingleChainDependencies(

var routingStrategy route.RoutingStrategy

errorFilter := route.IsErrorRateAcceptable{HealthCheckManager: healthCheckManager}
latencyFilter := route.IsLatencyAcceptable{HealthCheckManager: healthCheckManager}
errorFilter := route.IsErrorRateAcceptable{
HealthCheckManager: healthCheckManager,
MetricsContainer: metricContainer,
}
latencyFilter := route.IsLatencyAcceptable{
HealthCheckManager: healthCheckManager,
MetricsContainer: metricContainer,
}

// These should be ordered from most important to least important.
nodeFilters := []route.NodeFilter{
Expand Down

0 comments on commit 1bca518

Please sign in to comment.