From 1bca51830c258ba6a2f32674ad6c1ce8c7bd7b42 Mon Sep 17 00:00:00 2001 From: Peter Olsar Date: Tue, 15 Oct 2024 21:46:26 -0700 Subject: [PATCH] feat: prometheus metrics for counting passes and failures --- internal/metrics/metrics.go | 72 ++++++++++++++++++++++++++++----- internal/route/node_filter.go | 53 +++++++++++++++++++++++- internal/server/object_graph.go | 10 ++++- 3 files changed, 120 insertions(+), 15 deletions(-) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 5188417..0bf0a10 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -21,11 +21,11 @@ const ( // Metric labels - // General health check error types + // HTTPInit General health check error types HTTPInit = "httpInit" HTTPRequest = "httpReq" - // BlockHeightCheck-specific errors + // WSSubscribe BlockHeightCheck-specific errors WSSubscribe = "wsSubscribe" WSError = "wsError" ) @@ -239,6 +239,7 @@ var ( []string{"chain_name", "upstream_id", "url", "errorType"}, ) + // Enhanced routing control metrics errorLatencyStatus = promauto.NewGaugeVec( prometheus.GaugeOpts{ Namespace: metricsNamespace, @@ -253,8 +254,8 @@ var ( prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: "healthcheck", - Name: "latency_check_errors", - Help: "Errors of upstream requests.", + Name: "error_check_has_errors", + Help: "Number of errors of upstream requests.", }, []string{"chain_name", "upstream_id", "url", "errorType", "method"}, ) @@ -263,12 +264,32 @@ var ( prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: "healthcheck", - Name: "latency_check_no_errors", - Help: "No errors of upstream requests.", + Name: "error_check_no_errors", + Help: "Number of no errors of upstream requests.", }, []string{"chain_name", "upstream_id", "url", "errorType", "method"}, ) + errorLatencyStatusCheckErrorsIsPassing = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: "healthcheck", + Name: "error_check_is_passing", + Help: "Number of passing error checks.", + }, + []string{"chain_name", "upstream_id", "url", "errorType"}, + ) + + errorLatencyStatusCheckErrorsIsFailing = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: "healthcheck", + Name: "error_check_is_failing", + Help: "Number of failing error checks.", + }, + []string{"chain_name", "upstream_id", "url", "errorType"}, + ) + errorLatencyStatusHighLatencies = promauto.NewCounterVec( prometheus.CounterOpts{ Namespace: metricsNamespace, @@ -289,6 +310,26 @@ var ( []string{"chain_name", "upstream_id", "url", "errorType", "method"}, ) + errorLatencyStatusCheckLatencyIsPassing = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: "healthcheck", + Name: "latency_check_is_passing", + Help: "Number of passing latency checks.", + }, + []string{"chain_name", "upstream_id", "url", "errorType", "method"}, + ) + + errorLatencyStatusCheckLatencyIsFailing = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: "healthcheck", + Name: "latency_check_is_failing", + Help: "Number of failing latency checks.", + }, + []string{"chain_name", "upstream_id", "url", "errorType", "method"}, + ) + // System metrics fileDescriptorsUsed = promauto.NewGauge( prometheus.GaugeOpts{ @@ -324,11 +365,16 @@ type Container struct { SyncStatusCheckDuration prometheus.ObserverVec SyncStatusCheckErrors *prometheus.CounterVec - ErrorLatency *prometheus.GaugeVec - ErrorLatencyCheckNoErrors *prometheus.CounterVec - ErrorLatencyCheckErrors *prometheus.CounterVec - ErrorLatencyCheckOkLatencies *prometheus.CounterVec - ErrorLatencyCheckHighLatencies *prometheus.CounterVec + // Enhanced routing control metrics + ErrorLatency *prometheus.GaugeVec + ErrorLatencyCheckNoErrors *prometheus.CounterVec + ErrorLatencyCheckErrors *prometheus.CounterVec + ErrorLatencyCheckErrorsIsPassing *prometheus.CounterVec + ErrorLatencyCheckErrorsIsFailing *prometheus.CounterVec + ErrorLatencyCheckOkLatencies *prometheus.CounterVec + ErrorLatencyCheckHighLatencies *prometheus.CounterVec + ErrorLatencyCheckLatencyIsPassing *prometheus.CounterVec + ErrorLatencyCheckLatencyIsFailing *prometheus.CounterVec } func NewContainer(chainName string) *Container { @@ -364,8 +410,12 @@ func NewContainer(chainName string) *Container { result.ErrorLatency = errorLatencyStatus.MustCurryWith(presetLabels) result.ErrorLatencyCheckErrors = errorLatencyStatusCheckErrors.MustCurryWith(presetLabels) result.ErrorLatencyCheckNoErrors = errorLatencyStatusCheckNoErrors.MustCurryWith(presetLabels) + result.ErrorLatencyCheckErrorsIsPassing = errorLatencyStatusCheckErrorsIsPassing.MustCurryWith(presetLabels) + result.ErrorLatencyCheckErrorsIsFailing = errorLatencyStatusCheckErrorsIsFailing.MustCurryWith(presetLabels) result.ErrorLatencyCheckHighLatencies = errorLatencyStatusHighLatencies.MustCurryWith(presetLabels) result.ErrorLatencyCheckOkLatencies = errorLatencyStatusOkLatencies.MustCurryWith(presetLabels) + result.ErrorLatencyCheckLatencyIsPassing = errorLatencyStatusCheckLatencyIsPassing.MustCurryWith(presetLabels) + result.ErrorLatencyCheckLatencyIsFailing = errorLatencyStatusCheckLatencyIsFailing.MustCurryWith(presetLabels) return result } diff --git a/internal/route/node_filter.go b/internal/route/node_filter.go index 2216611..165ce79 100644 --- a/internal/route/node_filter.go +++ b/internal/route/node_filter.go @@ -7,6 +7,7 @@ import ( "github.com/satsuma-data/node-gateway/internal/checks" "github.com/satsuma-data/node-gateway/internal/config" "github.com/satsuma-data/node-gateway/internal/metadata" + "github.com/satsuma-data/node-gateway/internal/metrics" "go.uber.org/zap" ) @@ -124,20 +125,68 @@ func (f *IsDoneSyncing) Apply(_ metadata.RequestMetadata, upstreamConfig *config type IsErrorRateAcceptable struct { HealthCheckManager checks.HealthCheckManager + MetricsContainer *metrics.Container } func (f *IsErrorRateAcceptable) Apply(requestMetadata metadata.RequestMetadata, upstreamConfig *config.UpstreamConfig, _ int) bool { upstreamStatus := f.HealthCheckManager.GetUpstreamStatus(upstreamConfig.ID) - return upstreamStatus.ErrorCheck.IsPassing(requestMetadata.Methods) + isPassing := upstreamStatus.ErrorCheck.IsPassing(requestMetadata.Methods) + + if isPassing { + f.MetricsContainer.ErrorLatencyCheckErrorsIsPassing.WithLabelValues( + upstreamConfig.ID, + upstreamConfig.HTTPURL, + metrics.HTTPRequest, + ).Inc() + } else { + f.MetricsContainer.ErrorLatencyCheckErrorsIsFailing.WithLabelValues( + upstreamConfig.ID, + upstreamConfig.HTTPURL, + metrics.HTTPRequest, + ).Inc() + } + + return isPassing } type IsLatencyAcceptable struct { HealthCheckManager checks.HealthCheckManager + MetricsContainer *metrics.Container } func (f *IsLatencyAcceptable) Apply(requestMetadata metadata.RequestMetadata, upstreamConfig *config.UpstreamConfig, _ int) bool { upstreamStatus := f.HealthCheckManager.GetUpstreamStatus(upstreamConfig.ID) - return upstreamStatus.LatencyCheck.IsPassing(requestMetadata.Methods) + isPassing := upstreamStatus.LatencyCheck.IsPassing(requestMetadata.Methods) + + var method string + + // We can only meaningfully record latency metric for a method if exactly one method is being called. + switch { + case len(requestMetadata.Methods) == 0: + method = "unknown" + case len(requestMetadata.Methods) > 1: + method = "multiple" + default: + method = requestMetadata.Methods[0] + } + + if isPassing { + f.MetricsContainer.ErrorLatencyCheckLatencyIsPassing.WithLabelValues( + upstreamConfig.ID, + upstreamConfig.HTTPURL, + metrics.HTTPRequest, + method, + ).Inc() + } else { + f.MetricsContainer.ErrorLatencyCheckLatencyIsFailing.WithLabelValues( + upstreamConfig.ID, + upstreamConfig.HTTPURL, + metrics.HTTPRequest, + method, + ).Inc() + } + + return isPassing } type IsCloseToGlobalMaxHeight struct { diff --git a/internal/server/object_graph.go b/internal/server/object_graph.go index 035977c..f1c567c 100644 --- a/internal/server/object_graph.go +++ b/internal/server/object_graph.go @@ -72,8 +72,14 @@ func wireSingleChainDependencies( var routingStrategy route.RoutingStrategy - errorFilter := route.IsErrorRateAcceptable{HealthCheckManager: healthCheckManager} - latencyFilter := route.IsLatencyAcceptable{HealthCheckManager: healthCheckManager} + errorFilter := route.IsErrorRateAcceptable{ + HealthCheckManager: healthCheckManager, + MetricsContainer: metricContainer, + } + latencyFilter := route.IsLatencyAcceptable{ + HealthCheckManager: healthCheckManager, + MetricsContainer: metricContainer, + } // These should be ordered from most important to least important. nodeFilters := []route.NodeFilter{