From afed75b2a74801ecf6f6819679e345bab1ae908a Mon Sep 17 00:00:00 2001 From: Jon Johnson <113393155+jonathanj-square@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:52:55 -0700 Subject: [PATCH] feat: introduce runner metrics (#2201) This change introduces the following runner metrics: * `ftl.runner.startup.failures` * `ftl.runner.registration.heartbeats` * `ftl.runner.registration.failures` Metric output from local run: --- ScopeMetrics #1 ScopeMetrics SchemaURL: InstrumentationScope ftl.runner Metric #0 Descriptor: -> Name: ftl.runner.registration.heartbeats -> Description: the number of successful runner (re-)registrations -> Unit: -> DataType: Sum -> IsMonotonic: true -> AggregationTemporality: Cumulative NumberDataPoints #0 Data point attributes: -> ftl.deployment.key: Str(unknown) -> ftl.runner.state.name: Str(reserved) StartTimestamp: 2024-07-30 23:15:10.131588 +0000 UTC Timestamp: 2024-07-30 23:15:55.135067 +0000 UTC Value: 4 NumberDataPoints #1 Data point attributes: -> ftl.deployment.key: Str(dpl-echo-32feoiv4rvtuqc66) -> ftl.runner.state.name: Str(assigned) StartTimestamp: 2024-07-30 23:15:10.131588 +0000 UTC Timestamp: 2024-07-30 23:15:55.135067 +0000 UTC Value: 39 NumberDataPoints #2 Data point attributes: -> ftl.deployment.key: Str(dpl-time-2qb3im47gl8zgfb0) -> ftl.runner.state.name: Str(assigned) StartTimestamp: 2024-07-30 23:15:10.131588 +0000 UTC Timestamp: 2024-07-30 23:15:55.135067 +0000 UTC Value: 39 NumberDataPoints #3 Data point attributes: -> ftl.deployment.key: Str(unknown) -> ftl.runner.state.name: Str(idle) StartTimestamp: 2024-07-30 23:15:10.131588 +0000 UTC Timestamp: 2024-07-30 23:15:55.135067 +0000 UTC Value: 119 --------- Co-authored-by: github-actions[bot] --- backend/runner/observability/observability.go | 19 ++++ backend/runner/observability/runner.go | 87 +++++++++++++++++++ backend/runner/runner.go | 7 ++ internal/observability/attributes.go | 5 +- 4 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 
backend/runner/observability/observability.go create mode 100644 backend/runner/observability/runner.go diff --git a/backend/runner/observability/observability.go b/backend/runner/observability/observability.go new file mode 100644 index 0000000000..5a32238717 --- /dev/null +++ b/backend/runner/observability/observability.go @@ -0,0 +1,19 @@ +package observability + +import ( + "fmt" +) + +var ( + Runner *RunnerMetrics +) + +func init() { + var err error + + Runner, err = initRunnerMetrics() + + if err != nil { + panic(fmt.Errorf("could not initialize runner metrics: %w", err)) + } +} diff --git a/backend/runner/observability/runner.go b/backend/runner/observability/runner.go new file mode 100644 index 0000000000..b9bce7c4dc --- /dev/null +++ b/backend/runner/observability/runner.go @@ -0,0 +1,87 @@ +package observability + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/alecthomas/types/optional" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" + + ftlv1 "github.com/TBD54566975/ftl/backend/protos/xyz/block/ftl/v1" + "github.com/TBD54566975/ftl/internal/observability" +) + +const ( + runnerMeterName = "ftl.runner" + runnerStateNameAttribute = "ftl.runner.state.name" +) + +type RunnerMetrics struct { + startupFailures metric.Int64Counter + registrationHeartbeats metric.Int64Counter + registrationFailures metric.Int64Counter +} + +func initRunnerMetrics() (*RunnerMetrics, error) { + result := &RunnerMetrics{} + + var errs error + var err error + + meter := otel.Meter(runnerMeterName) + + counter := fmt.Sprintf("%s.startup.failures", runnerMeterName) + if result.startupFailures, err = meter.Int64Counter( + counter, + metric.WithDescription("the number of runner startup failures")); err != nil { + result.startupFailures, errs = handleInitErrors(counter, err, errs) + } + + counter = fmt.Sprintf("%s.registration.heartbeats", runnerMeterName) + if 
result.registrationHeartbeats, err = meter.Int64Counter( + counter, + metric.WithDescription("the number of successful runner (re-)registrations")); err != nil { + result.registrationHeartbeats, errs = handleInitErrors(counter, err, errs) + } + + counter = fmt.Sprintf("%s.registration.failures", runnerMeterName) + if result.registrationFailures, err = meter.Int64Counter( + counter, + metric.WithDescription("the number of failures encountered while attempting to register a runner")); err != nil { + result.registrationFailures, errs = handleInitErrors(counter, err, errs) + } + + return result, errs +} + +func (m *RunnerMetrics) Registered(ctx context.Context, key optional.Option[string], state ftlv1.RunnerState) { + m.registrationHeartbeats.Add(ctx, 1, metric.WithAttributes( + attribute.String(observability.RunnerDeploymentKeyAttribute, key.Default("unknown")), + attribute.String(runnerStateNameAttribute, runnerStateToString(state)), + )) +} + +func (m *RunnerMetrics) RegistrationFailure(ctx context.Context, key optional.Option[string], state ftlv1.RunnerState) { + m.registrationFailures.Add(ctx, 1, metric.WithAttributes( + attribute.String(observability.RunnerDeploymentKeyAttribute, key.Default("unknown")), + attribute.String(runnerStateNameAttribute, runnerStateToString(state)), + )) +} + +func (m *RunnerMetrics) StartupFailed(ctx context.Context) { + m.startupFailures.Add(ctx, 1) +} + +//nolint:unparam +func handleInitErrors(counter string, err error, errs error) (metric.Int64Counter, error) { + return noop.Int64Counter{}, errors.Join(errs, fmt.Errorf("%q counter init failed; falling back to noop: %w", counter, err)) +} + +func runnerStateToString(state ftlv1.RunnerState) string { + return strings.ToLower(strings.TrimPrefix(state.String(), "RUNNER_")) +} diff --git a/backend/runner/runner.go b/backend/runner/runner.go index 6dde2593df..4f7750aaa2 100644 --- a/backend/runner/runner.go +++ b/backend/runner/runner.go @@ -29,6 +29,7 @@ import ( ftlv1 
"github.com/TBD54566975/ftl/backend/protos/xyz/block/ftl/v1" "github.com/TBD54566975/ftl/backend/protos/xyz/block/ftl/v1/ftlv1connect" + "github.com/TBD54566975/ftl/backend/runner/observability" "github.com/TBD54566975/ftl/backend/schema" "github.com/TBD54566975/ftl/common/plugin" "github.com/TBD54566975/ftl/internal/download" @@ -60,6 +61,7 @@ func Start(ctx context.Context, config Config) error { } hostname, err := os.Hostname() if err != nil { + observability.Runner.StartupFailed(ctx) return fmt.Errorf("failed to get hostname: %w", err) } pid := os.Getpid() @@ -72,6 +74,7 @@ func Start(ctx context.Context, config Config) error { err = manageDeploymentDirectory(logger, config) if err != nil { + observability.Runner.StartupFailed(ctx) return err } @@ -92,6 +95,7 @@ func Start(ctx context.Context, config Config) error { "languages": slices.Map(config.Language, func(t string) any { return t }), }) if err != nil { + observability.Runner.StartupFailed(ctx) return fmt.Errorf("failed to marshal labels: %w", err) } @@ -420,6 +424,7 @@ func (s *Service) registrationLoop(ctx context.Context, send func(request *ftlv1 }) if err != nil { s.registrationFailure.Store(optional.Some(err)) + observability.Runner.RegistrationFailure(ctx, optional.Ptr(deploymentKey), state) return fmt.Errorf("failed to register with Controller: %w", err) } s.registrationFailure.Store(optional.None[error]()) @@ -427,10 +432,12 @@ func (s *Service) registrationLoop(ctx context.Context, send func(request *ftlv1 // Wait for the next heartbeat. 
delay := s.config.HeartbeatPeriod + time.Duration(rand.Intn(int(s.config.HeartbeatJitter))) //nolint:gosec logger.Tracef("Registered with Controller, next heartbeat in %s", delay) + observability.Runner.Registered(ctx, optional.Ptr(deploymentKey), state) select { case <-ctx.Done(): err = context.Cause(ctx) s.registrationFailure.Store(optional.Some(err)) + observability.Runner.RegistrationFailure(ctx, optional.Ptr(deploymentKey), state) return err case <-s.forceUpdate: diff --git a/internal/observability/attributes.go b/internal/observability/attributes.go index 878ac1a8f2..44f01c858f 100644 --- a/internal/observability/attributes.go +++ b/internal/observability/attributes.go @@ -1,6 +1,7 @@ package observability const ( - ModuleNameAttribute = "ftl.module.name" - StatusSucceededAttribute = "ftl.status.succeeded" + ModuleNameAttribute = "ftl.module.name" + StatusSucceededAttribute = "ftl.status.succeeded" + RunnerDeploymentKeyAttribute = "ftl.deployment.key" )