Skip to content

Commit

Permalink
feat: introduce runner metrics (#2201)
Browse files Browse the repository at this point in the history
This change introduces the following runner metrics:

* `ftl.runner.startup.failures`
* `ftl.runner.registration.heartbeats`
* `ftl.runner.registration.failures`

Metric output from local run:

---

ScopeMetrics #1
ScopeMetrics SchemaURL:
InstrumentationScope ftl.runner

Metric #0
Descriptor:
     -> Name: ftl.runner.registration.heartbeats
     -> Description: the number of successful runner (re-)registrations
     -> Unit:
     -> DataType: Sum
     -> IsMonotonic: true
     -> AggregationTemporality: Cumulative

NumberDataPoints #0
Data point attributes:
     -> ftl.deployment.key: Str(unknown)
     -> ftl.runner.state.name: Str(reserved)
StartTimestamp: 2024-07-30 23:15:10.131588 +0000 UTC
Timestamp: 2024-07-30 23:15:55.135067 +0000 UTC
Value: 4

NumberDataPoints #1
Data point attributes:
     -> ftl.deployment.key: Str(dpl-echo-32feoiv4rvtuqc66)
     -> ftl.runner.state.name: Str(assigned)
StartTimestamp: 2024-07-30 23:15:10.131588 +0000 UTC
Timestamp: 2024-07-30 23:15:55.135067 +0000 UTC
Value: 39

NumberDataPoints #2
Data point attributes:
     -> ftl.deployment.key: Str(dpl-time-2qb3im47gl8zgfb0)
     -> ftl.runner.state.name: Str(assigned)
StartTimestamp: 2024-07-30 23:15:10.131588 +0000 UTC
Timestamp: 2024-07-30 23:15:55.135067 +0000 UTC
Value: 39

NumberDataPoints #3
Data point attributes:
     -> ftl.deployment.key: Str(unknown)
     -> ftl.runner.state.name: Str(idle)
StartTimestamp: 2024-07-30 23:15:10.131588 +0000 UTC
Timestamp: 2024-07-30 23:15:55.135067 +0000 UTC
Value: 119

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
jonathanj-square and github-actions[bot] authored Jul 31, 2024
1 parent 8c48e11 commit afed75b
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 2 deletions.
19 changes: 19 additions & 0 deletions backend/runner/observability/observability.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package observability

import (
"fmt"
)

var (
// Runner is the package-level runner metrics recorder. It is assigned once
// by this package's init function from the result of initRunnerMetrics.
Runner *RunnerMetrics
)

// init builds the runner metrics and publishes them through the Runner
// package variable. Any error reported by initRunnerMetrics is treated as
// fatal and aborts the program with a panic.
func init() {
	metrics, initErr := initRunnerMetrics()

	// Publish before checking the error, matching the assignment order of
	// the multi-value call.
	Runner = metrics

	if initErr == nil {
		return
	}
	panic(fmt.Errorf("could not initialize runner metrics: %w", initErr))
}
87 changes: 87 additions & 0 deletions backend/runner/observability/runner.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package observability

import (
"context"
"errors"
"fmt"
"strings"

"github.com/alecthomas/types/optional"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/metric/noop"

ftlv1 "github.com/TBD54566975/ftl/backend/protos/xyz/block/ftl/v1"
"github.com/TBD54566975/ftl/internal/observability"
)

const (
// runnerMeterName is the name passed to otel.Meter and the prefix used for
// every runner counter name.
runnerMeterName = "ftl.runner"
// runnerStateNameAttribute is the attribute key under which the runner state
// name is attached to registration data points.
runnerStateNameAttribute = "ftl.runner.state.name"
)

// RunnerMetrics groups the counters recorded for a runner. Instances are
// built by initRunnerMetrics.
type RunnerMetrics struct {
// startupFailures is incremented by StartupFailed.
startupFailures metric.Int64Counter
// registrationHeartbeats is incremented by Registered.
registrationHeartbeats metric.Int64Counter
// registrationFailures is incremented by RegistrationFailure.
registrationFailures metric.Int64Counter
}

// initRunnerMetrics creates the runner counters on the "ftl.runner" meter.
//
// A counter that cannot be created is replaced with a no-op counter via
// handleInitErrors, so the returned *RunnerMetrics always has every field
// populated. The returned error is the join of all creation failures, or nil
// when every counter was created successfully.
func initRunnerMetrics() (*RunnerMetrics, error) {
	meter := otel.Meter(runnerMeterName)

	// joined accumulates one wrapped error per counter that failed to
	// initialise.
	var joined error

	// newCounter creates a single counter, substituting the no-op fallback
	// and recording the failure when creation returns an error.
	newCounter := func(name, description string) metric.Int64Counter {
		created, err := meter.Int64Counter(name, metric.WithDescription(description))
		if err != nil {
			created, joined = handleInitErrors(name, err, joined)
		}
		return created
	}

	metrics := &RunnerMetrics{}
	metrics.startupFailures = newCounter(
		fmt.Sprintf("%s.startup.failures", runnerMeterName),
		"the number of runner startup failures",
	)
	metrics.registrationHeartbeats = newCounter(
		fmt.Sprintf("%s.registration.heartbeats", runnerMeterName),
		"the number of successful runner (re-)registrations",
	)
	metrics.registrationFailures = newCounter(
		fmt.Sprintf("%s.registration.failures", runnerMeterName),
		"the number of failures encountered while attempting to register a runner",
	)

	// Return through locals: joined is only final once every newCounter call
	// above has run.
	return metrics, joined
}

// Registered adds one to the registration heartbeat counter.
//
// The data point carries the deployment key, or "unknown" when key holds no
// value, and the runner state name as produced by runnerStateToString.
func (m *RunnerMetrics) Registered(ctx context.Context, key optional.Option[string], state ftlv1.RunnerState) {
	deploymentAttr := attribute.String(observability.RunnerDeploymentKeyAttribute, key.Default("unknown"))
	stateAttr := attribute.String(runnerStateNameAttribute, runnerStateToString(state))

	m.registrationHeartbeats.Add(ctx, 1, metric.WithAttributes(deploymentAttr, stateAttr))
}

// RegistrationFailure adds one to the registration failure counter.
//
// The data point carries the deployment key, or "unknown" when key holds no
// value, and the runner state name as produced by runnerStateToString.
func (m *RunnerMetrics) RegistrationFailure(ctx context.Context, key optional.Option[string], state ftlv1.RunnerState) {
	deploymentAttr := attribute.String(observability.RunnerDeploymentKeyAttribute, key.Default("unknown"))
	stateAttr := attribute.String(runnerStateNameAttribute, runnerStateToString(state))

	m.registrationFailures.Add(ctx, 1, metric.WithAttributes(deploymentAttr, stateAttr))
}

// StartupFailed adds one to the startup failure counter. No attributes are
// attached to the data point.
func (m *RunnerMetrics) StartupFailed(ctx context.Context) {
m.startupFailures.Add(ctx, 1)
}

// handleInitErrors produces the fallback for a counter that failed to
// initialise.
//
// It returns a no-op counter together with errs joined with a new error that
// names the counter and wraps err.
//
//nolint:unparam
func handleInitErrors(counter string, err error, errs error) (metric.Int64Counter, error) {
	wrapped := fmt.Errorf("%q counter init failed; falling back to noop: %w", counter, err)
	fallback := noop.Int64Counter{}
	return fallback, errors.Join(errs, wrapped)
}

// runnerStateToString converts a runner state into the value used for the
// state-name metric attribute: the state's String() form with a leading
// "RUNNER_" removed, lower-cased. A String() of "RUNNER_IDLE" would therefore
// become "idle".
func runnerStateToString(state ftlv1.RunnerState) string {
	name := state.String()
	name = strings.TrimPrefix(name, "RUNNER_")
	return strings.ToLower(name)
}
7 changes: 7 additions & 0 deletions backend/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (

ftlv1 "github.com/TBD54566975/ftl/backend/protos/xyz/block/ftl/v1"
"github.com/TBD54566975/ftl/backend/protos/xyz/block/ftl/v1/ftlv1connect"
"github.com/TBD54566975/ftl/backend/runner/observability"
"github.com/TBD54566975/ftl/backend/schema"
"github.com/TBD54566975/ftl/common/plugin"
"github.com/TBD54566975/ftl/internal/download"
Expand Down Expand Up @@ -60,6 +61,7 @@ func Start(ctx context.Context, config Config) error {
}
hostname, err := os.Hostname()
if err != nil {
observability.Runner.StartupFailed(ctx)
return fmt.Errorf("failed to get hostname: %w", err)
}
pid := os.Getpid()
Expand All @@ -72,6 +74,7 @@ func Start(ctx context.Context, config Config) error {

err = manageDeploymentDirectory(logger, config)
if err != nil {
observability.Runner.StartupFailed(ctx)
return err
}

Expand All @@ -92,6 +95,7 @@ func Start(ctx context.Context, config Config) error {
"languages": slices.Map(config.Language, func(t string) any { return t }),
})
if err != nil {
observability.Runner.StartupFailed(ctx)
return fmt.Errorf("failed to marshal labels: %w", err)
}

Expand Down Expand Up @@ -420,17 +424,20 @@ func (s *Service) registrationLoop(ctx context.Context, send func(request *ftlv1
})
if err != nil {
s.registrationFailure.Store(optional.Some(err))
observability.Runner.RegistrationFailure(ctx, optional.Ptr(deploymentKey), state)
return fmt.Errorf("failed to register with Controller: %w", err)
}
s.registrationFailure.Store(optional.None[error]())

// Wait for the next heartbeat.
delay := s.config.HeartbeatPeriod + time.Duration(rand.Intn(int(s.config.HeartbeatJitter))) //nolint:gosec
logger.Tracef("Registered with Controller, next heartbeat in %s", delay)
observability.Runner.Registered(ctx, optional.Ptr(deploymentKey), state)
select {
case <-ctx.Done():
err = context.Cause(ctx)
s.registrationFailure.Store(optional.Some(err))
observability.Runner.RegistrationFailure(ctx, optional.Ptr(deploymentKey), state)
return err

case <-s.forceUpdate:
Expand Down
5 changes: 3 additions & 2 deletions internal/observability/attributes.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package observability

// Attribute keys used when recording observability data.
//
// Each constant is declared exactly once; declaring ModuleNameAttribute or
// StatusSucceededAttribute a second time in this block is a redeclaration
// error.
const (
	// ModuleNameAttribute is the attribute key "ftl.module.name".
	ModuleNameAttribute = "ftl.module.name"
	// StatusSucceededAttribute is the attribute key "ftl.status.succeeded".
	StatusSucceededAttribute = "ftl.status.succeeded"
	// RunnerDeploymentKeyAttribute is the attribute key "ftl.deployment.key",
	// used by the runner metrics to tag data points with a deployment key.
	RunnerDeploymentKeyAttribute = "ftl.deployment.key"
)

0 comments on commit afed75b

Please sign in to comment.