From e54e2200272ae9c0308367fa5e7e8093bd1aed46 Mon Sep 17 00:00:00 2001 From: Denise Li Date: Tue, 20 Aug 2024 13:30:22 -0400 Subject: [PATCH] feat: tune latency attribute buckets to reduce cardinality (#2432) Fixes https://github.com/TBD54566975/ftl/issues/2286 ### Log Buckets This PR introduces the ability to constrain `logBucket` with a max and min bucket. Log buckets group numbers (in this case, ms latency) into exponentially sized ranges: Example for base 4: ``` <1, [1,4), [4,16), [16,64), [64,256), [256,1024), [1024,4096), ... ``` In DataDog, looking across the last week, we see several latency buckets displaying low counts in the noise, especially on the buckets closer to 1. This makes sense given the intrinsic nature of log buckets - they are densest close to 1 and sparser the further up you go. To reduce cardinality (thus, cost), we can chop away that noise at the low end. It's similarly worth capping the buckets at the high end because although the buckets get exponentially larger, there can still be infinitely many of them. We haven't been hit with any _major_ issues yet, but you can imagine how this could impact cardinality during a latency spike. 
### Call Latency DataDog links: [bucketed counts](https://app.datadoghq.com/metric/explorer?fromUser=false&graph_layout=split&start=1723496312134&end=1724101112134&paused=false#N4Ig7glgJg5gpgFxALlAGwIYE8D2BXJVEADxQEYAaELcqyKBAC1pEbghkcLIF8qo4AMwgA7CAgg4RKUAiwAHOChASAtnADOcAE4RNIKtrgBHPJoQaUAbVBGN8qVoD6gnNtUZCKiOq279VKY6epbINiAiGOrKQdpYZAYgUJ4YThr42gDGSsgg6gi6mZaBZnHKGniqyIIIaAB0mRho9UZBGhbAAFQ8dRgaTpn4IggAFACUIDwAulSu7niYoeFzqgsYMaXxkzMg7VhoOaDyTYgIOUk4MAOLGhCZiWiicE5yiuU4j1APT070TMoiNweNCTfgQeyYLAvBTnR4iJTTHh8XbyR4IADCUmEMBkKLRTigPjgIlujmsoCkzxg2gw8kYTkU2mUNXqjWadW0eBELyJTlUGjqACM8JkANaIbZUR6qcQoABMABYqOltIQKdoBEzcgINPckTwgA), [avg latency rollup](https://app.datadoghq.com/metric/explorer?fromUser=false&graph_layout=multi&start=1723498319625&end=1724103119625&paused=false#N4Ig7glgJg5gpgFxALlAGwIYE8D2BXJVEADxQEYAaELcqyKBAC1pEbghkcLIF8qo4AMwgA7CAgg4RKUAiwAHOChASAtnADOcAE4RNIKtrgBHPJoQaUAbVBGN8qVoD6gnNtUZCKiOq279VKY6epbINiAiGOrKQdpYZAYgUJ4YThr42gDGSsgg6gi6mZaBZnHKGABuMMiCCGgAdJkYaA2qGk4IOE6ZOKryaIhwwABUPPXaOC148gAUlTAUAAQAbAAMAJQgPAC6VK7ueJih4fuqhxgxpfFbuyAacgMyIPLNiAg5STgw3UcaEJmJNCiOAdBQfdJAqCA4FOehMZQiNweNBbfgQeyYLCgxTKIEiJQ7Hh8O79cQAYSkwhgKBEhzQPCAA) Currently set to base 2 without min/max. * `<1`, `1-2` .. `8-16`: noise * `16-32` .. `1024-2048`: healthy number of data points * `2048-4096`, `4096-8192`: some spikes, but low data * `8192-16384`: healthy number of data points (worth investigating separately from this change) * 15 buckets total Proposed change: base 4, min `4^3` (i.e. `<64` is the smallest bucket), max `4^7` (i.e. `>=16384` is the highest bucket, accounting for outliers). 
* => Max cardinality = 6 buckets ### Async Call Latency DataDog links: [bucketed counts](https://app.datadoghq.com/metric/explorer?fromUser=false&graph_layout=split&start=1723497145524&end=1724101945524&paused=false#N4Ig7glgJg5gpgFxALlAGwIYE8D2BXJVEADxQEYAaELcqyKBAC1pEbghkcLIF8qo4AMwgA7CAgg4RKUAiwAHOChASAtnADOcAE4RNIKtrgBHPJoQaUAbVBGN8qVoD6gnNtUZCKiOq279VKY6epbINiAiGOrKQdpYZAYgUJ4YThr42gDGSsgg6gi6mZaBZnHKGniqyIIIaAB0GBpYIplOmRho9Zk4qvJoiHBQwABUPA0abfgiCAAUAJQgPAC6VK7ueJih4WuqGxgxpfGLKyAacv0yIPIdiAg5STgwbZsaEJmJaKJwTnKK5TifKAfL5OehMZQiNweNCLfgQeyYLA-BT3T4iJTLHh8U59cQAYSkwhglwR4icUB8cBEr0c1lAUm+MG0GHkjCcim0yhq9UazVa7U6dTU31eLRFmTYUA2gycnicqg0dQARnhMgBrRDHKifVTiFAAJgALFR0tpCPTtAJObkBBp3lieEA), [avg latency rollup](https://app.datadoghq.com/metric/explorer?fromUser=false&graph_layout=multi&start=1723498319625&end=1724103119625&paused=false#N4Ig7glgJg5gpgFxALlAGwIYE8D2BXJVEADxQEYAaELcqyKBAC1pEbghkcLIF8qo4AMwgA7CAgg4RKUAiwAHOChASAtnADOcAE4RNIKtrgBHPJoQaUAbVBGN8qVoD6gnNtUZCKiOq279VKY6epbINiAiGOrKQdpYZAYgUJ4YThr42gDGSsgg6gi6mZaBZnHKGABuMMiCCGgAdBgaWCKZTpkYaA2qGk4IOO04qvJoiHDAAFQ89do4XXjyABSVMBQABABsAAwAlCA8ALpUru54mKHhJ6pnGDGl8ftHIBpyozIg8p2ICDlJODDtc4aCCZRJoURwPoKX7pcFQMEQpz0JjKERuDxofb8CD2TBYKGKZTgkRKQ48PjPEbiADCUmEMBQIjOaB4QA) Currently set to base 8 without min/max. * `8-64`: noise. Note that `<1` and `1-8` buckets have never occurred, though they are possible in the current setup. * `64-512`, `512-4096`: healthy number of data points * `4096-32768`: noise * 4 buckets total Proposed change: base 4, min `4^4` (i.e. `<256` is the smallest bucket), max `4^6` (i.e. 
`>=4096` is the highest bucket) * => Max cardinality = 4 buckets ### Ingress Latency DataDog links: [bucketed counts](https://app.datadoghq.com/metric/explorer?fromUser=false&graph_layout=split&start=1723497891973&end=1724102691973&paused=false#N4Ig7glgJg5gpgFxALlAGwIYE8D2BXJVEADxQEYAaELcqyKBAC1pEbghkcLIF8qo4AMwgA7CAgg4RKUAiwAHOChASAtnADOcAE4RNIKtrgBHPJoQaUAbVBGN8qVoD6gnNtUZCKiOq279VKY6epbINiAiGOrKQdpYZAYgUJ4YThr42gDGSsgg6gi6mZaBZnHKGniqyIIIaAB0ojB2GnVGQRoWwABUPHUYGk6Z+CIIABQAlCA8ALpUru54mKHh86qLGDGl8VOzIB1YaDmg8hiHCAg5STgwg0saEJmJaKJwTnKK5TjPUE8vTvRMZQiNweNBTfgQeyYLBvBSXZ4iJQzHh8PbyZ4IADCUmEMBkaIxTigPjgInujmsoCkryaGHkjCcim0yhq9UazVaeBEbxJTlULQARnhMgBrRA7KjPVTiFAAJgALFR0tpCFTtAJmbkBBpHiieEA), [avg latency rollup](https://app.datadoghq.com/metric/explorer?fromUser=false&graph_layout=multi&start=1723498319625&end=1724103119625&paused=false#N4Ig7glgJg5gpgFxALlAGwIYE8D2BXJVEADxQEYAaELcqyKBAC1pEbghkcLIF8qo4AMwgA7CAgg4RKUAiwAHOChASAtnADOcAE4RNIKtrgBHPJoQaUAbVBGN8qVoD6gnNtUZCKiOq279VKY6epbINiAiGOrKQdpYZAYgUJ4YThr42gDGSsgg6gi6mZaBZnHKGABuMMiCCGgAdKIwdhr1qhpOCDhOmTiq8miIcMAAVDz12jhoaHjyABSVMBQABABsAAwAlCA8ALpUru54mKHhh6rHGDGl8Tv7IBpygzIg8hiDCAg5STgwPScaCCZRJoURwToKb7pUFQEFgpz0JjKERuDxoHb8CD2TBYCGKZSgkRKPY8PgPAbiADCUmEMBQImOaB4QA) Currently set to base 2 without min/max. * `<1`, `1-2` .. `16-32`: noise. * `32-64` .. `1024-2048`: healthy number of data points * `2048-4096`, `4096-8192 `: noise * `8192-16384`: healthy number of data points (worth investigating separately from this change) * 15 buckets total Proposed change: base 4, min `4^3` (i.e. `<64` is the smallest bucket), max `4^7` (i.e. `>=16384` is the highest bucket). 
* => Max cardinality = 6 buckets --- .../controller/observability/async_calls.go | 8 ++- backend/controller/observability/calls.go | 2 +- backend/controller/observability/ingress.go | 2 +- .../controller/observability/observability.go | 31 +++++++++++- .../observability/observability_test.go | 50 ++++++++++++++++++- 5 files changed, 87 insertions(+), 6 deletions(-) diff --git a/backend/controller/observability/async_calls.go b/backend/controller/observability/async_calls.go index d2943be77c..471cc557c4 100644 --- a/backend/controller/observability/async_calls.go +++ b/backend/controller/observability/async_calls.go @@ -126,7 +126,7 @@ func (m *AsyncCallMetrics) Completed(ctx context.Context, verb schema.RefKey, ca attrs = append(attrs, observability.SuccessOrFailureStatusAttr(maybeErr == nil)) m.msToComplete.Record(ctx, msToComplete, metric.WithAttributes(attrs...)) - attrs = append(attrs, attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, logBucket(8, msToComplete))) + attrs = append(attrs, attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, asyncLogBucket(msToComplete))) m.completed.Add(ctx, 1, metric.WithAttributes(attrs...)) m.queueDepth.Record(ctx, queueDepth) @@ -155,7 +155,11 @@ func RetrieveTraceContextFromContext(ctx context.Context) ([]byte, error) { } func extractAsyncCallAttrs(verb schema.RefKey, catchVerb optional.Option[schema.RefKey], origin string, scheduledAt time.Time, isCatching bool) []attribute.KeyValue { - return append(extractRefAttrs(verb, catchVerb, origin, isCatching), attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, logBucket(8, timeSinceMS(scheduledAt)))) + return append(extractRefAttrs(verb, catchVerb, origin, isCatching), attribute.String(asyncCallTimeSinceScheduledAtBucketAttr, asyncLogBucket(timeSinceMS(scheduledAt)))) +} + +func asyncLogBucket(msToComplete int64) string { + return logBucket(4, msToComplete, optional.Some(4), optional.Some(6)) } func extractRefAttrs(verb schema.RefKey, catchVerb 
optional.Option[schema.RefKey], origin string, isCatching bool) []attribute.KeyValue { diff --git a/backend/controller/observability/calls.go b/backend/controller/observability/calls.go index 30118a24a2..98a305ff33 100644 --- a/backend/controller/observability/calls.go +++ b/backend/controller/observability/calls.go @@ -67,6 +67,6 @@ func (m *CallMetrics) Request(ctx context.Context, verb *schemapb.Ref, startTime msToComplete := timeSinceMS(startTime) m.msToComplete.Record(ctx, msToComplete, metric.WithAttributes(attrs...)) - attrs = append(attrs, attribute.String(callRunTimeBucketAttr, logBucket(2, msToComplete))) + attrs = append(attrs, attribute.String(callRunTimeBucketAttr, logBucket(4, msToComplete, optional.Some(3), optional.Some(7)))) m.requests.Add(ctx, 1, metric.WithAttributes(attrs...)) } diff --git a/backend/controller/observability/ingress.go b/backend/controller/observability/ingress.go index ebae91ad80..9ad6def394 100644 --- a/backend/controller/observability/ingress.go +++ b/backend/controller/observability/ingress.go @@ -71,6 +71,6 @@ func (m *IngressMetrics) Request(ctx context.Context, method string, path string msToComplete := timeSinceMS(startTime) m.msToComplete.Record(ctx, msToComplete, metric.WithAttributes(attrs...)) - attrs = append(attrs, attribute.String(ingressRunTimeBucketAttr, logBucket(2, msToComplete))) + attrs = append(attrs, attribute.String(ingressRunTimeBucketAttr, logBucket(4, msToComplete, optional.Some(3), optional.Some(7)))) m.requests.Add(ctx, 1, metric.WithAttributes(attrs...)) } diff --git a/backend/controller/observability/observability.go b/backend/controller/observability/observability.go index 57851b0e0e..b7ce0ddf20 100644 --- a/backend/controller/observability/observability.go +++ b/backend/controller/observability/observability.go @@ -5,6 +5,8 @@ import ( "fmt" "math" "time" + + "github.com/alecthomas/types/optional" ) var ( @@ -54,14 +56,41 @@ func timeSinceMS(start time.Time) int64 { // // <1, [1-8), [8-64), 
[64-512), etc. // +// The buckets are then demarcated by `min` and `max`, such that any `num` < `base`^`min` +// will be bucketed together into the min bucket, and similarly, any `num` >= `base`^`max` +// will go in the `max` bucket. This constrains output cardinality by chopping the long +// tails on both ends of the normal distribution and lumping them all into terminal +// buckets. When `min` and `max` are not provided, the effective `min` is 0, and there is +// no max. +// // Go only supports a few bases with math.Log*, so this func performs a change of base: // log_b(x) = log_a(x) / log_a(b) -func logBucket(base int, num int64) string { +func logBucket(base int, num int64, optMin, optMax optional.Option[int]) string { if num < 1 { return "<1" } b := float64(base) + + // Check max + maxBucket, ok := optMax.Get() + if ok { + maxThreshold := int64(math.Pow(b, float64(maxBucket))) + if num >= maxThreshold { + return fmt.Sprintf(">=%d", maxThreshold) + } + } + + // Check min + minBucket, ok := optMin.Get() + if ok { + minThreshold := int64(math.Pow(b, float64(minBucket))) + if num < minThreshold { + return fmt.Sprintf("<%d", minThreshold) + } + } + logB := math.Log(float64(num)) / math.Log(b) bucketExpLo := math.Floor(logB) + return fmt.Sprintf("[%d,%d)", int(math.Pow(b, bucketExpLo)), int(math.Pow(b, bucketExpLo+1))) } diff --git a/backend/controller/observability/observability_test.go b/backend/controller/observability/observability_test.go index 70f2dec9f5..b55c82da34 100644 --- a/backend/controller/observability/observability_test.go +++ b/backend/controller/observability/observability_test.go @@ -4,50 +4,98 @@ import ( "testing" "github.com/alecthomas/assert/v2" + "github.com/alecthomas/types/optional" ) func TestLogBucket(t *testing.T) { tests := []struct { name string base int + min optional.Option[int] + max optional.Option[int] num int want string }{ + // Without demarcating min/max buckets { name: "<1", base: 8, + min: optional.None[int](), + max: 
optional.None[int](), num: 0, want: "<1", }, { name: "EqualLowEndOfRange", base: 8, + min: optional.None[int](), + max: optional.None[int](), num: 1, want: "[1,8)", }, { name: "HigherEndOfRange", base: 8, + min: optional.None[int](), + max: optional.None[int](), num: 7, want: "[1,8)", }, { name: "BigInputNum", base: 8, + min: optional.None[int](), + max: optional.None[int](), num: 16800000, want: "[16777216,134217728)", }, { name: "Base2", base: 2, + min: optional.None[int](), + max: optional.None[int](), num: 8, want: "[8,16)", }, + + // With min/max buckets + { + name: "LessThanMin", + base: 2, + min: optional.Some(2), + max: optional.None[int](), + num: 3, + want: "<4", + }, + { + name: "EqualToMax", + base: 2, + min: optional.None[int](), + max: optional.Some(2), + num: 4, + want: ">=4", + }, + { + name: "EqualToMaxWhenMinMaxEqual", + base: 2, + min: optional.Some(2), + max: optional.Some(2), + num: 4, + want: ">=4", + }, + { + name: "GreaterThanMax", + base: 2, + min: optional.Some(2), + max: optional.Some(2), + num: 4000, + want: ">=4", + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - assert.Equal(t, test.want, logBucket(test.base, int64(test.num))) + assert.Equal(t, test.want, logBucket(test.base, int64(test.num), test.min, test.max)) }) } }