
Commit

Merge pull request #1348 from pyrra-dev/absent-duration
Improve Absent alert duration
metalmatze authored Dec 4, 2024
2 parents d268acb + 69adb19 commit 10766e0
Showing 4 changed files with 57 additions and 52 deletions.
4 changes: 2 additions & 2 deletions kubernetes/controllers/servicelevelobjective_test.go
@@ -99,7 +99,7 @@ func Test_makePrometheusRule(t *testing.T) {
{
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_requests_total{job="app"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("6m"),
Annotations: map[string]string{
"description": "foo",
},
@@ -210,7 +210,7 @@ func Test_makeConfigMap(t *testing.T) {
annotations:
description: foo
expr: absent(http_requests_total{job="app"}) == 1
- for: 2m
+ for: 6m
labels:
job: app
severity: critical
40 changes: 15 additions & 25 deletions slo/rules.go
@@ -713,11 +713,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
}.replace(expr)

rules = append(rules, monitoringv1.Rule{
- Alert: o.AlertNameAbsent(),
- Expr: intstr.FromString(expr.String()),
- For: monitoringDuration(model.Duration(
- (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
- ).String()),
+ Alert: o.AlertNameAbsent(),
+ Expr: intstr.FromString(expr.String()),
+ For: monitoringDuration(o.AbsentDuration().String()),
Labels: alertLabels,
Annotations: o.commonRuleAnnotations(),
})
@@ -755,11 +753,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
}.replace(expr)

rules = append(rules, monitoringv1.Rule{
- Alert: o.AlertNameAbsent(),
- Expr: intstr.FromString(expr.String()),
- For: monitoringDuration(model.Duration(
- (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
- ).String()),
+ Alert: o.AlertNameAbsent(),
+ Expr: intstr.FromString(expr.String()),
+ For: monitoringDuration(o.AbsentDuration().String()),
Labels: alertLabels,
Annotations: o.commonRuleAnnotations(),
})
@@ -867,11 +863,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
alertLabels["severity"] = string(critical)

rules = append(rules, monitoringv1.Rule{
- Alert: o.AlertNameAbsent(),
- Expr: intstr.FromString(expr.String()),
- For: monitoringDuration(model.Duration(
- (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
- ).String()),
+ Alert: o.AlertNameAbsent(),
+ Expr: intstr.FromString(expr.String()),
+ For: monitoringDuration(o.AbsentDuration().String()),
Labels: alertLabels,
Annotations: o.commonRuleAnnotations(),
})
@@ -894,11 +888,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
alertLabelsLe["severity"] = string(critical)

rules = append(rules, monitoringv1.Rule{
- Alert: o.AlertNameAbsent(),
- Expr: intstr.FromString(expr.String()),
- For: monitoringDuration(model.Duration(
- (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
- ).String()),
+ Alert: o.AlertNameAbsent(),
+ Expr: intstr.FromString(expr.String()),
+ For: monitoringDuration(o.AbsentDuration().String()),
Labels: alertLabelsLe,
Annotations: o.commonRuleAnnotations(),
})
@@ -1037,11 +1029,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
alertLabels["severity"] = string(critical)

rules = append(rules, monitoringv1.Rule{
- Alert: o.AlertNameAbsent(),
- Expr: intstr.FromString(expr.String()),
- For: monitoringDuration(model.Duration(
- (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
- ).String()),
+ Alert: o.AlertNameAbsent(),
+ Expr: intstr.FromString(expr.String()),
+ For: monitoringDuration(o.AbsentDuration().String()),
Labels: alertLabels,
Annotations: o.commonRuleAnnotations(),
})
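
For reference, the removed expression tied the old "for" duration to the SLO window alone: the divisor 28 * 24 * (60 / 2) is integer arithmetic and equals 20160, so a 4-week window gives 2419200s / 20160 = 120s, i.e. the old 2m, while the 2-week SLOs (see the increase2w recording rules in the test fixtures below) got 1m, independent of the target or the burn-rate windows. A minimal sketch of that old arithmetic, assuming a 4-week window (the window length is an illustrative assumption, not read from this diff):

package main

import (
	"fmt"
	"time"
)

func main() {
	window := 28 * 24 * time.Hour // assumed 4-week SLO window
	// Old expression from rules.go: window / (28 * 24 * (60 / 2)).
	// The divisor is the integer constant 20160, so the result depends only
	// on the window length, never on the SLO target or its alert windows.
	old := (window / (28 * 24 * (60 / 2))).Round(time.Minute)
	fmt.Println(old) // prints 2m0s for a 4-week window
}
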
50 changes: 25 additions & 25 deletions slo/rules_test.go
@@ -1263,7 +1263,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_requests_total{job="thanos-receive-default"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("10m"),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors", "severity": "critical"},
}},
},
@@ -1280,7 +1280,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_requests_total{job="thanos-receive-default"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("10m"),
Labels: map[string]string{"slo": "monitoring-http-errors", "severity": "critical"},
}},
},
@@ -1297,7 +1297,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_requests_total{handler=~"/api.*",job="thanos-receive-default"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("10m"),
Labels: map[string]string{"slo": "monitoring-http-errors", "severity": "critical"},
}},
},
@@ -1314,7 +1314,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(grpc_server_handled_total{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("3m"),
Labels: map[string]string{"grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "job": "api", "slo": "monitoring-grpc-errors", "severity": "critical"},
}},
},
@@ -1331,7 +1331,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(grpc_server_handled_total{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("3m"),
Labels: map[string]string{"grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "slo": "monitoring-grpc-errors", "severity": "critical"},
}},
},
@@ -1352,12 +1352,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("6m"),
Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "severity": "critical"},
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("6m"),
Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
}},
},
@@ -1375,7 +1375,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
Record: "http_request_duration_seconds:increase4w",
Expr: intstr.FromString(`histogram_fraction(0, 1, increase(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}[4w])) * histogram_count(increase(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}[4w]))`),
Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "le": "1"},
- //}, {
+ // }, {
// Alert: "SLOMetricAbsent",
// Expr: intstr.FromString(`absent(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
// For: monitoringDuration("2m"),
@@ -1399,12 +1399,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("6m"),
Labels: map[string]string{"slo": "monitoring-http-latency", "severity": "critical"},
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("6m"),
Labels: map[string]string{"slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
}},
},
@@ -1425,12 +1425,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",handler=~"/api.*",job="metrics-service-thanos-receive-default"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("6m"),
Labels: map[string]string{"slo": "monitoring-http-latency", "severity": "critical"},
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",handler=~"/api.*",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("6m"),
Labels: map[string]string{"slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
}},
},
@@ -1451,12 +1451,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(grpc_server_handling_seconds_count{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("2m"),
Labels: map[string]string{"slo": "monitoring-grpc-latency", "job": "api", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "severity": "critical"},
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(grpc_server_handling_seconds_bucket{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",le="0.6"}) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("2m"),
Labels: map[string]string{"slo": "monitoring-grpc-latency", "job": "api", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "le": "0.6", "severity": "critical"},
}},
},
@@ -1477,12 +1477,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(grpc_server_handling_seconds_count{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("2m"),
Labels: map[string]string{"slo": "monitoring-grpc-latency", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "severity": "critical"},
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(grpc_server_handling_seconds_bucket{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",le="0.6"}) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("2m"),
Labels: map[string]string{"slo": "monitoring-grpc-latency", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "le": "0.6", "severity": "critical"},
}},
},
@@ -1499,7 +1499,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(prometheus_operator_reconcile_operations_total) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("5m"),
Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
}, {
Record: "prometheus_operator_reconcile_errors:increase2w",
@@ -1508,7 +1508,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(prometheus_operator_reconcile_errors_total) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("5m"),
Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
}},
},
@@ -1525,7 +1525,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(prometheus_operator_reconcile_operations_total) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("5m"),
Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
}, {
Record: "prometheus_operator_reconcile_errors:increase2w",
@@ -1534,7 +1534,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(prometheus_operator_reconcile_errors_total) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("5m"),
Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
}},
},
@@ -1551,7 +1551,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "APIServerMetricAbsent",
Expr: intstr.FromString(`absent(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("5m"),
Labels: map[string]string{"job": "apiserver", "slo": "apiserver-write-response-errors", "severity": "critical"},
}},
},
@@ -1572,12 +1572,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(apiserver_request_duration_seconds_count{job="apiserver",resource=~"resource|",verb=~"LIST|GET"}) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("5m"),
Labels: map[string]string{"job": "apiserver", "slo": "apiserver-read-resource-latency", "severity": "critical"},
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",resource=~"resource|",verb=~"LIST|GET"}) == 1`),
- For: monitoringDuration("1m"),
+ For: monitoringDuration("5m"),
Labels: map[string]string{"job": "apiserver", "slo": "apiserver-read-resource-latency", "le": "0.1", "severity": "critical"},
}},
},
@@ -1598,7 +1598,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(up) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("10m"),
Labels: map[string]string{"severity": "critical", "slo": "up-targets"},
}},
},
@@ -1619,7 +1619,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(up{instance!~"(127.0.0.1|localhost).*"}) == 1`),
- For: monitoringDuration("2m"),
+ For: monitoringDuration("10m"),
Labels: map[string]string{"severity": "critical", "slo": "up-targets"},
}},
},
15 changes: 15 additions & 0 deletions slo/slo.go
@@ -55,6 +55,21 @@ func (o Objective) Exhausts(factor float64) model.Duration {
return model.Duration(time.Second * time.Duration(time.Duration(o.Window).Seconds()/factor))
}

+ // AbsentDuration calculates the duration after which absent alerts should fire.
+ // The idea is as follows: Use the most critical of the multi burn rate alerts.
+ // For that alert to fire, both the short AND long windows have to be above the threshold.
+ // The long window takes the longest to fire.
+ // Assuming absence of the metric means a 100% error rate,
+ // the time it takes to fire is the time the long window needs to go above the threshold (factor * (1 - target)).
+ // Finally, we add the "for" duration that the multi burn rate alerts use.
+ func (o Objective) AbsentDuration() model.Duration {
+ mostCritical := o.Windows()[0]
+ mostCriticalThreshold := mostCritical.Factor * (1 - o.Target)
+ mostCriticalDuration := time.Duration(mostCriticalThreshold*mostCritical.Long.Seconds()) * time.Second
+ mostCriticalDuration += mostCritical.For
+ return model.Duration(mostCriticalDuration.Round(time.Minute))
+ }

type IndicatorType int

const (
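
As a rough cross-check of the new calculation, the sketch below reproduces the arithmetic of AbsentDuration for one assumed parameter set: a most critical burn-rate window with factor 14, a 1h long window, a 2m "for", and a 99% target. These values are illustrative assumptions in the spirit of SRE-workbook-style defaults, not values read from this diff; with them the result comes out at 10m, which lines up with the new 10m fixtures in slo/rules_test.go if those SLOs indeed use a 99% target over a 4-week window.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Assumed parameters for the most critical multi burn rate window
	// (illustrative values, not taken from this commit).
	const (
		factor = 14.0
		target = 0.99
	)
	longWindow := 1 * time.Hour
	forDuration := 2 * time.Minute

	// Threshold the long-window burn rate has to exceed: factor * (1 - target).
	threshold := factor * (1 - target)

	// With the metric absent (treated as a 100% error rate), the long window
	// crosses the threshold after threshold * longWindow; on top of that the
	// alert still waits for the "for" duration.
	absentFor := time.Duration(threshold*longWindow.Seconds())*time.Second + forDuration

	fmt.Println(absentFor.Round(time.Minute)) // prints 10m0s
}
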
