Enable PodDisruptionBudgetAtLimit alert to find maintenance blocking PDBs (#220)

Checked against internal cluster telemetry; this seems to be a high-fidelity alert. Only very few alerts need to be silenced, owing to operator-managed PDBs.
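
The newly enabled alert fires when a PDB's current healthy pod count equals its desired healthy count, that is, when the budget allows zero further disruptions, and only while the PDB actually selects pods (expected_pods > 0). Reflowed for readability, the new expression reads:

    max by (namespace, poddisruptionbudget) (
        kube_poddisruptionbudget_status_current_healthy
      ==
        kube_poddisruptionbudget_status_desired_healthy
      and on (namespace, poddisruptionbudget)
        kube_poddisruptionbudget_status_expected_pods > 0
    )

Running the expression ad hoc in the Prometheus console is a quick way to preview which PDBs would fire once the 60m for-window elapses.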
bastjan authored Nov 27, 2024
1 parent 122ba06 commit f0b386f
Showing 12 changed files with 276 additions and 100 deletions.
1 change: 0 additions & 1 deletion class/defaults.yml
@@ -151,7 +151,6 @@ parameters:
- NodeFilesystemAlmostOutOfFiles
- NodeFilesystemAlmostOutOfSpace
- NodeFilesystemFilesFillingUp
- PodDisruptionBudgetAtLimit
- ThanosRuleRuleEvaluationLatencyHigh
- etcdDatabaseHighFragmentationRatio
- etcdExcessiveDatabaseGrowth
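Removing PodDisruptionBudgetAtLimit from the ignore list above enables the alert fleet-wide. For the handful of operator-managed PDBs that are expected to sit at their limit, a targeted Alertmanager silence is one way to handle the known-noisy cases. A minimal sketch using amtool; the Alertmanager URL, namespace, and PDB name are placeholders:

    amtool silence add \
      --alertmanager.url=https://alertmanager.example.com \
      --comment='operator-managed PDB, expected to sit at its limit' \
      --duration=720h \
      alertname=SYN_PodDisruptionBudgetAtLimit \
      namespace=example-operator \
      poddisruptionbudget=example-pdb

The namespace and poddisruptionbudget matchers correspond to the labels the expression aggregates by, so the silence stays scoped to a single budget.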
@@ -369,6 +369,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -638,13 +654,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1908,11 +1924,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -369,6 +369,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -638,13 +654,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1908,11 +1924,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -375,6 +375,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -647,13 +663,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1983,11 +1999,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -520,6 +520,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -789,13 +805,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -2059,11 +2075,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -350,6 +350,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -619,13 +635,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1885,11 +1901,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -369,6 +369,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -638,13 +654,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1908,11 +1924,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
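
When the alert does fire, kubectl's built-in PDB view is a quick way to spot budgets with no disruptions left; the sample row below is illustrative:

    kubectl get poddisruptionbudget --all-namespaces
    # NAMESPACE   NAME          MIN AVAILABLE   MAX UNAVAILABLE   ALLOWED DISRUPTIONS   AGE
    # demo        example-pdb   2               N/A               0                     14d

A PDB showing 0 under ALLOWED DISRUPTIONS is exactly the state the alert expression matches, and it will block node drains during maintenance until another replica becomes healthy.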