Enable PodDisruptionBudgetAtLimit alert to find maintenance blocking PDBs (#220)

Checked against internal cluster telemetry; this seems to be a high-fidelity alert. Only very few alerts need to be silenced, owing to operator-managed PDBs.
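
The newly enabled alert fires when a PDB's current healthy pod count equals its desired healthy count, that is, when the budget allows zero further disruptions, and only while the PDB actually selects pods (expected_pods > 0). Reflowed for readability, the new expression reads:

    max by (namespace, poddisruptionbudget) (
        kube_poddisruptionbudget_status_current_healthy
      ==
        kube_poddisruptionbudget_status_desired_healthy
      and on (namespace, poddisruptionbudget)
        kube_poddisruptionbudget_status_expected_pods > 0
    )

Running the expression ad hoc in the Prometheus console is a quick way to preview which PDBs would fire once the 60m for-window elapses.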
bastjan authored Nov 27, 2024
1 parent 122ba06 commit f0b386f
Showing 12 changed files with 276 additions and 100 deletions.
1 change: 0 additions & 1 deletion class/defaults.yml
@@ -151,7 +151,6 @@ parameters:
- NodeFilesystemAlmostOutOfFiles
- NodeFilesystemAlmostOutOfSpace
- NodeFilesystemFilesFillingUp
- PodDisruptionBudgetAtLimit
- ThanosRuleRuleEvaluationLatencyHigh
- etcdDatabaseHighFragmentationRatio
- etcdExcessiveDatabaseGrowth
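Removing PodDisruptionBudgetAtLimit from the ignore list above enables the alert fleet-wide. For the handful of operator-managed PDBs that are expected to sit at their limit, a targeted Alertmanager silence is one way to handle the known-noisy cases. A minimal sketch using amtool; the Alertmanager URL, namespace, and PDB name are placeholders:

    amtool silence add \
      --alertmanager.url=https://alertmanager.example.com \
      --comment='operator-managed PDB, expected to sit at its limit' \
      --duration=720h \
      alertname=SYN_PodDisruptionBudgetAtLimit \
      namespace=example-operator \
      poddisruptionbudget=example-pdb

The namespace and poddisruptionbudget matchers correspond to the labels the expression aggregates by, so the silence stays scoped to a single budget.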
@@ -369,6 +369,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -638,13 +654,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1908,11 +1924,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -369,6 +369,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -638,13 +654,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1908,11 +1924,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -375,6 +375,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -647,13 +663,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1983,11 +1999,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -520,6 +520,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -789,13 +805,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -2059,11 +2075,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -350,6 +350,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -619,13 +635,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1885,11 +1901,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
@@ -369,6 +369,22 @@ spec:
severity: critical
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetAtLimit
annotations:
description: The pod disruption budget is at the minimum disruptions allowed
level. The number of current healthy pods is equal to the desired healthy
pods.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md
summary: The pod disruption budget is preventing further disruption to
pods.
syn_component: openshift4-monitoring
expr: |
max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0)
for: 60m
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_PodDisruptionBudgetLimit
annotations:
description: The pod disruption budget is below the minimum disruptions
@@ -638,13 +654,13 @@ spec:
rules:
- alert: SYN_KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }}
on container {{ $labels.container}} has been in waiting state for longer
than 1 hour.
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace
}} on container {{ $labels.container}} has been in waiting state for
longer than 1 hour. (reason: "{{ $labels.reason }}").'
summary: Pod container waiting longer than 1 hour
syn_component: openshift4-monitoring
expr: |
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
@@ -1908,11 +1924,11 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
{{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts
to a specific Alertmanager.
description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus
{{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}
were affected by errors.'
summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager
were affected by errors.
syn_component: openshift4-monitoring
expr: |
(
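
When the alert does fire, kubectl's built-in PDB view is a quick way to spot budgets with no disruptions left; the sample row below is illustrative:

    kubectl get poddisruptionbudget --all-namespaces
    # NAMESPACE   NAME          MIN AVAILABLE   MAX UNAVAILABLE   ALLOWED DISRUPTIONS   AGE
    # demo        example-pdb   2               N/A               0                     14d

A PDB showing 0 under ALLOWED DISRUPTIONS is exactly the state the alert expression matches, and it will block node drains during maintenance until another replica becomes healthy.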