diff --git a/class/defaults.yml b/class/defaults.yml index 49dfe2e..891cee5 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -151,7 +151,6 @@ parameters: - NodeFilesystemAlmostOutOfFiles - NodeFilesystemAlmostOutOfSpace - NodeFilesystemFilesFillingUp - - PodDisruptionBudgetAtLimit - ThanosRuleRuleEvaluationLatencyHigh - etcdDatabaseHighFragmentationRatio - etcdExcessiveDatabaseGrowth diff --git a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 40d2cd7..1df7499 100644 --- a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -369,6 +369,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -638,13 +654,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1908,11 +1924,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 40d2cd7..1df7499 100644 --- a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -369,6 +369,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -638,13 +654,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1908,11 +1924,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index c5960e8..64aef6c 100644 --- a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -375,6 +375,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -647,13 +663,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1983,11 +1999,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 47c414a..6abfa06 100644 --- a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -520,6 +520,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -789,13 +805,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -2059,11 +2075,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 66eb33b..13a95e1 100644 --- a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -350,6 +350,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -619,13 +635,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1885,11 +1901,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 40d2cd7..1df7499 100644 --- a/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -369,6 +369,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -638,13 +654,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1908,11 +1924,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index a961bbe..ef3122d 100644 --- a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -375,6 +375,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -644,13 +660,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1980,11 +1996,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 40d2cd7..1df7499 100644 --- a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -369,6 +369,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -638,13 +654,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1908,11 +1924,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 965f34b..573eb60 100644 --- a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -389,6 +389,23 @@ spec: syn: 'true' syn_component: openshift4-monitoring syn_team: clumsy-donkeys + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + syn_team: clumsy-donkeys - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -670,13 +687,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -2013,11 +2030,11 @@ spec: syn_team: clumsy-donkeys - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index ef14096..d2790e5 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -369,6 +369,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -638,13 +654,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1908,11 +1924,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | ( diff --git a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 61e8d57..710272c 100644 --- a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -369,6 +369,22 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetAtLimit + annotations: + description: The pod disruption budget is at the minimum disruptions allowed + level. The number of current healthy pods is equal to the desired healthy + pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetAtLimit.md + summary: The pod disruption budget is preventing further disruption to + pods. + syn_component: openshift4-monitoring + expr: | + max by(namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy == kube_poddisruptionbudget_status_desired_healthy and on (namespace, poddisruptionbudget) kube_poddisruptionbudget_status_expected_pods > 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_PodDisruptionBudgetLimit annotations: description: The pod disruption budget is below the minimum disruptions @@ -638,13 +654,13 @@ spec: rules: - alert: SYN_KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} - on container {{ $labels.container}} has been in waiting state for longer - than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace + }} on container {{ $labels.container}} has been in waiting state for + longer than 1 hour. (reason: "{{ $labels.reason }}").' summary: Pod container waiting longer than 1 hour syn_component: openshift4-monitoring expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -1908,11 +1924,11 @@ spec: syn_component: openshift4-monitoring - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager - {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts - to a specific Alertmanager. + description: '{{ printf "%.1f" $value }}% of alerts sent by Prometheus + {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}} + were affected by errors.' + summary: More than 1% of alerts sent by Prometheus to a specific Alertmanager + were affected by errors. syn_component: openshift4-monitoring expr: | (