diff --git a/class/defaults.yml b/class/defaults.yml index b408bdfc..cafdb483 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -10,9 +10,9 @@ parameters: manifests_version: release-4.15 # no release branches newer than 4.9 exist =_operator_lifecycle_manager_map: - release-4.13: release-4.9 release-4.14: release-4.9 release-4.15: release-4.9 + release-4.16: release-4.9 jsonnetfile_parameters: cmo_version: ${openshift4_monitoring:manifests_version} etcd_version: ${openshift4_monitoring:manifests_version} @@ -205,9 +205,9 @@ parameters: NodeMemoryMajorPagesFaults: # Only alert for >100*cores major page faults/node instead of >500/node expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > on (instance) (count by (instance) (node_cpu_info{}) * 100) - release-4.13: {} release-4.14: {} release-4.15: {} + release-4.16: {} # Alerts to ignore for user workload monitoring ignoreUserWorkload: [] diff --git a/class/openshift4-monitoring.yml b/class/openshift4-monitoring.yml index 4dc1b1f5..4740d5ce 100644 --- a/class/openshift4-monitoring.yml +++ b/class/openshift4-monitoring.yml @@ -2,10 +2,6 @@ parameters: openshift4_monitoring: =_manifest_urls: kube-apiserver: - release-4.13: - api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/api-usage.yaml - cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/cpu-utilization.yaml - slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/kube-apiserver-slos-basic.yaml release-4.14: api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/api-usage.yaml cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/cpu-utilization.yaml @@ -14,21 +10,28 @@ parameters: api-usage: 
https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/api-usage.yaml cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/cpu-utilization.yaml slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/kube-apiserver-slos-basic.yaml + release-4.16: + api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/api-usage.yaml + cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/cpu-utilization.yaml + slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/kube-apiserver-slos-basic.yaml machine-api-operator: - release-4.13: - prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.13/install/0000_90_machine-api-operator_04_alertrules.yaml release-4.14: prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.14/install/0000_90_machine-api-operator_04_alertrules.yaml release-4.15: prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.15/install/0000_90_machine-api-operator_04_alertrules.yaml + release-4.16: + prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.16/install/0000_90_machine-api-operator_04_alertrules.yaml + + machine-config-operator: + release-4.14: + prometheus: https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.14/install/0000_90_machine-config-operator_01_prometheus-rules.yaml + release-4.15: + prometheus: https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.15/install/0000_90_machine-config-operator_01_prometheus-rules.yaml + release-4.16: + prometheus: 
https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.16/install/0000_90_machine-config_01_prometheus-rules.yaml ovn-kubernetes: - release-4.13: - common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml - # We use the "self-hosted" variant of the control-plane alerts, so - # we don't have to worry about unresolved gotemplate references. - control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml release-4.14: common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml # We handle the gotemplate stuff in Jsonnet for now, since Jinja @@ -47,11 +50,16 @@ parameters: # The only templates that are in the alerting rules can be handled # with a simple string replace. control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.15/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml + release-4.16: + common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml + # We use the "self-hosted" variant of the control-plane alerts, so + # we don't have to worry about unresolved gotemplate references. 
+ control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml cloud-credential-operator: - release-4.13: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.13/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml release-4.14: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.14/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml release-4.15: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.15/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml + release-4.16: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.16/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml kapitan: @@ -103,7 +111,7 @@ parameters: source: ${openshift4_monitoring:_manifest_urls:machine-api-operator:${openshift4_monitoring:manifests_version}:prometheus} output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/machine-api-operator.yaml - type: https - source: https://raw.githubusercontent.com/openshift/machine-config-operator/${openshift4_monitoring:manifests_version}/install/0000_90_machine-config-operator_01_prometheus-rules.yaml + source: ${openshift4_monitoring:_manifest_urls:machine-config-operator:${openshift4_monitoring:manifests_version}:prometheus} output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/machine-config-operator.yaml - type: https source: https://raw.githubusercontent.com/operator-framework/operator-lifecycle-manager/${openshift4_monitoring:_operator_lifecycle_manager_map:${openshift4_monitoring:manifests_version}}/manifests/0000_90_olm_01-prometheus-rule.yaml diff --git a/docs/modules/ROOT/pages/references/parameters.adoc b/docs/modules/ROOT/pages/references/parameters.adoc index 
5ccccd93..5ecaba84 100644 --- a/docs/modules/ROOT/pages/references/parameters.adoc +++ b/docs/modules/ROOT/pages/references/parameters.adoc @@ -6,7 +6,7 @@ The parent key for all of the following parameters is `openshift4_monitoring`. [horizontal] type:: string -default:: `release-4.14` +default:: `release-4.15` Select which version of the upstream alerting (and recording) rules should be used by the component. This parameter must be changed to match the cluster's OCP4 minor version. @@ -480,7 +480,7 @@ patchRules: PrometheusRemoteWriteBehind: annotations: runbook_url: https://example.com/runbooks/PrometheusRemoteWriteBehind.html - release-4.14: + release-4.16: SystemMemoryExceedsReservation: for: 30m ---- diff --git a/tests/custom-rules.yml b/tests/custom-rules.yml index 0e89bb21..8cf611b2 100644 --- a/tests/custom-rules.yml +++ b/tests/custom-rules.yml @@ -11,7 +11,7 @@ parameters: name: patch-sa openshift4_monitoring: - manifests_version: release-4.13 + manifests_version: release-4.16 customNodeExporter: enabled: true @@ -25,7 +25,7 @@ parameters: labels: foo: foo generic: patch - release-4.13: + release-4.16: HighOverallControlPlaneMemory: labels: foo: bar diff --git a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 6b2db14c..40d2cd74 100644 --- a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -852,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless 
kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1040,7 +1040,7 @@ spec: }} of its incoming requests. syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 6b2db14c..40d2cd74 100644 --- a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -852,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1040,7 +1040,7 @@ spec: }} of its incoming requests. 
syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/appuio_node_exporter.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/appuio_node_exporter.yaml index bfaa3b57..e0792bd8 100644 --- a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/appuio_node_exporter.yaml +++ b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/appuio_node_exporter.yaml @@ -4,7 +4,6 @@ metadata: labels: app.kubernetes.io/part-of: openshift4-monitoring name: appuio-node-exporter - namespace: openshift-monitoring rules: - apiGroups: - authentication.k8s.io @@ -33,7 +32,6 @@ metadata: labels: app.kubernetes.io/part-of: openshift4-monitoring name: appuio-node-exporter - namespace: openshift-monitoring roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole @@ -59,6 +57,7 @@ spec: metadata: annotations: kubectl.kubernetes.io/default-container: appuio-node-exporter + openshift.io/required-scc: node-exporter labels: app.kubernetes.io/managed-by: cluster-monitoring-operator app.kubernetes.io/part-of: openshift4-monitoring @@ -152,6 +151,9 @@ spec: fi echo "ts=$(date -Iseconds) num_cpus=$NUM_CPUS gomaxprocs=$GOMAXPROCS" exec /bin/node_exporter "$0" "$@" + env: + - name: DBUS_SYSTEM_BUS_ADDRESS + value: unix:path=/host/root/var/run/dbus/system_bus_socket image: quay.io/prometheus/node-exporter:v1.8.2 name: appuio-node-exporter resources: @@ -162,7 +164,6 @@ spec: 
cpu: 8m memory: 32Mi securityContext: {} - terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /host/sys mountPropagation: HostToContainer @@ -177,7 +178,6 @@ spec: readOnly: true workingDir: /var/node_exporter/textfile - args: - - --logtostderr - --secure-listen-address=[$(IP)]:9199 - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:9199/ @@ -209,7 +209,8 @@ spec: runAsGroup: 65532 runAsNonRoot: true runAsUser: 65532 - terminationMessagePolicy: FallbackToLogsOnError + seccompProfile: + type: RuntimeDefault volumeMounts: - mountPath: /etc/tls/private name: node-exporter-tls @@ -240,7 +241,6 @@ spec: securityContext: privileged: true runAsUser: 0 - terminationMessagePolicy: FallbackToLogsOnError volumeMounts: - mountPath: /var/node_exporter/textfile name: node-exporter-textfile @@ -306,6 +306,8 @@ apiVersion: v1 kind: Service metadata: annotations: + openshift.io/description: Expose the `/metrics` endpoint on port 9199. This port + is for internal use, and no other usage is guaranteed. 
service.beta.openshift.io/serving-cert-secret-name: appuio-node-exporter-tls labels: app.kubernetes.io/part-of: openshift4-monitoring diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 61120e67..c5960e8c 100644 --- a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -221,22 +221,44 @@ spec: syn_component: openshift4-monitoring - name: syn-cluster-operators rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_ClusterOperatorDegraded annotations: description: The {{ $labels.name }} operator is degraded because {{ $labels.reason }}, and the components it manages may have reduced quality of service. Cluster - upgrades may not complete. For more information refer to 'oc get -o - yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" - | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or - {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ - end }}. + upgrades may not complete. 
For more information refer to '{{ if eq $labels.name + "version" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ + $labels.name }}{{ end }}'{{ with $console_url := "console_url" | query + }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label + "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md summary: Cluster operator has been degraded for 30 minutes. syn_component: openshift4-monitoring expr: | max by (namespace, name, reason) ( ( - cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} + cluster_operator_conditions{job="cluster-version-operator", name!="version", condition="Degraded"} + or on (namespace, name) + cluster_operator_conditions{job="cluster-version-operator", name="version", condition="Failing"} or on (namespace, name) group by (namespace, name) (cluster_operator_up{job="cluster-version-operator"}) ) == 1 @@ -249,12 +271,14 @@ spec: - alert: SYN_ClusterOperatorDown annotations: description: The {{ $labels.name }} operator may be down or disabled because - ${{ $labels.reason }}, and the components it manages may be unavailable + {{ $labels.reason }}, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information - refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with - $console_url := "console_url" | query }}{{ if ne (len (label "url" (first - $console_url ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{ + refer to '{{ if eq $labels.name "version" }}oc adm upgrade{{ else }}oc + get -o yaml clusteroperator {{ $labels.name }}{{ end }}'{{ with $console_url + := "console_url" | query }}{{ if ne (len (label "url" (first $console_url + ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md summary: Cluster operator has not been available for 10 minutes. syn_component: openshift4-monitoring expr: | @@ -267,10 +291,11 @@ spec: - alert: SYN_ClusterOperatorFlapping annotations: description: The {{ $labels.name }} operator behavior might cause upgrades - to be unstable. For more information refer to 'oc get -o yaml clusteroperator - {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ - if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label "url" - (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. + to be unstable. For more information refer to '{{ if eq $labels.name + "version" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ + $labels.name }}{{ end }}'{{ with $console_url := "console_url" | query + }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label + "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. summary: Cluster operator up status is changing often. syn_component: openshift4-monitoring expr: | @@ -285,7 +310,7 @@ spec: - alert: SYN_ClusterReleaseNotAccepted annotations: description: The desired cluster release has not been accepted because - ${{ $labels.reason }}, and the cluster will continue to reconcile an + {{ $labels.reason }}, and the cluster will continue to reconcile an earlier release instead of moving towards that desired release. For more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or @@ -339,6 +364,7 @@ spec: - alert: SYN_KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md summary: Target disappeared from Prometheus target discovery. 
syn_component: openshift4-monitoring expr: | @@ -429,7 +455,7 @@ spec: summary: etcd cluster database is running full. syn_component: openshift4-monitoring expr: | - (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical @@ -613,6 +639,7 @@ spec: > 0.01 for: 15m labels: + namespace: openshift-monitoring severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -834,7 +861,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"} @@ -1022,7 +1049,7 @@ spec: }} of its incoming requests. syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning @@ -1046,6 +1073,7 @@ spec: description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAggregatedAPIErrors.md summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | @@ -1079,6 +1107,7 @@ spec: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1132,6 +1161,7 @@ spec: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1145,6 +1175,7 @@ spec: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: + namespace: kube-system severity: warning syn: 'true' syn_component: openshift4-monitoring @@ -1173,7 +1204,7 @@ spec: mapi_mao_collector_up == 0 for: 5m labels: - severity: critical + severity: warning syn: 'true' syn_component: openshift4-monitoring - name: syn-machine-health-check-unterminated-short-circuit @@ -1244,14 +1275,34 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcc-boot-image-update-error + rules: + - alert: SYN_MCCBootImageUpdateError + annotations: + description: 'The boot images of Machineset {{ $labels.machineset }} could + not be updated. 
For more details check MachineConfigController pod logs: + oc logs -n {{ $labels.namespace }} -f $(oc get pod -o name -l=''k8s-app=machine-config-controller'' + -n openshift-machine-config-operator) | grep machine_set' + summary: Triggers when machineset boot images could not be updated + syn_component: openshift4-monitoring + expr: | + mcc_bootimage_update_error > 0 + for: 30m + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcc-drain-error rules: - alert: SYN_MCCDrainError annotations: - message: 'Drain failed on {{ $labels.exported_node }} , updates may be - blocked. For more details check MachineConfigController pod logs: oc - logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c - machine-config-controller' + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. syn_component: openshift4-monitoring expr: | mcc_drain_err > 0 @@ -1260,11 +1311,32 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. 
+ syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcd-kubelet-health-state-error rules: - alert: SYN_KubeletHealthState annotations: - message: Kubelet health failure threshold reached + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. syn_component: openshift4-monitoring expr: | mcd_kubelet_state > 2 @@ -1273,13 +1345,34 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring + - name: syn-mcd-missing-mc + rules: + - alert: SYN_MissingMachineConfig + annotations: + description: Could not find config {{ $labels.mc }} in-cluster, this likely + indicates the MachineConfigs in-cluster has changed during the install + process. If you are seeing this when installing the cluster, please + compare the in-cluster rendered machineconfigs to /etc/mcs-machine-config-content.json + summary: This keeps track of Machine Config failures. Specifically a common + failure on install when a rendered Machine Config is missing. Triggered + when this error happens once. + syn_component: openshift4-monitoring + expr: | + mcd_missing_mc > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-mcd-pivot-error rules: - alert: SYN_MCDPivotError annotations: - message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. 
syn_component: openshift4-monitoring expr: | mcd_pivot_errors_total > 0 @@ -1293,9 +1386,11 @@ spec: rules: - alert: SYN_MCDRebootError annotations: - message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. syn_component: openshift4-monitoring expr: | mcd_reboots_failed_total > 0 @@ -1327,10 +1422,12 @@ spec: runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md summary: Clock not synchronising. syn_component: openshift4-monitoring - expr: | + expr: |- + ( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: critical @@ -1342,7 +1439,8 @@ spec: 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. syn_component: openshift4-monitoring - expr: | + expr: |- + ( ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and @@ -1354,25 +1452,12 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - summary: Disk IO queue is high. 
- syn_component: openshift4-monitoring - expr: | - rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 - for: 30m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently @@ -1505,19 +1590,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-monitoring - - alert: SYN_NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - summary: Host is running out of memory. - syn_component: openshift4-monitoring - expr: | - 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m - labels: - severity: warning - syn: 'true' - syn_component: openshift4-monitoring - alert: SYN_NodeMemoryMajorPagesFaults annotations: description: | @@ -1568,7 +1640,7 @@ spec: syn_component: openshift4-monitoring expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: 15m labels: severity: warning syn: 'true' @@ -1690,6 +1762,31 @@ spec: severity: critical syn: 'true' syn_component: openshift4-monitoring + - alert: SYN_etcdSignerCAExpirationCritical + annotations: + description: etcd is reporting the signer ca "{{ $labels.name }}" to have + less than year (({{ printf "%.f" $value }} days) of validity left. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdSignerCAExpirationCritical.md + summary: etcd has critical signer ca expiration + syn_component: openshift4-monitoring + expr: avg(openshift_etcd_operator_signer_expiration_days) by (name) < 365 + for: 1h + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdSignerCAExpirationWarning + annotations: + description: etcd is reporting the signer ca "{{ $labels.name }}" to have + less than two years (({{ printf "%.f" $value }} days) of validity left. + summary: etcd signer ca is about to expire + syn_component: openshift4-monitoring + expr: avg(openshift_etcd_operator_signer_expiration_days) by (name) < 730 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - name: syn-openshift-general.rules rules: - alert: SYN_TargetDown @@ -1700,6 +1797,7 @@ spec: network connectivity issues, down nodes, or failures within these components. Assess the health of the infrastructure and nodes running these targets and then contact support.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/TargetDown.md summary: Some targets were not reachable from the monitoring server for an extended period of time. syn_component: openshift4-monitoring @@ -1722,7 +1820,8 @@ spec: annotations: description: This alert fires when there is a Route owned by an unmanaged Ingress. - message: Route {{ $labels.name }} is owned by an unmanaged Ingress. + message: Route {{ $labels.namespace }}/{{ $labels.name }} is owned by + an unmanaged Ingress. 
summary: Route owned by an Ingress no longer managed syn_component: openshift4-monitoring expr: openshift_ingress_to_route_controller_route_with_unmanaged_owner == @@ -1872,6 +1971,7 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusDuplicateTimestamps.md summary: Prometheus is dropping samples with duplicate timestamps. syn_component: openshift4-monitoring expr: | @@ -2035,6 +2135,7 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusRemoteStorageFailures.md summary: Prometheus fails to send samples to remote storage. syn_component: openshift4-monitoring expr: | @@ -2110,6 +2211,7 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusScrapeBodySizeLimitHit.md summary: Prometheus has dropped some targets that exceeded body size limit. 
syn_component: openshift4-monitoring expr: | @@ -2198,7 +2300,7 @@ spec: summary: Prometheus operator not ready syn_component: openshift4-monitoring expr: | - min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) for: 5m labels: severity: warning @@ -2212,7 +2314,7 @@ spec: summary: Errors while reconciling objects. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2223,6 +2325,7 @@ spec: description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusOperatorRejectedResources.md summary: Resources rejected by Prometheus operator syn_component: openshift4-monitoring expr: | @@ -2240,7 +2343,7 @@ spec: summary: Errors while updating objects status. 
syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning @@ -2266,7 +2369,7 @@ spec: summary: Errors while performing watch operations in controller. syn_component: openshift4-monitoring expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning @@ -2276,7 +2379,7 @@ spec: rules: - alert: SYN_SystemMemoryExceedsReservation annotations: - message: System memory usage of {{ $value | humanize }} on {{ $labels.node 
+ description: System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the @@ -2284,6 +2387,8 @@ spec: configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state). + summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved syn_component: openshift4-monitoring expr: | sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) @@ -2333,9 +2438,9 @@ spec: annotations: description: Thanos Query {{$labels.job}} in {{$labels.namespace}} has been overloaded for more than 15 minutes. This may be a symptom of excessive - simultanous complex requests, low performance of the Prometheus API, + simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos - query instances, the connnected Prometheus instances, look for potential + query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support. summary: Thanos query reaches its maximum capacity serving concurrent requests. 
diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 2f5bffda..47c414aa 100644 --- a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1003,7 +1003,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1191,7 +1191,7 @@ spec: }} of its incoming requests. syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 08627bed..66eb33b9 100644 --- a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -833,7 +833,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, 
statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1021,7 +1021,7 @@ spec: }} of its incoming requests. syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 6b2db14c..40d2cd74 100644 --- a/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -852,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1040,7 +1040,7 @@ spec: }} of its incoming requests. 
syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/00_namespace_labels.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/00_namespace_labels.yaml new file mode 100644 index 00000000..4bc92396 --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/00_namespace_labels.yaml @@ -0,0 +1,23 @@ +apiVersion: redhatcop.redhat.io/v1alpha1 +kind: Patch +metadata: + annotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + labels: + name: namespace-openshift-monitoring-c4273dc15ddfdf7 + name: namespace-openshift-monitoring-c4273dc15ddfdf7 + namespace: syn-patch-operator +spec: + patches: + namespace-openshift-monitoring-c4273dc15ddfdf7-patch: + patchTemplate: |- + "metadata": + "labels": + "network.openshift.io/policy-group": "monitoring" + patchType: application/strategic-merge-patch+json + targetObjectRef: + apiVersion: v1 + kind: Namespace + name: openshift-monitoring + serviceAccountRef: + name: patch-sa diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/01_secrets.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/01_secrets.yaml new file mode 100644 index 00000000..e69de29b diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/02_aggregated_clusterroles.yaml 
b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/02_aggregated_clusterroles.yaml new file mode 100644 index 00000000..97a8cf95 --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/02_aggregated_clusterroles.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + annotations: {} + labels: + name: syn-openshift4-monitoring-cluster-reader + rbac.authorization.k8s.io/aggregate-to-cluster-reader: 'true' + name: syn-openshift4-monitoring-cluster-reader +rules: + - apiGroups: + - monitoring.coreos.com + resources: + - '*' + verbs: + - get + - list + - watch diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_alertmanager_config.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_alertmanager_config.yaml new file mode 100644 index 00000000..5035dc3d --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_alertmanager_config.yaml @@ -0,0 +1,39 @@ +apiVersion: v1 +data: {} +kind: Secret +metadata: + annotations: {} + labels: + name: alertmanager-main + name: alertmanager-main + namespace: openshift-monitoring +stringData: + alertmanager.yaml: |- + "inhibit_rules": + - "equal": + - "namespace" + - "alertname" + "source_match": + "severity": "critical" + "target_match_re": + "severity": "warning|info" + - "equal": + - "namespace" + - "alertname" + "source_match": + "severity": "warning" + "target_match_re": + "severity": "info" + "receivers": + - "name": "__component_openshift4_monitoring_null" + "route": + "group_interval": "5s" + "group_wait": "0s" + "repeat_interval": "10m" + "routes": + - "continue": false + "matchers": + - "namespace =~ \"\"" + "receiver": "__component_openshift4_monitoring_null" + - "receiver": "__component_openshift4_monitoring_null" +type: Opaque diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_configmap.yaml 
b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_configmap.yaml new file mode 100644 index 00000000..4588f8c0 --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_configmap.yaml @@ -0,0 +1,54 @@ +apiVersion: v1 +data: + config.yaml: |- + "alertmanagerMain": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "volumeClaimTemplate": + "spec": + "resources": + "requests": + "storage": "2Gi" + "enableUserWorkload": true + "grafana": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "k8sPrometheusAdapter": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "kubeStateMetrics": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "openshiftStateMetrics": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "prometheusK8s": + "externalLabels": + "cluster_id": "c-green-test-1234" + "cluster_name": "Test Cluster 1234" + "tenant_id": "t-silent-test-1234" + "tenant_name": "Test Tenant 1234" + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "retention": "8d" + "volumeClaimTemplate": + "spec": + "resources": + "requests": + "storage": "50Gi" + "prometheusOperator": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "telemeterClient": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "thanosQuerier": + "nodeSelector": + "node-role.kubernetes.io/infra": "" +kind: ConfigMap +metadata: + annotations: {} + labels: + name: cluster-monitoring-config + name: cluster-monitoring-config + namespace: openshift-monitoring diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_configmap_user_workload.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_configmap_user_workload.yaml new file mode 100644 index 00000000..08f4fff0 --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/10_configmap_user_workload.yaml @@ -0,0 +1,41 @@ +apiVersion: v1 +data: + config.yaml: |- + 
"alertmanager": + "enableAlertmanagerConfig": true + "enabled": true + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "volumeClaimTemplate": + "spec": + "resources": + "requests": + "storage": "2Gi" + "prometheus": + "externalLabels": + "cluster_id": "c-green-test-1234-user-workload" + "cluster_name": "Test Cluster 1234 User Workload" + "tenant_id": "t-silent-test-1234" + "tenant_name": "Test Tenant 1234" + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "remoteWrite": [] + "retention": "8d" + "volumeClaimTemplate": + "spec": + "resources": + "requests": + "storage": "50Gi" + "prometheusOperator": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "thanosRuler": + "nodeSelector": + "node-role.kubernetes.io/infra": "" +kind: ConfigMap +metadata: + annotations: {} + labels: + name: user-workload-monitoring-config + name: user-workload-monitoring-config + namespace: openshift-user-workload-monitoring diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml new file mode 100644 index 00000000..e7e81d7c --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml @@ -0,0 +1,62 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: alertmanager-allow-web + name: alertmanager-allow-web + namespace: openshift-monitoring +spec: + ingress: + - from: + - namespaceSelector: {} + ports: + - port: 9092 + protocol: TCP + - port: 9093 + protocol: TCP + - port: 9095 + protocol: TCP + - port: 9097 + protocol: TCP + podSelector: + matchLabels: + app.kubernetes.io/name: alertmanager + policyTypes: + - Ingress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: allow-same-namespace + name: allow-same-namespace + namespace: openshift-monitoring +spec: + ingress: + - 
from: + - podSelector: {} + podSelector: {} + policyTypes: + - Ingress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: allow-non-alertmanager + name: allow-non-alertmanager + namespace: openshift-monitoring +spec: + ingress: + - {} + podSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: NotIn + values: + - alertmanager + policyTypes: + - Ingress diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml new file mode 100644 index 00000000..ad572958 --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml @@ -0,0 +1,62 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: alertmanager-allow-web + name: alertmanager-allow-web + namespace: openshift-user-workload-monitoring +spec: + ingress: + - from: + - namespaceSelector: {} + ports: + - port: 9092 + protocol: TCP + - port: 9093 + protocol: TCP + - port: 9095 + protocol: TCP + - port: 9097 + protocol: TCP + podSelector: + matchLabels: + app.kubernetes.io/name: alertmanager + policyTypes: + - Ingress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: allow-same-namespace + name: allow-same-namespace + namespace: openshift-user-workload-monitoring +spec: + ingress: + - from: + - podSelector: {} + podSelector: {} + policyTypes: + - Ingress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: allow-non-alertmanager + name: allow-non-alertmanager + namespace: openshift-user-workload-monitoring +spec: + ingress: + - {} + podSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: NotIn + values: + - alertmanager + policyTypes: + - 
Ingress diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/capacity_rules.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/capacity_rules.yaml new file mode 100644 index 00000000..a430c4b2 --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/capacity_rules.yaml @@ -0,0 +1,141 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: + syn_component: openshift4-monitoring + labels: + name: capacity + name: capacity + namespace: openshift-monitoring +spec: + groups: + - name: syn-CpuCapacity + rules: + - alert: SYN_ClusterCpuUsageHigh + annotations: + description: The cluster is close to using up all CPU resources. The cluster + might not be able to handle node failures or load spikes. Consider adding + new nodes. + message: Only {{ $value }} idle cpu cores accross cluster. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/cpucapacity.html#SYN_ClusterCpuUsageHigh + syn_component: openshift4-monitoring + expr: sum(label_replace(rate(node_cpu_seconds_total{mode="idle"}[15m]), + "node", "$1", "instance", "(.+)") * on(node) group_left kube_node_role{role="app"}) + < 1.000000 * max((kube_node_status_capacity{resource="cpu"}) * on(node) + group_left kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-MemoryCapacity + rules: + - alert: SYN_ClusterLowOnMemory + annotations: + description: The cluster is close to using all of its memory. The cluster + might not be able to handle node failures or load spikes. Consider adding + new nodes. + message: Only {{ $value }} free memory on Worker Nodes. 
+ runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/memorycapacity.html#SYN_ClusterMemoryUsageHigh + syn_component: openshift4-monitoring + expr: sum(label_replace(node_memory_MemAvailable_bytes, "node", "$1", "instance", + "(.+)") * on(node) group_left kube_node_role{role="app"}) < 1.000000 * + max((kube_node_status_capacity{resource="memory"}) * on(node) group_left + kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-PodCapacity + rules: + - alert: SYN_TooManyPods + annotations: + description: The cluster is close to the limit of running pods. The cluster + might not be able to handle node failures and might not be able to start + new pods. Consider adding new nodes. + message: Only {{ $value }} more pods can be started. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/podcapacity.html#SYN_TooManyPods + syn_component: openshift4-monitoring + expr: sum(kube_node_status_capacity{resource="pods"} * on(node) group_left + kube_node_role{role="app"}) - sum(kubelet_running_pods * on(node) group_left + kube_node_role{role="app"}) < 1.000000 * max((kube_node_status_capacity{resource="pods"}) + * on(node) group_left kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-ResourceRequests + rules: + - alert: SYN_TooMuchCPURequested + annotations: + description: The cluster is close to assigning all CPU resources to running + pods. The cluster might not be able to handle node failures and might + soon not be able to start new pods. Consider adding new nodes. + message: Only {{ $value }} cpu cores left for new pods. 
+ runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/resourcerequests.html#SYN_TooMuchCPURequested + syn_component: openshift4-monitoring + expr: sum(kube_node_status_allocatable{resource="cpu"} * on(node) group_left + kube_node_role{role="app"}) - sum(kube_pod_resource_request{resource="cpu"} + * on(node) group_left kube_node_role{role="app"}) < 1.000000 * max((kube_node_status_allocatable{resource="cpu"}) + * on(node) group_left kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_TooMuchMemoryRequested + annotations: + description: The cluster is close to assigning all memory to running pods. + The cluster might not be able to handle node failures and might not + be able to start new pods. Consider adding new nodes. + message: Only {{ $value }} memory left for new pods. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/resourcerequests.html#SYN_TooMuchMemoryRequested + syn_component: openshift4-monitoring + expr: sum(kube_node_status_allocatable{resource="memory"} * on(node) group_left + kube_node_role{role="app"}) - sum(kube_pod_resource_request{resource="memory"} + * on(node) group_left kube_node_role{role="app"}) < 1.000000 * max((kube_node_status_allocatable{resource="memory"}) + * on(node) group_left kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-UnusedCapacity + rules: + - alert: SYN_ClusterHasUnusedNodes + annotations: + description: The cluster has {{ $value }} unused nodes. Consider removing + unused nodes. + message: Cluster has unused nodes. 
+ runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/unusedcapacity.html#SYN_ClusterHasUnusedNodes + syn_component: openshift4-monitoring + expr: |- + min(( + label_replace( + (sum(kube_node_status_capacity{resource="pods"} * on(node) group_left kube_node_role{role="app"}) - sum(kubelet_running_pods * on(node) group_left kube_node_role{role="app"})) / max((kube_node_status_capacity{resource="pods"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "pods", "", "") + ) or ( + label_replace( + (sum(kube_node_status_allocatable{resource="memory"} * on(node) group_left kube_node_role{role="app"}) - sum(kube_pod_resource_request{resource="memory"} * on(node) group_left kube_node_role{role="app"})) / max((kube_node_status_allocatable{resource="memory"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "requested_memory", "", "") + ) or ( + label_replace( + (sum(kube_node_status_allocatable{resource="cpu"} * on(node) group_left kube_node_role{role="app"}) - sum(kube_pod_resource_request{resource="cpu"} * on(node) group_left kube_node_role{role="app"})) / max((kube_node_status_allocatable{resource="cpu"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "requested_cpu", "", "") + ) or ( + label_replace( + sum(label_replace(node_memory_MemAvailable_bytes, "node", "$1", "instance", "(.+)") * on(node) group_left kube_node_role{role="app"}) / max((kube_node_status_capacity{resource="memory"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "memory", "", "") + ) or ( + label_replace( + sum(label_replace(rate(node_cpu_seconds_total{mode="idle"}[15m]), "node", "$1", "instance", "(.+)") * on(node) group_left kube_node_role{role="app"}) / max((kube_node_status_capacity{resource="cpu"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "cpu", "", "") + ) + ) > 4.000000 + for: 8h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring diff --git 
a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml new file mode 100644 index 00000000..a961bbea --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -0,0 +1,2533 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + role: alert-rules + name: syn-k8s-rules + namespace: openshift-monitoring +spec: + groups: + - name: syn-alertmanager.rules + rules: + - alert: SYN_AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances + within the {{$labels.job}} cluster have been up for less than half of + the last 5m.' + summary: Half or more of the Alertmanager instances within the same cluster + are down. + syn_component: openshift4-monitoring + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job=~"alertmanager-main|alertmanager-user-workload"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration + }} sent from any instance in the {{$labels.job}} cluster is {{ $value + | humanizePercentage }}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerClusterFailedToSendAlerts.md + summary: All Alertmanager instances in a cluster failed to send notifications + to a critical integration. 
+ syn_component: openshift4-monitoring + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m]) + / + ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster + have different configurations. + summary: Alertmanager instances within the same cluster have different + configurations. + syn_component: openshift4-monitoring + expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job=~"alertmanager-main|alertmanager-user-workload"}) + ) + != 1 + for: 20m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace + }}/{{ $labels.pod}}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerFailedReload.md + summary: Reloading an Alertmanager configuration has failed. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
+ max_over_time(alertmanager_config_last_reload_successful{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) == 0 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed + to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration + }}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerFailedToSendAlerts.md + summary: An Alertmanager instance failed to send notifications. + syn_component: openshift4-monitoring + expr: | + ( + rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) + / + ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has + only found {{ $value }} members of the {{$labels.job}} cluster. + summary: A member of an Alertmanager cluster has not found all other cluster + members. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
+ max_over_time(alertmanager_cluster_members{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=~"alertmanager-main|alertmanager-user-workload"}[5m])) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-machine-approver.rules + rules: + - alert: SYN_MachineApproverMaxPendingCSRsReached + annotations: + description: | + The number of pending CertificateSigningRequests has exceeded the + maximum threshold (current number of machine + 100). Check the + pending CSRs to determine which machines need approval, also check + that the nodelink controller is running in the openshift-machine-api + namespace. + summary: max pending CSRs threshold reached. + syn_component: openshift4-monitoring + expr: | + mapi_current_pending_csr > mapi_max_pending_csr + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-network-operator-sdn.rules + rules: + - alert: SYN_NodeProxyApplySlow + annotations: + description: Configuration of proxy rules for Kubernetes services in the + node is taking too long and stale endpoints may exist. + summary: OpenShift SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} + $labels.node {{"}}"}} is taking too long to update proxy rules for services. + syn_component: openshift4-monitoring + expr: | + histogram_quantile(.95, sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket[5m])) by (le, namespace, pod)) + * on(namespace, pod) group_right topk by (namespace, pod) (1, kube_pod_info{namespace="openshift-sdn", pod=~"sdn-[^-]*"}) > 15 + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeProxyApplyStale + annotations: + description: Stale proxy rules for Kubernetes services may increase the + time to configure the network and may degrade the network. 
+ summary: OpenShift SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} + $labels.node {{"}}"}} has stale Kubernetes service rules. + syn_component: openshift4-monitoring + expr: | + (kubeproxy_sync_proxy_rules_last_queued_timestamp_seconds - kubeproxy_sync_proxy_rules_last_timestamp_seconds) + * on(namespace, pod) group_right() topk by (namespace, pod) (1, kube_pod_info{namespace="openshift-sdn",pod=~"sdn-[^-]*"}) + > 30 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeWithoutSDNController + annotations: + description: | + If at least one OpenShift SDN controller is 'Running', network control plane should be functional but + high availability is degraded when a controller is not functional. + summary: All control plane nodes should be running an OpenShift SDN controller + pod, {{"{{"}} $labels.node {{"}}"}} is not. + syn_component: openshift4-monitoring + expr: | + count(kube_node_role{role="master"} == 1) != count(kube_pod_info{namespace="openshift-sdn", pod=~"sdn-controller.*"}) + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeWithoutSDNPod + annotations: + description: Network control plane configuration on the node could be + degraded. + summary: All nodes should be running an OpenShift SDN pod, {{"{{"}} $labels.node + {{"}}"}} is not. + syn_component: openshift4-monitoring + expr: | + (kube_node_info unless on(node) topk by (node) (1, kube_pod_info{namespace="openshift-sdn", pod=~"sdn-[^-]*"})) > 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_SDNPodNotReady + annotations: + description: Network control plane configuration on the node could be + degraded. + summary: OpenShift SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} + $labels.node {{"}}"}} is not ready. 
+ syn_component: openshift4-monitoring + expr: | + sum by(pod, namespace) (kube_pod_status_ready{condition="true",namespace="openshift-sdn"}) * on(pod, namespace) group_right() kube_pod_info == 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-operators + rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ClusterOperatorDegraded + annotations: + description: The {{ $labels.name }} operator is degraded because {{ $labels.reason + }}, and the components it manages may have reduced quality of service. Cluster + upgrades may not complete. For more information refer to '{{ if eq $labels.name + "version" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ + $labels.name }}{{ end }}'{{ with $console_url := "console_url" | query + }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label + "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md + summary: Cluster operator has been degraded for 30 minutes. 
+ syn_component: openshift4-monitoring + expr: | + max by (namespace, name, reason) + ( + ( + cluster_operator_conditions{job="cluster-version-operator", name!="version", condition="Degraded"} + or on (namespace, name) + cluster_operator_conditions{job="cluster-version-operator", name="version", condition="Failing"} + or on (namespace, name) + group by (namespace, name) (cluster_operator_up{job="cluster-version-operator"}) + ) == 1 + ) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ClusterOperatorDown + annotations: + description: The {{ $labels.name }} operator may be down or disabled because + {{ $labels.reason }}, and the components it manages may be unavailable + or degraded. Cluster upgrades may not complete. For more information + refer to '{{ if eq $labels.name "version" }}oc adm upgrade{{ else }}oc + get -o yaml clusteroperator {{ $labels.name }}{{ end }}'{{ with $console_url + := "console_url" | query }}{{ if ne (len (label "url" (first $console_url + ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{ + end }}{{ end }}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md + summary: Cluster operator has not been available for 10 minutes. + syn_component: openshift4-monitoring + expr: | + max by (namespace, name, reason) (cluster_operator_up{job="cluster-version-operator"} == 0) + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ClusterOperatorFlapping + annotations: + description: The {{ $labels.name }} operator behavior might cause upgrades + to be unstable. 
For more information refer to '{{ if eq $labels.name + "version" }}oc adm upgrade{{ else }}oc get -o yaml clusteroperator {{ + $labels.name }}{{ end }}'{{ with $console_url := "console_url" | query + }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label + "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. + summary: Cluster operator up status is changing often. + syn_component: openshift4-monitoring + expr: | + max by (namespace, name) (changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2) + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-version + rules: + - alert: SYN_ClusterReleaseNotAccepted + annotations: + description: The desired cluster release has not been accepted because + {{ $labels.reason }}, and the cluster will continue to reconcile an + earlier release instead of moving towards that desired release. For + more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" + | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or + {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ + end }}. + summary: The desired cluster release has not been accepted for at least + an hour. + syn_component: openshift4-monitoring + expr: | + max by (namespace, name, reason) (cluster_operator_conditions{name="version", condition="ReleaseAccepted", endpoint="metrics"} == 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ClusterVersionOperatorDown + annotations: + description: The operator may be down or disabled. The cluster will not + be kept up to date and upgrades will not be possible. Inspect the openshift-cluster-version + namespace for events or changes to the cluster-version-operator deployment + or pods to diagnose and repair. 
{{ with $console_url := "console_url" + | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} For + more information refer to {{ label "url" (first $console_url ) }}/k8s/cluster/projects/openshift-cluster-version.{{ + end }}{{ end }} + summary: Cluster version operator has disappeared from Prometheus target + discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="cluster-version-operator"} == 1) + for: 10m + labels: + namespace: openshift-cluster-version + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeControllerManagerDown + annotations: + description: KubeControllerManager has disappeared from Prometheus target + discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/KubeControllerManagerDown.md + summary: Target disappeared from Prometheus target discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + namespace: openshift-kube-controller-manager + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeSchedulerDown + annotations: + description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-scheduler-operator/KubeSchedulerDown.md + summary: Target disappeared from Prometheus target discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="scheduler"} == 1) + for: 15m + labels: + namespace: openshift-kube-scheduler + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetLimit + annotations: + description: The pod disruption budget is below the minimum disruptions + allowed level and is not satisfied. The number of current healthy pods + is less than the desired healthy pods. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetLimit.md + summary: The pod disruption budget registers insufficient amount of pods. + syn_component: openshift4-monitoring + expr: | + max by (namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy < kube_poddisruptionbudget_status_desired_healthy) + for: 15m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-control-plane-cpu-utilization + rules: + - alert: SYN_ExtremelyHighIndividualControlPlaneCPU + annotations: + description: Extreme CPU pressure can cause slow serialization and poor + performance from the kube-apiserver and etcd. When this happens, there + is a risk of clients seeing non-responsive API requests which are issued + again causing even more CPU pressure. It can also cause failing liveness + probes due to slow etcd responsiveness on the backend. If one kube-apiserver + fails under this condition, chances are you will experience a cascade + as the remaining kube-apiservers are also under-provisioned. To fix + this, increase the CPU and memory on your control plane nodes. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md + summary: Sustained high CPU utilization on a single control plane node, + more CPU pressure is likely to cause a failover; increase available + CPU. 
+ syn_component: openshift4-monitoring + expr: | + 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 90 AND on (instance) label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + for: 1h + labels: + namespace: openshift-kube-apiserver + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_HighOverallControlPlaneCPU + annotations: + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. 
+ syn_component: openshift4-monitoring + expr: | + sum( + 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) + AND on (instance) label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) + / + count(kube_node_role{role="master"}) + > 60 + for: 10m + labels: + namespace: openshift-kube-apiserver + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-etcd + rules: + - alert: SYN_etcdDatabaseQuotaLowSpace + annotations: + description: 'etcd cluster "{{ $labels.job }}": database size exceeds + the defined quota on etcd instance {{ $labels.instance }}, please defrag + or increase the quota as the writes to etcd will be disabled when it + is full.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdDatabaseQuotaLowSpace.md + summary: etcd cluster database is running full. + syn_component: openshift4-monitoring + expr: | + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync + durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md + summary: etcd cluster 99th percentile fsync durations are too high. 
+ syn_component: openshift4-monitoring + expr: | + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 1 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdHighNumberOfFailedProposals + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal + failures within the last 30 minutes on etcd instance {{ $labels.instance + }}.' + summary: etcd cluster has high number of proposal failures. + syn_component: openshift4-monitoring + expr: | + rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdMembersDown + annotations: + description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value + }}).' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdMembersDown.md + summary: etcd cluster members are down. + syn_component: openshift4-monitoring + expr: | + max without (endpoint) ( + sum without (instance) (up{job=~".*etcd.*"} == bool 0) + or + count without (To) ( + sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 + ) + ) + > 0 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdNoLeader + annotations: + description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance + }} has no leader.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdNoLeader.md + summary: etcd cluster has no leader. 
+ syn_component: openshift4-monitoring + expr: | + etcd_server_has_leader{job=~".*etcd.*"} == 0 + for: 1m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-extremely-high-individual-control-plane-memory + rules: + - alert: SYN_ExtremelyHighIndividualControlPlaneMemory + annotations: + description: The memory utilization per instance within control plane + nodes influence the stability, and responsiveness of the cluster. This + can lead to cluster instability and slow responses from kube-apiserver + or failing requests specially on etcd. Moreover, OOM kill is expected + which negatively influences the pod scheduling. If this happens on container + level, the descheduler will not be able to detect it, as it works on + the pod level. To fix this, increase memory of the affected node of + control plane nodes. + summary: Extreme memory utilization per node within control plane nodes + is extremely high, and could impact responsiveness and stability. + syn_component: openshift4-monitoring + expr: | + ( + 1 + - + sum by (instance) ( + node_memory_MemFree_bytes + + node_memory_Buffers_bytes + + node_memory_Cached_bytes + AND on (instance) + label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) / sum by (instance) ( + node_memory_MemTotal_bytes + AND on (instance) + label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) + ) * 100 > 90 + for: 45m + labels: + namespace: openshift-machine-config-operator + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-general.rules + rules: + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. 
There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + summary: An alert that should always be firing to certify that Alertmanager + is working properly. + syn_component: openshift4-monitoring + expr: vector(1) + labels: + namespace: openshift-monitoring + severity: none + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-high-overall-control-plane-memory + rules: + - alert: SYN_HighOverallControlPlaneMemory + annotations: + description: | + The overall memory usage is high. + kube-apiserver and etcd might be slow to respond. + To fix this, increase memory of the control plane nodes. + + This alert was adjusted to be less sensitive in 4.11. + Newer Go versions use more memory, if available, to reduce GC pauses. + + Old memory behavior can be restored by setting `GOGC=63`. + See https://bugzilla.redhat.com/show_bug.cgi?id=2074031 for more details. + summary: Memory utilization across all control plane nodes is high, and + could impact responsiveness and stability. + syn_component: openshift4-monitoring + expr: | + ( + 1 + - + sum ( + node_memory_MemFree_bytes + + node_memory_Buffers_bytes + + node_memory_Cached_bytes + AND on (instance) + label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) / sum ( + node_memory_MemTotal_bytes + AND on (instance) + label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) + ) * 100 > 80 + for: 1h + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kube-state-metrics + rules: + - alert: SYN_KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated + rate in watch operations. This is likely causing it to not be able to + expose metrics about Kubernetes objects correctly or at all. 
+ summary: kube-state-metrics is experiencing errors in watch operations. + syn_component: openshift4-monitoring + expr: | + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) + > 0.01 + for: 15m + labels: + namespace: openshift-monitoring + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-apps + rules: + - alert: SYN_KubeContainerWaiting + annotations: + description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} + on container {{ $labels.container}} has been in waiting state for longer + than 1 hour. + summary: Pod container waiting longer than 1 hour + syn_component: openshift4-monitoring + expr: | + sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ + $labels.daemonset }} are running where they are not supposed to run.' + summary: DaemonSet pods are misscheduled. + syn_component: openshift4-monitoring + expr: | + kube_daemonset_status_number_misscheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ + $labels.daemonset }} are not scheduled.' + summary: DaemonSet pods are not scheduled. 
+ syn_component: openshift4-monitoring + expr: | + kube_daemonset_status_desired_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} + has not finished or progressed for at least 30 minutes. + summary: DaemonSet rollout is stuck. + syn_component: openshift4-monitoring + expr: | + ( + ( + kube_daemonset_status_current_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: 
openshift4-monitoring + - alert: SYN_KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but + has not been rolled back. + summary: Deployment generation mismatch due to possible roll-back + syn_component: openshift4-monitoring + expr: | + kube_deployment_status_observed_generation{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_deployment_metadata_generation{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDeploymentRolloutStuck + annotations: + description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment + }} is not progressing for longer than 15 minutes. + summary: Deployment rollout is not progressing. + syn_component: openshift4-monitoring + expr: | + kube_deployment_status_condition{condition="Progressing", status="false",namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed + to complete. Removing failed job after investigation should clear this + alert. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeJobFailed.md + summary: Job failed to complete. 
+ syn_component: openshift4-monitoring + expr: | + kube_job_failed{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeJobNotCompleted + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking + more than {{ "43200" | humanizeDuration }} to complete. + summary: Job did not complete in time + syn_component: openshift4-monitoring + expr: | + time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + and + kube_job_status_active{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0) > 43200 + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePodCrashLooping + annotations: + description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is in waiting state (reason: "CrashLoopBackOff").' + summary: Pod is crash looping. + syn_component: openshift4-monitoring + expr: | + max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[5m]) >= 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in + a non-ready state for longer than 15 minutes. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePodNotReady.md + summary: Pod has been in a non-ready state for more than 15 minutes. 
+ syn_component: openshift4-monitoring + expr: | + sum by (namespace, pod, cluster) ( + max by(namespace, pod, cluster) ( + kube_pod_status_phase{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", job="kube-state-metrics", phase=~"Pending|Unknown"} + unless ignoring(phase) (kube_pod_status_unschedulable{job="kube-state-metrics"} == 1) + ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( + 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but + has not been rolled back. + summary: StatefulSet generation mismatch due to possible roll-back + syn_component: openshift4-monitoring + expr: | + kube_statefulset_status_observed_generation{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset + }} has not matched the expected number of replicas for longer than 15 + minutes. + summary: StatefulSet has not matched the expected number of replicas. 
+ syn_component: openshift4-monitoring + expr: | + ( + kube_statefulset_status_replicas_ready{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_statefulset_status_replicas{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset + }} update has not been rolled out. + summary: StatefulSet update has not been rolled out. + syn_component: openshift4-monitoring + expr: | + ( + max by(namespace, statefulset, job, cluster) ( + kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-storage + rules: + - alert: SYN_KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} {{ with + $labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase + }}. 
+ summary: PersistentVolume is having issues with provisioning. + syn_component: openshift4-monitoring + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} + on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage + }} free. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md + summary: PersistentVolume is filling up. + syn_component: openshift4-monitoring + expr: | + ( + kubelet_volume_stats_available_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} > 0 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", access_mode="ReadOnlyMany"} == 1 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",label_alerts_k8s_io_kube_persistent_volume_filling_up="disabled"} == 1 + for: 1m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by + {{ 
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace + }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected + to fill up within four days. Currently {{ $value | humanizePercentage + }} is available. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md + summary: PersistentVolume is filling up. + syn_component: openshift4-monitoring + expr: | + ( + kubelet_volume_stats_available_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", access_mode="ReadOnlyMany"} == 1 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",label_alerts_k8s_io_kube_persistent_volume_filling_up="disabled"} == 1 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePersistentVolumeInodesFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} + on Cluster {{ . }} {{- end }} only has {{ $value | humanizePercentage + }} free inodes. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeInodesFillingUp.md + summary: PersistentVolumeInodes are filling up. + syn_component: openshift4-monitoring + expr: | + ( + kubelet_volume_stats_inodes_free{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_inodes_used{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} > 0 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", access_mode="ReadOnlyMany"} == 1 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",label_alerts_k8s_io_kube_persistent_volume_filling_up="disabled"} == 1 + for: 1m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePersistentVolumeInodesFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by + {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace + }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected + to run out of inodes within four days. Currently {{ $value | humanizePercentage + }} of its inodes are free. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeInodesFillingUp.md + summary: PersistentVolumeInodes are filling up. 
+ syn_component: openshift4-monitoring + expr: | + ( + kubelet_volume_stats_inodes_free{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_inodes_used{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_inodes_free{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", access_mode="ReadOnlyMany"} == 1 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",label_alerts_k8s_io_kube_persistent_volume_filling_up="disabled"} == 1 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-system + rules: + - alert: SYN_KubeClientErrors + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ $value | humanizePercentage }} errors.' + summary: Kubernetes API server client is experiencing errors. 
+ syn_component: openshift4-monitoring + expr: | + (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace) + / + sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace)) + > 0.01 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-system-apiserver + rules: + - alert: SYN_KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAPIDown.md + summary: Target disappeared from Prometheus target discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeAPITerminatedRequests + annotations: + description: The kubernetes apiserver has terminated {{ $value | humanizePercentage + }} of its incoming requests. + summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage + }} of its incoming requests. + syn_component: openshift4-monitoring + expr: | + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeAggregatedAPIDown + annotations: + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace + }} has been only {{ $value | humanize }}% available over the last 10m. + summary: Kubernetes aggregated API is down. 
+ syn_component: openshift4-monitoring + expr: | + (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeAggregatedAPIErrors + annotations: + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace + }} has reported errors. It has appeared unavailable {{ $value | humanize + }} times averaged over the past 10m. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAggregatedAPIErrors.md + summary: Kubernetes aggregated API has reported errors. + syn_component: openshift4-monitoring + expr: | + sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-system-kubelet + rules: + - alert: SYN_KubeNodeNotReady + annotations: + description: '{{ $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeNodeNotReady.md + summary: Node is not ready. + syn_component: openshift4-monitoring + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ $labels.node }} has changed + {{ $value }} times in the last 15 minutes. + summary: Node readiness status is flapping. 
+ syn_component: openshift4-monitoring + expr: | + sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 + for: 15m + labels: + namespace: kube-system + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeNodeUnreachable + annotations: + description: '{{ $labels.node }} is unreachable and some workloads may + be rescheduled.' + summary: Node is unreachable. + syn_component: openshift4-monitoring + expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its + client certificate ({{ $value | humanize }} errors in the last 5 minutes). + summary: Kubelet has failed to renew its client certificate. + syn_component: openshift4-monitoring + expr: | + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeletDown.md + summary: Target disappeared from Prometheus target discovery. 
+ syn_component: openshift4-monitoring + expr: | + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + namespace: kube-system + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile + duration of {{ $value }} seconds on node {{ $labels.node }}. + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + syn_component: openshift4-monitoring + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + namespace: kube-system + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ $value + }} seconds on node {{ $labels.node }}. + summary: Kubelet Pod startup latency is too high. + syn_component: openshift4-monitoring + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + namespace: kube-system + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its + server certificate ({{ $value | humanize }} errors in the last 5 minutes). + summary: Kubelet has failed to renew its server certificate. 
+ syn_component: openshift4-monitoring + expr: | + increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-api-operator-metrics-collector-up + rules: + - alert: SYN_MachineAPIOperatorMetricsCollectionFailing + annotations: + description: 'For more details: oc logs + -n openshift-machine-api' + summary: machine api operator metrics collection is failing. + syn_component: openshift4-monitoring + expr: | + mapi_mao_collector_up == 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-health-check-unterminated-short-circuit + rules: + - alert: SYN_MachineHealthCheckUnterminatedShortCircuit + annotations: + description: | + The number of unhealthy machines has exceeded the `maxUnhealthy` limit for the check, you should check + the status of machines in the cluster. + summary: machine health check {{ $labels.name }} has been disabled by + short circuit for more than 30 minutes + syn_component: openshift4-monitoring + expr: | + mapi_machinehealthcheck_short_circuit == 1 + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-not-yet-deleted + rules: + - alert: SYN_MachineNotYetDeleted + annotations: + description: | + The machine is not properly deleting, this may be due to a configuration issue with the + infrastructure provider, or because workloads on the node have PodDisruptionBudgets or + long termination periods which are preventing deletion. 
+ summary: machine {{ $labels.name }} has been in Deleting phase for more + than 6 hours + syn_component: openshift4-monitoring + expr: | + sum by (name, namespace) (avg_over_time(mapi_machine_created_timestamp_seconds{phase="Deleting"}[15m])) > 0 + for: 360m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-with-no-running-phase + rules: + - alert: SYN_MachineWithNoRunningPhase + annotations: + description: | + The machine has been without a Running or Deleting phase for more than 60 minutes. + The machine may not have been provisioned properly from the infrastructure provider, or + it might have issues with CertificateSigningRequests being approved. + summary: 'machine {{ $labels.name }} is in phase: {{ $labels.phase }}' + syn_component: openshift4-monitoring + expr: | + sum by (name, namespace) (mapi_machine_created_timestamp_seconds{phase!~"Running|Deleting"}) > 0 + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-without-valid-node-ref + rules: + - alert: SYN_MachineWithoutValidNode + annotations: + description: | + If the machine never became a node, you should diagnose the machine related failures. + If the node was deleted from the API, you may delete the machine if appropriate. + summary: machine {{ $labels.name }} does not have valid node reference + syn_component: openshift4-monitoring + expr: | + sum by (name, namespace) (mapi_machine_created_timestamp_seconds unless on(node) kube_node_info) > 0 + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcc-boot-image-update-error + rules: + - alert: SYN_MCCBootImageUpdateError + annotations: + description: 'The boot images of Machineset {{ $labels.machineset }} could + not be updated. 
For more details check MachineConfigController pod logs: + oc logs -n {{ $labels.namespace }} -f $(oc get pod -o name -l=''k8s-app=machine-config-controller'' + -n openshift-machine-config-operator) | grep machine_set' + summary: Triggers when machineset boot images could not be updated + syn_component: openshift4-monitoring + expr: | + mcc_bootimage_update_error > 0 + for: 30m + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcc-drain-error + rules: + - alert: SYN_MCCDrainError + annotations: + description: 'Drain failed on {{ $labels.exported_node }} , updates may + be blocked. For more details check MachineConfigController pod logs: + oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Alerts the user to a failed node drain. Always triggers when + the failure happens one or more times. + syn_component: openshift4-monitoring + expr: | + mcc_drain_err > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcc-pool-alert + rules: + - alert: SYN_MCCPoolAlert + annotations: + description: 'Node {{ $labels.exported_node }} has triggered a pool alert + due to a label change. For more details check MachineConfigController + pod logs: oc logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx + -c machine-config-controller' + summary: Triggers when nodes in a pool have overlapping labels such as + master, worker, and a custom label therefore a choice must be made as + to which is honored. 
+ syn_component: openshift4-monitoring + expr: | + mcc_pool_alert > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcd-kubelet-health-state-error + rules: + - alert: SYN_KubeletHealthState + annotations: + description: Kubelet health failure threshold reached + summary: This keeps track of Kubelet health failures, and tallys them. + The warning is triggered if 2 or more failures occur. + syn_component: openshift4-monitoring + expr: | + mcd_kubelet_state > 2 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcd-missing-mc + rules: + - alert: SYN_MissingMachineConfig + annotations: + description: Could not find config {{ $labels.mc }} in-cluster, this likely + indicates the MachineConfigs in-cluster has changed during the install + process. If you are seeing this when installing the cluster, please + compare the in-cluster rendered machineconfigs to /etc/mcs-machine-config-content.json + summary: This keeps track of Machine Config failures. Specifically a common + failure on install when a rendered Machine Config is missing. Triggered + when this error happens once. + syn_component: openshift4-monitoring + expr: | + mcd_missing_mc > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcd-pivot-error + rules: + - alert: SYN_MCDPivotError + annotations: + description: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + may be blocked. For more details: oc logs -f -n {{ $labels.namespace + }} {{ $labels.pod }} -c machine-config-daemon ' + summary: Alerts the user when an error is detected upon pivot. This triggers + if the pivot errors are above zero for 2 minutes. 
+ syn_component: openshift4-monitoring + expr: | + mcd_pivot_errors_total > 0 + for: 2m + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcd-reboot-error + rules: + - alert: SYN_MCDRebootError + annotations: + description: 'Reboot failed on {{ $labels.node }} , update may be blocked. + For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod + }} -c machine-config-daemon ' + summary: Alerts the user that a node failed to reboot one or more times + over a span of 5 minutes. + syn_component: openshift4-monitoring + expr: | + mcd_reboots_failed_total > 0 + for: 5m + labels: + namespace: openshift-machine-config-operator + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-node-exporter + rules: + - alert: SYN_NodeBondingDegraded + annotations: + description: Bonding interface {{ $labels.master }} on {{ $labels.instance + }} is in degraded state due to one or more slave failures. + summary: Bonding interface is degraded + syn_component: openshift4-monitoring + expr: | + (node_bonding_slaves - node_bonding_active) != 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeClockNotSynchronising + annotations: + description: Clock at {{ $labels.instance }} is not synchronising. Ensure + NTP is configured on this host. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md + summary: Clock not synchronising. 
+ syn_component: openshift4-monitoring + expr: |- + ( + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeClockSkewDetected + annotations: + description: Clock at {{ $labels.instance }} is out of sync by more than + 0.05s. Ensure NTP is configured correctly on this host. + summary: Clock skew detected. + syn_component: openshift4-monitoring + expr: |- + ( + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + ) and on() absent(up{job="ptp-monitor-service"}) + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently + at {{ printf "%.2f" $value }}%. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFileDescriptorLimit.md + summary: Kernel is predicted to exhaust file descriptors limit soon. + syn_component: openshift4-monitoring + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfFiles.md + summary: Filesystem has less than 3% inodes left. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 1h + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfSpace.md + summary: Filesystem has less than 3% space left. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left and is filling up fast. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemFilesFillingUp.md + summary: Filesystem is predicted to run out of inodes within the next + 4 hours. 
+ syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 1h + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left and is filling up. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md + summary: Filesystem is predicted to run out of space within the next 24 + hours. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left and is filling up fast. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md + summary: Filesystem is predicted to run out of space within the next 4 + hours. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 1h + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are + used.' + summary: Number of conntrack are getting close to the limit. + syn_component: openshift4-monitoring + expr: | + (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeMemoryMajorPagesFaults + annotations: + description: | + Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Please check that there is enough memory available at this instance. + summary: Memory major page faults are occurring at very high rate. 
+ syn_component: openshift4-monitoring + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > on (instance) + (count by (instance) (node_cpu_info{}) * 100) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has + encountered {{ printf "%.0f" $value }} receive errors in the last two + minutes.' + summary: Network interface is reporting many receive errors. + syn_component: openshift4-monitoring + expr: | + rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has + encountered {{ printf "%.0f" $value }} transmit errors in the last two + minutes.' + summary: Network interface is reporting many transmit errors. + syn_component: openshift4-monitoring + expr: | + rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeSystemdServiceFailed + annotations: + description: Systemd service {{ $labels.name }} has entered failed state + at {{ $labels.instance }} + summary: Systemd service has entered failed state. 
+ syn_component: openshift4-monitoring + expr: | + node_systemd_unit_state{job="node-exporter", state="failed"} == 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-node-network + rules: + - alert: SYN_NodeNetworkInterfaceFlapping + annotations: + description: Network interface "{{ $labels.device }}" changing its up + status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod + }} + summary: Network interface is often changing its status + syn_component: openshift4-monitoring + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+|tunbr"}[2m]) > 2 + for: 2m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-node-utilization + rules: + - alert: SYN_NodeTcpMemoryUtilizationHigh + annotations: + description: | + TCP memory usage exceeds the TCP memory pressure threshold on node {{ $labels.instance }}. + + Check the node for processes with unusual amounts of TCP sockets. 
+ message: TCP memory usage is high on {{ $labels.instance }} + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/tcp-memory-usage.html + syn_component: openshift4-monitoring + expr: node_sockstat_TCP_mem_bytes > on(instance) node_memory_MemTotal_bytes*0.0625 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_node_memory_free_percent + annotations: + message: '{{ $labels.instance }}: Memory usage more than 97% (current + value is: {{ $value | humanizePercentage }})%' + syn_component: openshift4-monitoring + expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes + > 0.97 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-etcd.rules + rules: + - alert: SYN_etcdGRPCRequestsSlow + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC + requests is {{ $value }}s on etcd instance {{ $labels.instance }} for + {{ $labels.grpc_method }} method.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdGRPCRequestsSlow.md + summary: etcd grpc requests are slow + syn_component: openshift4-monitoring + expr: | + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd", grpc_method!="Defragment", grpc_type="unary"}[10m])) without(grpc_type)) + > 1 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdHighNumberOfFailedGRPCRequests + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests + for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance + }}.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighNumberOfFailedGRPCRequests.md + summary: etcd cluster has high number of failed grpc requests. 
+ syn_component: openshift4-monitoring + expr: | + (sum(rate(grpc_server_handled_total{job="etcd", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) + / + (sum(rate(grpc_server_handled_total{job="etcd"}[5m])) without (grpc_type, grpc_code) + > 2 and on ()(sum(cluster_infrastructure_provider{type!~"ipi|BareMetal"} == bool 1)))) * 100 > 50 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdHighNumberOfLeaderChanges + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} average leader + changes within the last 10 minutes. Frequent elections may be a sign + of insufficient resources, high network latency, or disruptions by other + components and should be investigated.' + summary: etcd cluster has high number of leader changes. + syn_component: openshift4-monitoring + expr: | + avg(changes(etcd_server_is_leader[10m])) > 5 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdInsufficientMembers + annotations: + description: etcd is reporting fewer instances are available than are + needed ({{ $value }}). When etcd does not have a majority of instances + available the Kubernetes and OpenShift APIs will reject read and write + requests and operations that preserve the health of workloads cannot + be performed. This can occur when multiple control plane nodes are powered + off or are unable to connect to each other via the network. Check that + all control plane nodes are powered on and that network connections + between each machine are functional. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdInsufficientMembers.md + summary: etcd is reporting that a majority of instances are unavailable. 
+ syn_component: openshift4-monitoring + expr: sum(up{job="etcd"} == bool 1 and etcd_server_has_leader{job="etcd"} + == bool 1) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod) + + 1) / 2) + for: 3m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdSignerCAExpirationCritical + annotations: + description: etcd is reporting the signer ca "{{ $labels.name }}" to have + less than year (({{ printf "%.f" $value }} days) of validity left. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdSignerCAExpirationCritical.md + summary: etcd has critical signer ca expiration + syn_component: openshift4-monitoring + expr: avg(openshift_etcd_operator_signer_expiration_days) by (name) < 365 + for: 1h + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdSignerCAExpirationWarning + annotations: + description: etcd is reporting the signer ca "{{ $labels.name }}" to have + less than two years (({{ printf "%.f" $value }} days) of validity left. + summary: etcd signer ca is about to expire + syn_component: openshift4-monitoring + expr: avg(openshift_etcd_operator_signer_expiration_days) by (name) < 730 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-general.rules + rules: + - alert: SYN_TargetDown + annotations: + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ + $labels.service }} targets in {{ $labels.namespace }} namespace have + been unreachable for more than 15 minutes. This may be a symptom of + network connectivity issues, down nodes, or failures within these components. + Assess the health of the infrastructure and nodes running these targets + and then contact support.' 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/TargetDown.md + summary: Some targets were not reachable from the monitoring server for + an extended period of time. + syn_component: openshift4-monitoring + expr: | + 100 * (( + 1 - sum by (job, namespace, service) (up and on(namespace, pod) kube_pod_info) / + count by (job, namespace, service) (up and on(namespace, pod) kube_pod_info) + ) or ( + count by (job, namespace, service) (up == 0) / + count by (job, namespace, service) (up) + )) > 10 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-ingress-to-route-controller.rules + rules: + - alert: SYN_UnmanagedRoutes + annotations: + description: This alert fires when there is a Route owned by an unmanaged + Ingress. + message: Route {{ $labels.namespace }}/{{ $labels.name }} is owned by + an unmanaged Ingress. + summary: Route owned by an Ingress no longer managed + syn_component: openshift4-monitoring + expr: openshift_ingress_to_route_controller_route_with_unmanaged_owner == + 1 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-ingress.rules + rules: + - alert: SYN_HAProxyDown + annotations: + description: This alert fires when metrics report that HAProxy is down. + message: HAProxy metrics are reporting that HAProxy is down on pod {{ + $labels.namespace }} / {{ $labels.pod }} + summary: HAProxy is down + syn_component: openshift4-monitoring + expr: haproxy_up == 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_HAProxyReloadFail + annotations: + description: This alert fires when HAProxy fails to reload its configuration, + which will result in the router not picking up recently created or modified + routes. + message: HAProxy reloads are failing on {{ $labels.pod }}. 
Router is not + respecting recently created or modified routes + summary: HAProxy reload failure + syn_component: openshift4-monitoring + expr: template_router_reload_failure == 1 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_IngressControllerDegraded + annotations: + description: This alert fires when the IngressController status is degraded. + message: | + The {{ $labels.namespace }}/{{ $labels.name }} ingresscontroller is + degraded: {{ $labels.reason }}. + summary: IngressController is degraded + syn_component: openshift4-monitoring + expr: ingress_controller_conditions{condition="Degraded"} == 1 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_IngressControllerUnavailable + annotations: + description: This alert fires when the IngressController is not available. + message: | + The {{ $labels.namespace }}/{{ $labels.name }} ingresscontroller is + unavailable: {{ $labels.reason }}. + summary: IngressController is unavailable + syn_component: openshift4-monitoring + expr: ingress_controller_conditions{condition="Available"} == 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-kubernetes.rules + rules: + - alert: SYN_ClusterMonitoringOperatorReconciliationErrors + annotations: + description: Errors are occurring during reconciliation cycles. Inspect + the cluster-monitoring-operator log for potential root causes. + summary: Cluster Monitoring Operator is experiencing unexpected reconciliation + errors. 
+ syn_component: openshift4-monitoring + expr: max_over_time(cluster_monitoring_operator_last_reconciliation_successful[5m]) + == 0 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment + }} has not matched the expected number of replicas for longer than 15 + minutes. This indicates that cluster infrastructure is unable to start + or restart the necessary components. This most often occurs when one + or more nodes are down or partioned from the cluster, or a fault occurs + on the node that prevents the workload from starting. In rare cases + this may indicate a new version of a cluster component cannot start + due to a bug or configuration error. Assess the pods for this deployment + to verify they are running on healthy nodes and then contact support. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeDeploymentReplicasMismatch.md + summary: Deployment has not matched the expected number of replicas + syn_component: openshift4-monitoring + expr: | + ((( + kube_deployment_spec_replicas{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + > + kube_deployment_status_replicas_available{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[5m]) + == + 0 + )) * on() group_left cluster:control_plane:all_nodes_ready) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePodNotScheduled + annotations: + description: |- + Pod {{ $labels.namespace }}/{{ $labels.pod }} cannot be scheduled for more than 30 minutes. 
+ Check the details of the pod with the following command: + oc describe -n {{ $labels.namespace }} pod {{ $labels.pod }} + summary: Pod cannot be scheduled. + syn_component: openshift4-monitoring + expr: last_over_time(kube_pod_status_unschedulable{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)"}[5m]) + == 1 + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-prometheus + rules: + - alert: SYN_PrometheusBadConfig + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + to reload its configuration. + summary: Failed Prometheus configuration reload. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) == 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping + {{ printf "%.4g" $value }} samples/s with different values but duplicated + timestamp. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusDuplicateTimestamps.md + summary: Prometheus is dropping samples with duplicate timestamps. 
+ syn_component: openshift4-monitoring + expr: | + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{ printf "%.1f" $value }}% errors while sending alerts + from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager + {{$labels.alertmanager}}.' + summary: Prometheus has encountered more than 1% errors sending alerts + to a specific Alertmanager. + syn_component: openshift4-monitoring + expr: | + ( + rate(prometheus_notifications_errors_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + / + rate(prometheus_notifications_sent_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusHighQueryLoad + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API + has less than 20% available capacity in its query engine for the last + 15 minutes. + summary: Prometheus is reaching its maximum capacity serving concurrent + requests. + syn_component: openshift4-monitoring + expr: | + avg_over_time(prometheus_engine_queries{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0.8 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusKubernetesListWatchFailures + annotations: + description: Kubernetes service discovery of Prometheus {{$labels.namespace}}/{{$labels.pod}} + is experiencing {{ printf "%.0f" $value }} failures with LIST/WATCH + requests to the Kubernetes API in the last 5 minutes. + summary: Requests in Kubernetes SD are failing. 
+ syn_component: openshift4-monitoring + expr: | + increase(prometheus_sd_kubernetes_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusLabelLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped + {{ printf "%.0f" $value }} targets because some samples exceeded the + configured label_limit, label_name_length_limit or label_value_length_limit. + summary: Prometheus has dropped targets because some scrape configs have + exceeded the labels limit. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed + {{ printf "%.0f" $value }} rule group evaluations in the last 5m. + summary: Prometheus is missing rule evaluations due to slow rule group + evaluation. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_rule_group_iterations_missed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected + to any Alertmanagers. + summary: Prometheus is not connected to any Alertmanagers. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
+ max_over_time(prometheus_notifications_alertmanagers_discovered{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) < 1 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusNotIngestingSamples + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting + samples. + summary: Prometheus is not ingesting samples. + syn_component: openshift4-monitoring + expr: | + ( + sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=~"prometheus-k8s|prometheus-user-workload"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job=~"prometheus-k8s|prometheus-user-workload"}) > 0 + ) + ) + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} + is running full. + summary: Prometheus alert notification queue predicted to run full in + less than 30m. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + predict_linear(prometheus_notifications_queue_length{job=~"prometheus-k8s|prometheus-user-workload"}[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + ) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping + {{ printf "%.4g" $value }} samples/s with timestamps arriving out of + order. 
+ summary: Prometheus drops samples with out-of-order timestamps. + syn_component: openshift4-monitoring + expr: | + rate(prometheus_target_scrapes_sample_out_of_order_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to + send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ + $labels.url }} + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusRemoteStorageFailures.md + summary: Prometheus fails to send samples to remote storage. + syn_component: openshift4-monitoring + expr: | + ( + (rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) + / + ( + (rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) + + + (rate(prometheus_remote_storage_succeeded_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) + ) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write + desired shards calculation wants to run {{ $value }} shards for queue + {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max + of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job=~"prometheus-k8s|prometheus-user-workload"}` + 
$labels.instance | query | first | value }}. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/remotewrite.html + summary: Prometheus remote write desired shards calculation wants to run + more than configured max shards. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + > + max_over_time(prometheus_remote_storage_shards_max{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + ) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusRuleFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + to evaluate {{ printf "%.0f" $value }} rules in the last 5m. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusRuleFailures.md + summary: Prometheus is failing rule evaluations. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_rule_evaluation_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusSDRefreshFailure + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + to refresh SD with mechanism {{$labels.mechanism}}. + summary: Failed Prometheus SD refresh. 
+ syn_component: openshift4-monitoring + expr: | + increase(prometheus_sd_refresh_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[10m]) > 0 + for: 20m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusScrapeBodySizeLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + {{ printf "%.0f" $value }} scrapes in the last 5m because some targets + exceeded the configured body_size_limit. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusScrapeBodySizeLimitHit.md + summary: Prometheus has dropped some targets that exceeded body size limit. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusScrapeSampleLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + {{ printf "%.0f" $value }} scrapes in the last 5m because some targets + exceeded the configured sample_limit. + summary: Prometheus has failed scrapes that have exceeded the configured + sample limit. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_scrapes_exceeded_sample_limit_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected + {{$value | humanize}} compaction failures over the last 3h. + summary: Prometheus has issues compacting blocks. 
+ syn_component: openshift4-monitoring + expr: | + increase(prometheus_tsdb_compactions_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[3h]) > 0 + for: 4h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected + {{$value | humanize}} reload failures over the last 3h. + summary: Prometheus has issues reloading blocks from disk. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_tsdb_reloads_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[3h]) > 0 + for: 4h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusTargetLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped + {{ printf "%.0f" $value }} targets because the number of targets exceeded + the configured target_limit. + summary: Prometheus has dropped targets because some scrape configs have + exceeded the targets limit. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusTargetSyncFailure + annotations: + description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} + have failed to sync because invalid configuration was supplied.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusTargetSyncFailure.md + summary: Prometheus has failed to sync targets. 
+ syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_sync_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[30m]) > 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-prometheus-operator + rules: + - alert: SYN_PrometheusOperatorNotReady + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace + isn't ready to reconcile {{ $labels.controller }} resources. + summary: Prometheus operator not ready + syn_component: openshift4-monitoring + expr: | + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorReconcileErrors + annotations: + description: '{{ $value | humanizePercentage }} of reconciling operations + failed for {{ $labels.controller }} controller in {{ $labels.namespace + }} namespace.' + summary: Errors while reconciling objects. + syn_component: openshift4-monitoring + expr: | + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorRejectedResources + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace + rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource + }} resources. 
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusOperatorRejectedResources.md + summary: Resources rejected by Prometheus operator + syn_component: openshift4-monitoring + expr: | + min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorStatusUpdateErrors + annotations: + description: '{{ $value | humanizePercentage }} of status update operations + failed for {{ $labels.controller }} controller in {{ $labels.namespace + }} namespace.' + summary: Errors while updating objects status. + syn_component: openshift4-monitoring + expr: | + (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorSyncFailed + annotations: + description: Controller {{ $labels.controller }} in {{ $labels.namespace + }} namespace fails to reconcile {{ $value }} objects. 
+ summary: Last controller reconciliation failed + syn_component: openshift4-monitoring + expr: | + min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorWatchErrors + annotations: + description: Errors while performing watch operations in controller {{$labels.controller}} + in {{$labels.namespace}} namespace. + summary: Errors while performing watch operations in controller. + syn_component: openshift4-monitoring + expr: | + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-system-memory-exceeds-reservation + rules: + - alert: SYN_SystemMemoryExceedsReservation + annotations: + description: System memory usage of {{ $value | humanize }} on {{ $labels.node + }} exceeds 95% of the reservation. Reserved memory ensures system processes + can function even when the node is fully allocated and protects against + workload out of memory events impacting the proper functioning of the + node. The default reservation is expected to be sufficient for most + configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) + when running nodes with high numbers of pods (either due to rate of + change or at steady state). 
+ summary: Alerts the user when, for 15 miutes, a specific node is using + more memory than is reserved + syn_component: openshift4-monitoring + expr: | + sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) + for: 15m + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-thanos-query + rules: + - alert: SYN_ThanosQueryHttpRequestQueryErrorRateHigh + annotations: + description: Thanos Query {{$labels.job}} in {{$labels.namespace}} is + failing to handle {{$value | humanize}}% of "query" requests. + summary: Thanos Query is failing to handle requests. + syn_component: openshift4-monitoring + expr: | + ( + sum by (namespace, job) (rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query"}[5m])) + / + sum by (namespace, job) (rate(http_requests_total{job="thanos-querier", handler="query"}[5m])) + ) * 100 > 5 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosQueryHttpRequestQueryRangeErrorRateHigh + annotations: + description: Thanos Query {{$labels.job}} in {{$labels.namespace}} is + failing to handle {{$value | humanize}}% of "query_range" requests. + summary: Thanos Query is failing to handle requests. + syn_component: openshift4-monitoring + expr: | + ( + sum by (namespace, job) (rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query_range"}[5m])) + / + sum by (namespace, job) (rate(http_requests_total{job="thanos-querier", handler="query_range"}[5m])) + ) * 100 > 5 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosQueryOverload + annotations: + description: Thanos Query {{$labels.job}} in {{$labels.namespace}} has + been overloaded for more than 15 minutes. 
This may be a symptom of excessive + simultaneous complex requests, low performance of the Prometheus API, + or failures within these components. Assess the health of the Thanos + query instances, the connected Prometheus instances, look for potential + senders of these requests and then contact support. + summary: Thanos query reaches its maximum capacity serving concurrent + requests. + syn_component: openshift4-monitoring + expr: | + ( + max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1 + ) + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-thanos-rule + rules: + - alert: SYN_ThanosNoRuleEvaluations + annotations: + description: Thanos Rule {{$labels.instance}} in {{$labels.namespace}} + did not perform any rule evaluations in the past 10 minutes. + summary: Thanos Rule did not perform any rule evaluations. + syn_component: openshift4-monitoring + expr: | + sum by (namespace, job, instance) (rate(prometheus_rule_evaluations_total{job="thanos-ruler"}[5m])) <= 0 + and + sum by (namespace, job, instance) (thanos_rule_loaded_rules{job="thanos-ruler"}) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosRuleGrpcErrorRate + annotations: + description: Thanos Rule {{$labels.job}} in {{$labels.namespace}} is failing + to handle {{$value | humanize}}% of requests. + summary: Thanos Rule is failing to handle grpc requests. 
+ syn_component: openshift4-monitoring + expr: | + ( + sum by (namespace, job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job="thanos-ruler"}[5m])) + / + sum by (namespace, job, instance) (rate(grpc_server_started_total{job="thanos-ruler"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosRuleHighRuleEvaluationFailures + annotations: + description: Thanos Rule {{$labels.instance}} in {{$labels.namespace}} + is failing to evaluate rules. + summary: Thanos Rule is failing to evaluate rules. + syn_component: openshift4-monitoring + expr: | + ( + sum by (namespace, job, instance) (rate(prometheus_rule_evaluation_failures_total{job="thanos-ruler"}[5m])) + / + sum by (namespace, job, instance) (rate(prometheus_rule_evaluations_total{job="thanos-ruler"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosRuleQueueIsDroppingAlerts + annotations: + description: Thanos Rule {{$labels.instance}} in {{$labels.namespace}} + is failing to queue alerts. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ThanosRuleQueueIsDroppingAlerts.md + summary: Thanos Rule is failing to queue alerts. + syn_component: openshift4-monitoring + expr: | + sum by (namespace, job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosRuleSenderIsFailingAlerts + annotations: + description: Thanos Rule {{$labels.instance}} in {{$labels.namespace}} + is failing to send alerts to alertmanager. + summary: Thanos Rule is failing to send alerts to alertmanager. 
+ syn_component: openshift4-monitoring + expr: | + sum by (namespace, job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/rbac.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/rbac.yaml new file mode 100644 index 00000000..1c6d4fea --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/rbac.yaml @@ -0,0 +1,44 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + annotations: + syn_component: openshift4-monitoring + labels: + name: syn-prometheus-auto-discovery + name: syn-prometheus-auto-discovery +rules: + - apiGroups: + - '' + resources: + - pods + - services + - endpoints + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + syn_component: openshift4-monitoring + labels: + name: syn-prometheus-auto-discovery + name: syn-prometheus-auto-discovery +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: syn-prometheus-auto-discovery +subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: openshift-monitoring diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/silence.yaml new file mode 100644 index 00000000..ccae3b65 --- /dev/null +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -0,0 +1,107 @@ +apiVersion: v1 +data: + silence: | + #!/bin/bash + set -euo pipefail + + curl_opts=( https://alertmanager-main.openshift-monitoring.svc.cluster.local:9095/api/v2/silences --cacert 
/etc/ssl/certs/serving-certs/service-ca.crt --header 'Content-Type: application/json' --header "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" --resolve "alertmanager-main.openshift-monitoring.svc.cluster.local:9095:$(getent hosts alertmanager-operated.openshift-monitoring.svc.cluster.local | awk '{print $1}' | head -n 1)" --silent ) + + while IFS= read -r silence; do + comment=$(printf %s "${silence}" | jq -r '.comment') + + body=$(printf %s "$silence" | \ + jq \ + --arg startsAt "$(date -u +'%Y-%m-%dT%H:%M:%S' --date '-1 min')" \ + --arg endsAt "$(date -u +'%Y-%m-%dT%H:%M:%S' --date '+1 day')" \ + --arg createdBy "Kubernetes object \`cronjob/silence\` in the monitoring namespace" \ + '.startsAt = $startsAt | .endsAt = $endsAt | .createdBy = $createdBy' + ) + + id=$(curl "${curl_opts[@]}" | jq -r ".[] | select(.status.state == \"active\") | select(.comment == \"${comment}\") | .id" | head -n 1) + if [ -n "${id}" ]; then + body=$(printf %s "${body}" | jq --arg id "${id}" '.id = $id') + fi + + curl "${curl_opts[@]}" -XPOST -d "${body}" + done <<<"$(printf %s "${SILENCES_JSON}" | jq -cr '.[]')" + silences.json: '[{"comment":"Silence non syn alerts","matchers":[{"isRegex":true,"name":"alertname","value":".+"},{"isRegex":false,"name":"syn","value":""}]}]' +kind: ConfigMap +metadata: + annotations: {} + labels: + name: silence + name: silence + namespace: openshift-monitoring +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + annotations: {} + labels: + name: silence + name: silence + namespace: openshift-monitoring +spec: + concurrencyPolicy: Forbid + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + completions: 1 + parallelism: 1 + template: + metadata: + labels: + name: silence + spec: + containers: + - args: [] + command: + - /usr/local/bin/silence + env: + - name: SILENCES_JSON + valueFrom: + configMapKeyRef: + key: silences.json + name: silence + image: quay.io/appuio/oc:v4.15 + imagePullPolicy: IfNotPresent + name: silence 
+ ports: [] + stdin: false + tty: false + volumeMounts: + - mountPath: /etc/ssl/certs/serving-certs/ + name: ca-bundle + readOnly: true + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access + readOnly: true + - mountPath: /usr/local/bin/silence + name: scripts + readOnly: true + subPath: silence + imagePullSecrets: [] + initContainers: [] + nodeSelector: + node-role.kubernetes.io/infra: '' + restartPolicy: Never + serviceAccountName: prometheus-k8s + terminationGracePeriodSeconds: 30 + volumes: + - configMap: + defaultMode: 288 + name: serving-certs-ca-bundle + name: ca-bundle + - name: kube-api-access + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + defaultMode: 360 + name: silence + name: scripts + schedule: 0 */4 * * * + successfulJobsHistoryLimit: 3 diff --git a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 6b2db14c..40d2cd74 100644 --- a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -852,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1040,7 +1040,7 @@ spec: }} of its incoming requests. 
syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 629e442b..965f34b4 100644 --- a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -896,7 +896,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1092,7 +1092,7 @@ spec: }} of its incoming requests. 
syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 8fd7c5d3..ef14096c 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -852,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1040,7 +1040,7 @@ spec: }} of its incoming requests. 
syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 057401ab..61e8d57f 100644 --- a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -852,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max by(namespace, statefulset) ( + max by(namespace, statefulset, job, cluster) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1040,7 +1040,7 @@ spec: }} of its incoming requests. 
syn_component: openshift4-monitoring expr: | - sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/tests/release-4.16.yml b/tests/release-4.16.yml index a4da5b7b..ced6cdc1 100644 --- a/tests/release-4.16.yml +++ b/tests/release-4.16.yml @@ -1,3 +1,14 @@ -# Overwrite parameters here +parameters: + kapitan: + dependencies: + - type: https + source: https://raw.githubusercontent.com/projectsyn/component-patch-operator/master/lib/patch-operator.libsonnet + output_path: vendor/lib/patch-operator.libsonnet -# parameters: {...} + patch_operator: + namespace: syn-patch-operator + patch_serviceaccount: + name: patch-sa + + openshift4_monitoring: + manifests_version: release-4.16