Merge pull request #190 from appuio/template-sync

Update from component template
appuio · Nov 27, 2023 · 50fb5d0
2 parents e8d2994 + e8d1e9f
commit 50fb5d0
Show file tree

Hide file tree

Showing 12 changed files with 179 additions and 61 deletions.
diff --git a/.cruft.json b/.cruft.json
@@ -1,6 +1,6 @@
 {
   "template": "https://github.com/projectsyn/commodore-component-template.git",
-  "commit": "8a2e7800ac2d62a4827968ce108f4d7b9bfd587f",
+  "commit": "a4aff6a9d004c1aad085a875c7759c8f8f1e0d3d",
   "checkout": "main",
   "context": {
     "cookiecutter": {

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -3,7 +3,7 @@
 
 ## Checklist
 
-- [ ] The PR has a meaningful title. It will be used to auto generate the
+- [ ] The PR has a meaningful title. It will be used to auto-generate the
       changelog.
       The PR has a meaningful description that sums up the change. It will be
       linked in the changelog.
@@ -21,6 +21,6 @@ review the checklist.
 Contributors guide: ./CONTRIBUTING.md
 
 Remove items that do not apply. For completed items, change [ ] to [x].
-These things are not required to open a PR and can be done afterwards,
+These things are not required to open a PR and can be done afterwards
 while the PR is open.
 -->
diff --git a/README.md b/README.md
@@ -3,15 +3,15 @@
 This is a [Commodore][commodore] Component for OpenShift4 Monitoring.
 
 This repository is part of Project Syn.
-For documentation on Project Syn and this component, see https://syn.tools.
+For documentation on Project Syn and this component, see [syn.tools](https://syn.tools).
 
 ## Documentation
 
 The rendered documentation for this component is available on the [Commodore Components Hub](https://hub.syn.tools/openshift4-monitoring).
 
 Documentation for this component is written using [Asciidoc][asciidoc] and [Antora][antora].
-It is located in the [docs/](docs) folder.
-The [Divio documentation structure](https://documentation.divio.com/) is used to organize its content.
+It can be found in the [`docs`](docs) folder.
+We use the [Divio documentation structure](https://documentation.divio.com/) to organize our documentation.
 
 Run the `make docs-serve` command in the root of the project, and then browse to http://localhost:2020 to see a preview of the current state of the documentation.
 
@@ -20,7 +20,7 @@ After writing the documentation, please use the `make docs-vale` command and cor
 ## Contributing and license
 
 This library is licensed under [BSD-3-Clause](LICENSE).
-For information about how to contribute see [CONTRIBUTING](CONTRIBUTING.md).
+For information about how to contribute, see [CONTRIBUTING](CONTRIBUTING.md).
 
 [commodore]: https://syn.tools/commodore/
 [asciidoc]: https://asciidoctor.org/

diff --git a/...alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/...alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml
@@ -46,7 +46,7 @@ spec:
             min by (namespace,service, integration) (
               rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
             /
-              rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
+              ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
             )
             > 0.01
           for: 5m
@@ -99,7 +99,7 @@ spec:
             (
               rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
             /
-              rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
+              ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
             )
             > 0.01
           for: 5m
@@ -429,7 +429,7 @@ spec:
             summary: etcd cluster database is running full.
             syn_component: openshift4-monitoring
           expr: |
-            (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
+            (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
           for: 10m
           labels:
             severity: critical
@@ -1299,6 +1299,19 @@ spec:
             syn_component: openshift4-monitoring
     - name: syn-node-exporter
       rules:
+        - alert: SYN_NodeBondingDegraded
+          annotations:
+            description: Bonding interface {{ $labels.master }} on {{ $labels.instance
+              }} is in degraded state due to one or more slave failures.
+            summary: Bonding interface is degraded
+            syn_component: openshift4-monitoring
+          expr: |
+            (node_bonding_slaves - node_bonding_active) != 0
+          for: 5m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-monitoring
         - alert: SYN_NodeClockNotSynchronising
           annotations:
             description: Clock at {{ $labels.instance }} is not synchronising. Ensure
@@ -2148,7 +2161,7 @@ spec:
             summary: Prometheus operator not ready
             syn_component: openshift4-monitoring
           expr: |
-            min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0)
+            min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0)
           for: 5m
           labels:
             severity: warning
@@ -2162,7 +2175,7 @@ spec:
             summary: Errors while reconciling controller.
             syn_component: openshift4-monitoring
           expr: |
-            (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1
+            (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1
           for: 10m
           labels:
             severity: warning
@@ -2202,7 +2215,7 @@ spec:
             summary: Errors while performing watch operations in controller.
             syn_component: openshift4-monitoring
           expr: |
-            (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4
+            (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4
           for: 15m
           labels:
             severity: warning

diff --git a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml
@@ -46,7 +46,7 @@ spec:
             min by (namespace,service, integration) (
               rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
             /
-              rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
+              ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
             )
             > 0.01
           for: 5m
@@ -99,7 +99,7 @@ spec:
             (
               rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
             /
-              rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
+              ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
             )
             > 0.01
           for: 5m
@@ -429,7 +429,7 @@ spec:
             summary: etcd cluster database is running full.
             syn_component: openshift4-monitoring
           expr: |
-            (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
+            (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
           for: 10m
           labels:
             severity: critical
@@ -1299,6 +1299,19 @@ spec:
             syn_component: openshift4-monitoring
     - name: syn-node-exporter
       rules:
+        - alert: SYN_NodeBondingDegraded
+          annotations:
+            description: Bonding interface {{ $labels.master }} on {{ $labels.instance
+              }} is in degraded state due to one or more slave failures.
+            summary: Bonding interface is degraded
+            syn_component: openshift4-monitoring
+          expr: |
+            (node_bonding_slaves - node_bonding_active) != 0
+          for: 5m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-monitoring
         - alert: SYN_NodeClockNotSynchronising
           annotations:
             description: Clock at {{ $labels.instance }} is not synchronising. Ensure
@@ -2148,7 +2161,7 @@ spec:
             summary: Prometheus operator not ready
             syn_component: openshift4-monitoring
           expr: |
-            min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0)
+            min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0)
           for: 5m
           labels:
             severity: warning
@@ -2162,7 +2175,7 @@ spec:
             summary: Errors while reconciling controller.
             syn_component: openshift4-monitoring
           expr: |
-            (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1
+            (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1
           for: 10m
           labels:
             severity: warning
@@ -2202,7 +2215,7 @@ spec:
             summary: Errors while performing watch operations in controller.
             syn_component: openshift4-monitoring
           expr: |
-            (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4
+            (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4
           for: 15m
           labels:
             severity: warning

diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml
@@ -46,7 +46,7 @@ spec:
             min by (namespace,service, integration) (
               rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
             /
-              rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
+              ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m])
             )
             > 0.01
           for: 5m
@@ -99,7 +99,7 @@ spec:
             (
               rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
             /
-              rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
+              ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m])
             )
             > 0.01
           for: 5m
@@ -429,7 +429,7 @@ spec:
             summary: etcd cluster database is running full.
             syn_component: openshift4-monitoring
           expr: |
-            (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
+            (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
           for: 10m
           labels:
             severity: critical
@@ -1302,6 +1302,19 @@ spec:
             syn_component: openshift4-monitoring
     - name: syn-node-exporter
       rules:
+        - alert: SYN_NodeBondingDegraded
+          annotations:
+            description: Bonding interface {{ $labels.master }} on {{ $labels.instance
+              }} is in degraded state due to one or more slave failures.
+            summary: Bonding interface is degraded
+            syn_component: openshift4-monitoring
+          expr: |
+            (node_bonding_slaves - node_bonding_active) != 0
+          for: 5m
+          labels:
+            severity: warning
+            syn: 'true'
+            syn_component: openshift4-monitoring
         - alert: SYN_NodeClockNotSynchronising
           annotations:
             description: Clock at {{ $labels.instance }} is not synchronising. Ensure
@@ -2151,7 +2164,7 @@ spec:
             summary: Prometheus operator not ready
             syn_component: openshift4-monitoring
           expr: |
-            min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0)
+            min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0)
           for: 5m
           labels:
             severity: warning
@@ -2165,7 +2178,7 @@ spec:
             summary: Errors while reconciling controller.
             syn_component: openshift4-monitoring
           expr: |
-            (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1
+            (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1
           for: 10m
           labels:
             severity: warning
@@ -2205,7 +2218,7 @@ spec:
             summary: Errors while performing watch operations in controller.
             syn_component: openshift4-monitoring
           expr: |
-            (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4
+            (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4
           for: 15m
           labels:
             severity: warning