Skip to content

Commit

Permalink
Create hoprd node alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
ausias-armesto committed Oct 18, 2024
1 parent e314f70 commit 5987d73
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 3 deletions.
2 changes: 1 addition & 1 deletion charts/hoprd-operator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

apiVersion: v2
name: hoprd-operator
version: 0.2.10
version: 0.2.11
appVersion: 0.2.14
description: A Helm chart operator for managing Hopr nodes
type: application
Expand Down
56 changes: 56 additions & 0 deletions charts/hoprd-operator/templates/prometheus-rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# PrometheusRule with the alerts raised for hoprd nodes managed by this chart.
# Alert annotations use Prometheus templating; the doubled braces are escaped
# with Helm raw strings so that Helm emits them verbatim instead of evaluating
# them at chart render time.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations:
    prometheus-operator-validated: "true"
  labels:
    prometheus_rule: "true"
  name: hoprd-node-rules
spec:
  groups:
    - name: hoprd-nodes
      rules:
        # The hoprd process start time is within 10 seconds of the pod start
        # time, i.e. the whole pod was (re)started.
        - alert: HoprdNodePodRestarted
          annotations:
            # The expression aggregates by (pod), so pod is the only label
            # left on the alert; namespace and job would render empty.
            description: Hoprd node pod {{`{{`}} $labels.pod {{`}}`}} restarted.
            summary: Hoprd node restarted.
          expr: |-
            # When the node is running for less than 1 minute
            abs(sum (kube_pod_start_time) by (pod) - sum (hopr_up) by (pod)) <= 10 AND avg((time() - hopr_up < 60)) by (pod)
          # Fire on the first matching evaluation: the expression only holds
          # during the first 60 seconds after start-up (assuming hopr_up is
          # the process start timestamp, as the expression implies), so any
          # hold period of 1m or more could never be satisfied.
          for: 0m
          labels:
            severity: critical
            environment: {{ .Values.environmentName | quote }}
        # The hoprd process start time differs from the pod start time by
        # more than 10 seconds, i.e. the process restarted inside a pod that
        # kept running.
        - alert: HoprdNodeProcessRebooted
          annotations:
            # Only the pod label survives the by (pod) aggregation.
            description: Hoprd node pod {{`{{`}} $labels.pod {{`}}`}} process rebooted.
            summary: Hoprd node rebooted (OOM).
          expr: |-
            # When the pod is running correctly and the node reboots internally without affecting the pod
            abs(sum (kube_pod_start_time) by (pod) - sum (hopr_up) by (pod)) > 10 AND avg((time() - hopr_up < 60)) by (pod)
          # Same reasoning as HoprdNodePodRestarted: the 60 second window
          # cannot outlast a hold period, so fire immediately.
          for: 0m
          labels:
            severity: critical
            environment: {{ .Values.environmentName | quote }}
        # The reported network health value changed on a node that has been
        # up long enough to be past its start-up phase.
        - alert: HoprdHealthChanged
          annotations:
            description: Hoprd node {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job {{`}}`}} network health changed.
            summary: Hoprd node network health changed.
          expr: |-
            # When the node is running for more than 15 minutes and the health status changes in the last 5 minutes
            avg((time() - hopr_up) > 900 and changes(hopr_network_health[5m]) > 0) by (namespace,job)
          for: 1m
          labels:
            severity: critical
            environment: {{ .Values.environmentName | quote }}
        # Sustained rate of failed HTTP API calls on a node that is past its
        # start-up phase.
        - alert: HoprdAPIErrors
          annotations:
            description: Hoprd api call errors on {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job {{`}}`}}.
            summary: Hoprd api call errors.
          expr: |-
            # When the node is running for more than 15 minutes and the http api call errors are more than 30 in the last 10 minutes
            # Errors are the calls whose status does not start with 1, 2 or 3
            sum(increase(hopr_http_api_call_count{status!~"[123].*"}[10m])) by (namespace, job) > 30 and avg((time() - hopr_up)) by (namespace,job) > 900
          for: 10m
          labels:
            severity: critical
            # Quoted so that an empty or boolean/number-looking value still
            # renders as a string label instead of null or a retyped scalar.
            environment: {{ .Values.environmentName | quote }}
2 changes: 1 addition & 1 deletion charts/hoprd-operator/values-prod.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

environmentName: prod
operator:
persistence:
storageClassName: ceph-ephimeral
Expand Down
2 changes: 1 addition & 1 deletion charts/hoprd-operator/values-staging.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

environmentName: staging
operator:
persistence:
storageClassName: ceph-ephimeral
Expand Down
2 changes: 2 additions & 0 deletions charts/hoprd-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ nameOverride: ""
##
fullnameOverride: ""

environmentName: ""

## @section Replicator Parameters
##

Expand Down

0 comments on commit 5987d73

Please sign in to comment.