Skip to content

Commit

Permalink
Alerting (#189)
Browse files Browse the repository at this point in the history
Add Prometheus rules for alerting on CPU, Memory and disk space.
  • Loading branch information
johnml1135 authored Oct 20, 2023
1 parent 83d08d7 commit f29be64
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 0 deletions.
2 changes: 2 additions & 0 deletions deploy/qa-ext-values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
externalHost: qa.serval-api.org
environment: Staging
# Mailbox the alerting templates use both as the SMTP login (authUsername)
# and as the "from" address of alert e-mails.
alertEmail: [email protected]
# Recipient list rendered into the e-mail receiver's "to" field; several
# addresses are given as one comma-separated string.
emailsToAlert: '[email protected], [email protected]'
enableTls: true
# Namespace the alerting resources are created in; also used as the
# namespace selector inside the alert expressions.
namespace: serval
auth0Domain: dev-sillsdev.auth0.com
Expand Down
2 changes: 2 additions & 0 deletions deploy/qa-int-values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
externalHost: qa-int.serval-api.org
environment: Staging
# Mailbox the alerting templates use both as the SMTP login (authUsername)
# and as the "from" address of alert e-mails.
alertEmail: [email protected]
# Recipient rendered into the e-mail receiver's "to" field.
emailsToAlert: [email protected]
enableTls: true
# Namespace the alerting resources are created in; also used as the
# namespace selector inside the alert expressions.
namespace: nlp
auth0Domain: sil-appbuilder.auth0.com
Expand Down
25 changes: 25 additions & 0 deletions deploy/serval/templates/alert-manager-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# AlertmanagerConfig that e-mails every alert routed to it.
#
# Chart values read by this template:
#   namespace     - namespace this resource is created in
#   alertEmail    - SMTP login and "from" address of the alert e-mails
#   emailsToAlert - recipient address(es), as a single string
#
# All templated scalars go through `quote` so that an empty, boolean-looking
# or otherwise YAML-special value still renders as a string.
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  name: limit-alerts
  namespace: {{ .Values.namespace | quote }}
spec:
  receivers:
    - name: alert-serval
      emailConfigs:
        - smarthost: 'mail.languagetechnology.org:587'
          requireTLS: true
          tlsConfig: {}
          authUsername: {{ .Values.alertEmail | quote }}
          # The SMTP password is read from key `smtp_password` of the
          # `aqua-ml-data` secret rather than from chart values.
          authPassword:
            name: aqua-ml-data
            key: smtp_password
          from: {{ .Values.alertEmail | quote }}
          to: {{ .Values.emailsToAlert | quote }}
          # Also send a mail when an alert stops firing.
          sendResolved: true
  route:
    receiver: alert-serval
    # No grouping labels are configured.
    groupBy: []
    groupWait: 10s
    groupInterval: 5m
    repeatInterval: 4h
63 changes: 63 additions & 0 deletions deploy/serval/templates/prometheus-rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# PrometheusRule with warning-level alerts on CPU, memory and disk usage for
# the namespace given by the chart value `namespace`.
#
# Every expression has the shape  min(usage) by (X) / min(limit) by (X) * 100
# and fires as soon as the ratio crosses 80 (`for: 0s`).
#
# The descriptions use Prometheus' own template syntax; it is wrapped in a
# Helm string literal so Helm emits it verbatim instead of evaluating it.
#
# NOTE(review): each rule sets an explicit `namespace` label because the
# expressions aggregate that label away, while the Prometheus Operator by
# default matches a namespaced AlertmanagerConfig route on `namespace`.
# Confirm against the cluster's Alertmanager matcher strategy.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-rules
  namespace: {{ .Values.namespace | quote }}
spec:
  groups:
    # CPU as a percentage of the container CPU limit, rate taken over a
    # 3 minute window; the `machine-job` container is handled separately.
    - name: cpu
      rules:
        - alert: cpu-80perc
          annotations:
            description: >-
              {{ "{{ $labels.container }}" }} has high CPU for over 3 minutes.
          expr: >-
            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m]))
            by (container) / on (container)
            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"})
            by (container) * 100 >= 80
          for: 0s
          labels:
            severity: warning
            namespace: {{ .Values.namespace | quote }}
    # Same ratio for the `machine-job` container only, with the rate taken
    # over a 3 hour window.
    - name: cpu-job
      rules:
        - alert: cpu-long-job
          annotations:
            description: >-
              {{ "{{ $labels.container }}" }} has a job running over 3 hours.
          expr: >-
            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h]))
            by (container) / on (container)
            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"})
            by (container) * 100 >= 80
          for: 0s
          labels:
            severity: warning
            namespace: {{ .Values.namespace | quote }}
    # Working-set memory as a percentage of the container memory limit.
    - name: memory
      rules:
        - alert: memory-near-limit
          annotations:
            description: >-
              {{ "{{ $labels.container }}" }} is over 80% memory.
          expr: >-
            min (container_memory_working_set_bytes{image!="", namespace="{{ .Values.namespace }}", container!="POD" })
            by (container) / on (container)
            min (kube_pod_container_resource_limits{resource="memory", namespace="{{ .Values.namespace }}", container!="POD"})
            by (container) * 100 >= 80
          for: 0s
          labels:
            severity: warning
            namespace: {{ .Values.namespace | quote }}
    # Used bytes of each persistent volume claim as a percentage of the
    # storage it requested.
    - name: disk
      rules:
        - alert: disk-near-limit
          annotations:
            # The expression is grouped by persistentvolumeclaim, so that is
            # the only series label available to the description.
            description: >-
              {{ "{{ $labels.persistentvolumeclaim }}" }} is over 80% disk space utilization.
          expr: >-
            min (kubelet_volume_stats_used_bytes{namespace="{{ .Values.namespace }}"})
            by (persistentvolumeclaim) / on (persistentvolumeclaim)
            min (kube_persistentvolumeclaim_resource_requests_storage_bytes{namespace="{{ .Values.namespace }}"})
            by (persistentvolumeclaim) * 100 > 80
          for: 0s
          labels:
            severity: warning
            namespace: {{ .Values.namespace | quote }}
2 changes: 2 additions & 0 deletions deploy/values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
externalHost: prod.serval-api.org
environment: Production
# Mailbox the alerting templates use both as the SMTP login (authUsername)
# and as the "from" address of alert e-mails.
alertEmail: [email protected]
# Recipient list rendered into the e-mail receiver's "to" field; several
# addresses are given as one comma-separated string.
emailsToAlert: '[email protected], [email protected]'
enableTls: true
# Namespace the alerting resources are created in; also used as the
# namespace selector inside the alert expressions.
namespace: serval
auth0Domain: languagetechnology.auth0.com
Expand Down

0 comments on commit f29be64

Please sign in to comment.