Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alerting #189

Merged
merged 3 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions deploy/qa-ext-values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
externalHost: qa.serval-api.org
environment: Staging
# Rendered by templates/alert-manager-config.yaml as both the SMTP auth
# username and the From address of the alert e-mail receiver.
alertEmail: [email protected]
# Rendered as the receiver's `to` field; several recipients are passed as one
# comma-separated string.
emailsToAlert: '[email protected], [email protected]'
enableTls: true
# Namespace the AlertmanagerConfig and PrometheusRule objects are created in,
# and the namespace label the alert expressions select on.
namespace: serval
auth0Domain: dev-sillsdev.auth0.com
Expand Down
2 changes: 2 additions & 0 deletions deploy/qa-int-values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
externalHost: qa-int.serval-api.org
environment: Staging
# Sender identity for alert mail: the alert-manager-config template uses this
# value for the SMTP auth username and for the From address.
alertEmail: [email protected]
# Recipient of alert mail (the template's `to` field); a single address here.
emailsToAlert: [email protected]
enableTls: true
# Target namespace for the alerting resources; also substituted into the
# namespace selectors of the Prometheus alert expressions.
namespace: nlp
auth0Domain: sil-appbuilder.auth0.com
Expand Down
25 changes: 25 additions & 0 deletions deploy/serval/templates/alert-manager-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Alertmanager routing for this release: every alert handled by this config is
# delivered by e-mail through the single receiver defined below.
# All templated values are piped through `quote` so that an empty value or one
# starting with a YAML indicator still renders as a valid string.
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  name: limit-alerts
  namespace: {{ .Values.namespace | quote }}
spec:
  receivers:
    - name: alert-serval
      emailConfigs:
        # The SMTP password is not stored in the chart; it is read from key
        # `smtp_password` of the `aqua-ml-data` Secret.
        - authPassword:
            name: aqua-ml-data
            key: smtp_password
          # The same mailbox is used to authenticate and as the sender.
          authUsername: {{ .Values.alertEmail | quote }}
          from: {{ .Values.alertEmail | quote }}
          requireTLS: true
          # Also send a notification when an alert clears.
          sendResolved: true
          smarthost: mail.languagetechnology.org:587
          tlsConfig: {}
          # One string; may hold several comma-separated addresses.
          to: {{ .Values.emailsToAlert | quote }}
  route:
    # No grouping labels are listed.
    groupBy: []
    groupInterval: 5m
    groupWait: 10s
    receiver: alert-serval
    # Alerts that are still firing are re-sent at this interval.
    repeatInterval: 4h
63 changes: 63 additions & 0 deletions deploy/serval/templates/prometheus-rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Resource-usage alerts for the workloads running in the release namespace.
# Every rule fires as soon as its expression is true (`for: 0s`) with severity
# `warning`.
#
# Descriptions wrap the Prometheus label template in a Helm string literal so
# Helm prints it verbatim and Prometheus expands it at alert time. The text is
# written directly inside the folded scalar: quote characters placed there
# would be literal and would show up in the notification.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-rules
  namespace: {{ .Values.namespace | quote }}
spec:
  groups:
    # CPU rate over a 3m window, as a percentage of the container's CPU limit,
    # for every container except POD and machine-job.
    - name: cpu
      rules:
        - alert: cpu-80perc
          annotations:
            description: >-
              {{ "{{ $labels.container }}" }} has high CPU for over 3 minutes.
          expr: >-
            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m]))
            by (container) / on (container)
            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"})
            by (container) * 100 >= 80
          for: 0s
          labels:
            severity: warning
    # Same ratio for the machine-job container only, measured over a 3h window.
    - name: cpu-job
      rules:
        - alert: cpu-long-job
          annotations:
            description: >-
              {{ "{{ $labels.container }}" }} has a job running over 3 hours.
          expr: >-
            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h]))
            by (container) / on (container)
            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"})
            by (container) * 100 >= 80
          for: 0s
          labels:
            severity: warning
    # Working-set memory as a percentage of the container's memory limit.
    - name: memory
      rules:
        - alert: memory-near-limit
          annotations:
            description: >-
              {{ "{{ $labels.container }}" }} is over 80% memory.
          expr: >-
            min (container_memory_working_set_bytes{image!="", namespace="{{ .Values.namespace }}", container!="POD" })
            by (container) / on (container)
            min (kube_pod_container_resource_limits{resource="memory", namespace="{{ .Values.namespace }}", container!="POD"})
            by (container) * 100 >= 80
          for: 0s
          labels:
            severity: warning
    # Used bytes as a percentage of the requested storage, per
    # PersistentVolumeClaim. The expression aggregates by
    # persistentvolumeclaim, so that is the only label the description can use.
    - name: disk
      rules:
        - alert: disk-near-limit
          annotations:
            description: >-
              {{ "{{ $labels.persistentvolumeclaim }}" }} is over 80% disk space utilization.
          expr: >-
            min (kubelet_volume_stats_used_bytes{namespace="{{ .Values.namespace }}"})
            by (persistentvolumeclaim) / on (persistentvolumeclaim)
            min (kube_persistentvolumeclaim_resource_requests_storage_bytes{namespace="{{ .Values.namespace }}"})
            by (persistentvolumeclaim) * 100 > 80
          for: 0s
          labels:
            severity: warning
2 changes: 2 additions & 0 deletions deploy/values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
externalHost: prod.serval-api.org
environment: Production
# Mailbox the alert e-mails are sent from; the alert-manager-config template
# also uses it as the SMTP auth username.
alertEmail: [email protected]
# Comma-separated recipient list, rendered into the e-mail receiver's `to`.
emailsToAlert: '[email protected], [email protected]'
enableTls: true
# Namespace used for the alerting resources' metadata and substituted into the
# namespace selectors of the Prometheus alert expressions.
namespace: serval
auth0Domain: languagetechnology.auth0.com
Expand Down