diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml
index 6c3270a4..65717cc2 100644
--- a/deploy/qa-ext-values.yaml
+++ b/deploy/qa-ext-values.yaml
@@ -1,5 +1,7 @@
 externalHost: qa.serval-api.org
 environment: Staging
+alertEmail: ext-qa-serval-alerts@languagetechnology.org
+emailsToAlert: 'john_lambert@sil.org, eli_lowry@sil.org'
 enableTls: true
 namespace: serval
 auth0Domain: dev-sillsdev.auth0.com
diff --git a/deploy/qa-int-values.yaml b/deploy/qa-int-values.yaml
index 692bd2f6..1ee3333c 100644
--- a/deploy/qa-int-values.yaml
+++ b/deploy/qa-int-values.yaml
@@ -1,5 +1,7 @@
 externalHost: qa-int.serval-api.org
 environment: Staging
+alertEmail: int-qa-serval-alerts@languagetechnology.org
+emailsToAlert: 'john_lambert@sil.org'
 enableTls: true
 namespace: nlp
 auth0Domain: sil-appbuilder.auth0.com
diff --git a/deploy/serval/templates/alert-manager-config.yaml b/deploy/serval/templates/alert-manager-config.yaml
new file mode 100644
index 00000000..71841611
--- /dev/null
+++ b/deploy/serval/templates/alert-manager-config.yaml
@@ -0,0 +1,29 @@
+# AlertmanagerConfig: e-mails every alert routed to "alert-serval" from the
+# environment's alertEmail account to the emailsToAlert recipients.
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: limit-alerts
+  namespace: {{ .Values.namespace | quote }}
+spec:
+  receivers:
+    - emailConfigs:
+        # SMTP password is referenced by name/key rather than stored in values.
+        - authPassword:
+            name: aqua-ml-data
+            key: smtp_password
+          authUsername: {{ .Values.alertEmail | quote }}
+          from: {{ .Values.alertEmail | quote }}
+          requireTLS: true
+          sendResolved: true
+          smarthost: 'mail.languagetechnology.org:587'
+          tlsConfig: {}
+          # Quoted so a comma-separated recipient list stays one string.
+          to: {{ .Values.emailsToAlert | quote }}
+      name: alert-serval
+  route:
+    groupBy: []
+    groupInterval: 5m
+    groupWait: 10s
+    receiver: alert-serval
+    repeatInterval: 4h
diff --git a/deploy/serval/templates/prometheus-rules.yaml b/deploy/serval/templates/prometheus-rules.yaml
new file mode 100644
index 00000000..400b7daf
--- /dev/null
+++ b/deploy/serval/templates/prometheus-rules.yaml
@@ -0,0 +1,70 @@
+# PrometheusRule: warns when a container or volume in this namespace reaches
+# 80% of its CPU limit, memory limit, or requested storage.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: prometheus-rules
+  namespace: {{ .Values.namespace | quote }}
+spec:
+  groups:
+    - name: cpu
+      rules:
+        # CPU rate over 3m as a percentage of the CPU limit (not machine-job).
+        - alert: cpu-80perc
+          annotations:
+            # The nested template renders a literal Prometheus label reference.
+            description: >-
+              {{ "{{ $labels.container }}" }} has high CPU for over 3 minutes.
+          expr: >-
+            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m]))
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"})
+            by (container) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning
+    - name: cpu-job
+      rules:
+        # Same ratio for machine-job only, using a 3h rate window.
+        - alert: cpu-long-job
+          annotations:
+            description: >-
+              {{ "{{ $labels.container }}" }} has a job running over 3 hours.
+          expr: >-
+            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h]))
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"})
+            by (container) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning
+    - name: memory
+      rules:
+        # Working-set bytes as a percentage of the memory limit.
+        - alert: memory-near-limit
+          annotations:
+            description: >-
+              {{ "{{ $labels.container }}" }} is over 80% memory.
+          expr: >-
+            min (container_memory_working_set_bytes{image!="", namespace="{{ .Values.namespace }}", container!="POD" })
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="memory", namespace="{{ .Values.namespace }}", container!="POD"})
+            by (container) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning
+    - name: disk
+      rules:
+        # The result only carries the persistentvolumeclaim label.
+        - alert: disk-near-limit
+          annotations:
+            description: >-
+              {{ "{{ $labels.persistentvolumeclaim }}" }} is over 80% disk space utilization.
+          expr: >-
+            min (kubelet_volume_stats_used_bytes{namespace="{{ .Values.namespace }}"})
+            by (persistentvolumeclaim) / on (persistentvolumeclaim)
+            min (kube_persistentvolumeclaim_resource_requests_storage_bytes{namespace="{{ .Values.namespace }}"})
+            by (persistentvolumeclaim) * 100 > 80
+          for: 0s
+          labels:
+            severity: warning
diff --git a/deploy/values.yaml b/deploy/values.yaml
index f8175f59..6a58b804 100644
--- a/deploy/values.yaml
+++ b/deploy/values.yaml
@@ -1,5 +1,7 @@
 externalHost: prod.serval-api.org
 environment: Production
+alertEmail: prod-serval-alerts@languagetechnology.org
+emailsToAlert: 'john_lambert@sil.org, eli_lowry@sil.org'
 enableTls: true
 namespace: serval
 auth0Domain: languagetechnology.auth0.com