From ae2394030eccf8d722a654be6a679e0864195ba3 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 18 Oct 2023 14:40:57 -0400
Subject: [PATCH 1/3] Untested, unverified, incomplete

---
NOTE(review): leading whitespace inside the hunks of this series was rebuilt
with 2-space YAML nesting; confirm against the repository before `git am`.
Known gaps in this first draft that later patches in the series address:
  * `?????` placeholders for the SMTP secret, username and smarthost are
    replaced in PATCH 2/3.
  * The alert name interpolates `.Values.namspace` (typo for `namespace`);
    PATCH 3/3 removes the interpolation.
  * The `$labels.container` action in the description is not escaped from
    Helm here; PATCH 3/3 wraps it in a quoted literal.
  * The PrometheusRule has no metadata.name; PATCH 3/3 adds one.

 .../templates/alert-manager-config.yaml       | 25 +++++++++++++++++++
 deploy/serval/templates/prometheus-rules.yaml | 21 ++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 deploy/serval/templates/alert-manager-config.yaml
 create mode 100644 deploy/serval/templates/prometheus-rules.yaml

diff --git a/deploy/serval/templates/alert-manager-config.yaml b/deploy/serval/templates/alert-manager-config.yaml
new file mode 100644
index 00000000..b1a933e2
--- /dev/null
+++ b/deploy/serval/templates/alert-manager-config.yaml
@@ -0,0 +1,25 @@
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: limit-alerts
+  namespace: {{ .Values.namespace }}
+spec:
+  receivers:
+    - emailConfigs:
+        - authPassword:
+            key: ?????
+            name: ?????
+          authUsername: ??????
+          from: serval-dallas@languagetechnology.org
+          requireTLS: true
+          sendResolved: true
+          smarthost: ?????
+          tlsConfig: {}
+          to: 'john_lambert@sil.org, eli_lowry@sil.org'
+      name: alert-nlp
+  route:
+    groupBy: [...]
+    groupInterval: 5m
+    groupWait: 10s
+    receiver: alert-nlp
+    repeatInterval: 4h
\ No newline at end of file
diff --git a/deploy/serval/templates/prometheus-rules.yaml b/deploy/serval/templates/prometheus-rules.yaml
new file mode 100644
index 00000000..5cccbf5e
--- /dev/null
+++ b/deploy/serval/templates/prometheus-rules.yaml
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+    - name: cpu
+      rules:
+        - alert: cpu-80perc-{{ .Values.namspace }}
+          annotations:
+            description: >-
+              '{{ $labels.container }} has high CPU.'
+          expr: >-
+            max(rate (container_cpu_usage_seconds_total {image!="", namespace=~"serval|nlp", container!="POD" } [3m]))
+            by (container, namespace)
+            / on (container, namespace)
+            min(kube_pod_container_resource_limits{resource="cpu", namespace=~"serval|nlp", container!="POD"})
+            by (container, namespace) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning

From e27bfb9285d85fdfa552ed55721fbb5fa879a43e Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 18 Oct 2023 16:45:02 -0400
Subject: [PATCH 2/3] Try 1 for email alerting

---
Sender address and recipients move into per-environment values
(`alertEmail`, `emailsToAlert`) in values.yaml, qa-int-values.yaml and
qa-ext-values.yaml. The SMTP password is read from key `smtp_password` of
`aqua-ml-data` (presumably a Secret in the release namespace - verify).
The templated scalars are rendered unquoted.

 deploy/qa-ext-values.yaml                      |  2 ++
 deploy/qa-int-values.yaml                      |  2 ++
 .../serval/templates/alert-manager-config.yaml | 18 +++++++++---------
 deploy/values.yaml                             |  2 ++
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml
index 6c3270a4..65717cc2 100644
--- a/deploy/qa-ext-values.yaml
+++ b/deploy/qa-ext-values.yaml
@@ -1,5 +1,7 @@
 externalHost: qa.serval-api.org
 environment: Staging
+alertEmail: ext-qa-serval-alerts@languagetechnology.org
+emailsToAlert: 'john_lambert@sil.org, eli_lowry@sil.org'
 enableTls: true
 namespace: serval
 auth0Domain: dev-sillsdev.auth0.com
diff --git a/deploy/qa-int-values.yaml b/deploy/qa-int-values.yaml
index 692bd2f6..1ee3333c 100644
--- a/deploy/qa-int-values.yaml
+++ b/deploy/qa-int-values.yaml
@@ -1,5 +1,7 @@
 externalHost: qa-int.serval-api.org
 environment: Staging
+alertEmail: int-qa-serval-alerts@languagetechnology.org
+emailsToAlert: john_lambert@sil.org
 enableTls: true
 namespace: nlp
 auth0Domain: sil-appbuilder.auth0.com
diff --git a/deploy/serval/templates/alert-manager-config.yaml b/deploy/serval/templates/alert-manager-config.yaml
index b1a933e2..71841611 100644
--- a/deploy/serval/templates/alert-manager-config.yaml
+++ b/deploy/serval/templates/alert-manager-config.yaml
@@ -7,19 +7,19 @@ spec:
   receivers:
     - emailConfigs:
         - authPassword:
-            key: ?????
-            name: ?????
-          authUsername: ??????
-          from: serval-dallas@languagetechnology.org
+            name: aqua-ml-data
+            key: smtp_password
+          authUsername: {{ .Values.alertEmail }}
+          from: {{ .Values.alertEmail }}
           requireTLS: true
           sendResolved: true
-          smarthost: ?????
+          smarthost: mail.languagetechnology.org:587
           tlsConfig: {}
-          to: 'john_lambert@sil.org, eli_lowry@sil.org'
-      name: alert-nlp
+          to: {{ .Values.emailsToAlert }}
+      name: alert-serval
   route:
-    groupBy: [...]
+    groupBy: []
     groupInterval: 5m
     groupWait: 10s
-    receiver: alert-nlp
+    receiver: alert-serval
     repeatInterval: 4h
\ No newline at end of file
diff --git a/deploy/values.yaml b/deploy/values.yaml
index f8175f59..6a58b804 100644
--- a/deploy/values.yaml
+++ b/deploy/values.yaml
@@ -1,5 +1,7 @@
 externalHost: prod.serval-api.org
 environment: Production
+alertEmail: prod-serval-alerts@languagetechnology.org
+emailsToAlert: 'john_lambert@sil.org, eli_lowry@sil.org'
 enableTls: true
 namespace: serval
 auth0Domain: languagetechnology.auth0.com

From f1f3bc6409fdb9acb647e549111b87dd078a33a2 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Thu, 19 Oct 2023 15:04:33 -0400
Subject: [PATCH 3/3] Rules are working!

---
Rules are now scoped to `.Values.namespace` instead of the fixed
`serval|nlp` regex, aggregate with min() where PATCH 1/3 used max(), and
gain cpu-job, memory and disk groups. Every alert uses `for: 0s`.
The Prometheus `$labels.container` action is emitted through a quoted
Helm string literal, and the rule object gets a metadata.name.

 deploy/serval/templates/prometheus-rules.yaml | 56 ++++++++++++++++---
 1 file changed, 49 insertions(+), 7 deletions(-)

diff --git a/deploy/serval/templates/prometheus-rules.yaml b/deploy/serval/templates/prometheus-rules.yaml
index 5cccbf5e..400b7daf 100644
--- a/deploy/serval/templates/prometheus-rules.yaml
+++ b/deploy/serval/templates/prometheus-rules.yaml
@@ -1,21 +1,63 @@
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
+  name: prometheus-rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
     - name: cpu
       rules:
-        - alert: cpu-80perc-{{ .Values.namspace }}
+        - alert: cpu-80perc
           annotations:
             description: >-
-              '{{ $labels.container }} has high CPU.'
+              '{{ "{{ $labels.container }}" }} has high CPU for over 3 minutes.'
           expr: >-
-            max(rate (container_cpu_usage_seconds_total {image!="", namespace=~"serval|nlp", container!="POD" } [3m]))
-            by (container, namespace)
-            / on (container, namespace)
-            min(kube_pod_container_resource_limits{resource="cpu", namespace=~"serval|nlp", container!="POD"})
-            by (container, namespace) * 100 >= 80
+            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m]))
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"})
+            by (container) * 100 >= 80
           for: 0s
           labels:
             severity: warning
+    - name: cpu-job
+      rules:
+        - alert: cpu-long-job
+          annotations:
+            description: >-
+              '{{ "{{ $labels.container }}" }} has a job running over 3 hours.'
+          expr: >-
+            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h]))
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"})
+            by (container) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning
+    - name: memory
+      rules:
+        - alert: memory-near-limit
+          annotations:
+            description: >-
+              '{{ "{{ $labels.container }}" }} is over 80% memory.'
+          expr: >-
+            min (container_memory_working_set_bytes{image!="", namespace="{{ .Values.namespace }}", container!="POD" })
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="memory", namespace="{{ .Values.namespace }}", container!="POD"})
+            by (container) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning
+    - name: disk
+      rules:
+        - alert: disk-near-limit
+          annotations:
+            description: >-
+              '{{ "{{ $labels.container }}" }} is over 80% disk space utilization.'
+          expr: >-
+            min (kubelet_volume_stats_used_bytes{namespace="{{ .Values.namespace }}"})
+            by (persistentvolumeclaim) / on (persistentvolumeclaim)
+            min (kube_persistentvolumeclaim_resource_requests_storage_bytes{namespace="{{ .Values.namespace }}"})
+            by (persistentvolumeclaim) * 100 > 80
+          for: 0s
+          labels:
+            severity: warning
\ No newline at end of file