From ae2394030eccf8d722a654be6a679e0864195ba3 Mon Sep 17 00:00:00 2001
From: John Lambert <john_lambert@sil.org>
Date: Wed, 18 Oct 2023 14:40:57 -0400
Subject: [PATCH 1/3] Untested, unverified, incomplete

---
 .../templates/alert-manager-config.yaml       | 25 +++++++++++++++++++
 deploy/serval/templates/prometheus-rules.yaml | 21 ++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 deploy/serval/templates/alert-manager-config.yaml
 create mode 100644 deploy/serval/templates/prometheus-rules.yaml

diff --git a/deploy/serval/templates/alert-manager-config.yaml b/deploy/serval/templates/alert-manager-config.yaml
new file mode 100644
index 00000000..b1a933e2
--- /dev/null
+++ b/deploy/serval/templates/alert-manager-config.yaml
@@ -0,0 +1,25 @@
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig  # defines one e-mail receiver and the route that delivers alerts to it
+metadata:
+  name: limit-alerts
+  namespace: {{ .Values.namespace }}
+spec:
+  receivers:
+    - emailConfigs:
+        - authPassword:
+            key: ?????
+            name: ?????
+          authUsername: ??????
+          from: serval-dallas@languagetechnology.org
+          requireTLS: true
+          sendResolved: true
+          smarthost: ?????
+          tlsConfig: {}
+          to: 'john_lambert@sil.org, eli_lowry@sil.org'
+      name: alert-nlp
+  route:
+    groupBy: [...]
+    groupInterval: 5m
+    groupWait: 10s
+    receiver: alert-nlp
+    repeatInterval: 4h
\ No newline at end of file
diff --git a/deploy/serval/templates/prometheus-rules.yaml b/deploy/serval/templates/prometheus-rules.yaml
new file mode 100644
index 00000000..5cccbf5e
--- /dev/null
+++ b/deploy/serval/templates/prometheus-rules.yaml
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+    - name: cpu
+      rules:
+        - alert: cpu-80perc-{{ .Values.namspace }}
+          annotations:
+            description: >-
+              '{{ $labels.container }} has high CPU.'
+          expr: >-
+            max(rate (container_cpu_usage_seconds_total {image!="", namespace=~"serval|nlp", container!="POD" } [3m]))
+            by (container, namespace) 
+            / on (container, namespace) 
+            min(kube_pod_container_resource_limits{resource="cpu", namespace=~"serval|nlp", container!="POD"}) 
+            by (container, namespace) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning

From e27bfb9285d85fdfa552ed55721fbb5fa879a43e Mon Sep 17 00:00:00 2001
From: John Lambert <john_lambert@sil.org>
Date: Wed, 18 Oct 2023 16:45:02 -0400
Subject: [PATCH 2/3] Try 1 for email alerting

---
 deploy/qa-ext-values.yaml                      |  2 ++
 deploy/qa-int-values.yaml                      |  2 ++
 .../serval/templates/alert-manager-config.yaml | 18 +++++++++---------
 deploy/values.yaml                             |  2 ++
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml
index 6c3270a4..65717cc2 100644
--- a/deploy/qa-ext-values.yaml
+++ b/deploy/qa-ext-values.yaml
@@ -1,5 +1,7 @@
 externalHost: qa.serval-api.org
 environment: Staging
+alertEmail: ext-qa-serval-alerts@languagetechnology.org  # SMTP login and From address of the alert e-mail receiver
+emailsToAlert: 'john_lambert@sil.org, eli_lowry@sil.org'  # value of the receiver's `to` field (recipients)
 enableTls: true
 namespace: serval
 auth0Domain: dev-sillsdev.auth0.com
diff --git a/deploy/qa-int-values.yaml b/deploy/qa-int-values.yaml
index 692bd2f6..1ee3333c 100644
--- a/deploy/qa-int-values.yaml
+++ b/deploy/qa-int-values.yaml
@@ -1,5 +1,7 @@
 externalHost: qa-int.serval-api.org
 environment: Staging
+alertEmail: int-qa-serval-alerts@languagetechnology.org  # SMTP login and From address of the alert e-mail receiver
+emailsToAlert: john_lambert@sil.org  # value of the receiver's `to` field (recipients)
 enableTls: true
 namespace: nlp
 auth0Domain: sil-appbuilder.auth0.com
diff --git a/deploy/serval/templates/alert-manager-config.yaml b/deploy/serval/templates/alert-manager-config.yaml
index b1a933e2..71841611 100644
--- a/deploy/serval/templates/alert-manager-config.yaml
+++ b/deploy/serval/templates/alert-manager-config.yaml
@@ -7,19 +7,19 @@ spec:
   receivers:
     - emailConfigs:
         - authPassword:
-            key: ?????
-            name: ?????
-          authUsername: ??????
-          from: serval-dallas@languagetechnology.org
+            name: aqua-ml-data  # Secret that holds the SMTP password
+            key: smtp_password  # key inside that Secret
+          authUsername: {{ .Values.alertEmail | quote }}
+          from: {{ .Values.alertEmail | quote }}
           requireTLS: true
           sendResolved: true
-          smarthost: ?????
+          smarthost: mail.languagetechnology.org:587
           tlsConfig: {}
-          to: 'john_lambert@sil.org, eli_lowry@sil.org'
-      name: alert-nlp
+          to: {{ .Values.emailsToAlert | quote }}  # quoted so an empty or unusual value still renders as a string
+      name: alert-serval
   route:
-    groupBy: [...]
+    groupBy: []
     groupInterval: 5m
     groupWait: 10s
-    receiver: alert-nlp
+    receiver: alert-serval  # must match the receiver name defined above
     repeatInterval: 4h
\ No newline at end of file
diff --git a/deploy/values.yaml b/deploy/values.yaml
index f8175f59..6a58b804 100644
--- a/deploy/values.yaml
+++ b/deploy/values.yaml
@@ -1,5 +1,7 @@
 externalHost: prod.serval-api.org
 environment: Production
+alertEmail: prod-serval-alerts@languagetechnology.org  # SMTP login and From address of the alert e-mail receiver
+emailsToAlert: 'john_lambert@sil.org, eli_lowry@sil.org'  # value of the receiver's `to` field (recipients)
 enableTls: true
 namespace: serval
 auth0Domain: languagetechnology.auth0.com

From f1f3bc6409fdb9acb647e549111b87dd078a33a2 Mon Sep 17 00:00:00 2001
From: John Lambert <john_lambert@sil.org>
Date: Thu, 19 Oct 2023 15:04:33 -0400
Subject: [PATCH 3/3] Rules are working!

---
 deploy/serval/templates/prometheus-rules.yaml | 56 ++++++++++++++++---
 1 file changed, 49 insertions(+), 7 deletions(-)

diff --git a/deploy/serval/templates/prometheus-rules.yaml b/deploy/serval/templates/prometheus-rules.yaml
index 5cccbf5e..400b7daf 100644
--- a/deploy/serval/templates/prometheus-rules.yaml
+++ b/deploy/serval/templates/prometheus-rules.yaml
@@ -1,21 +1,63 @@
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
+  name: prometheus-rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
     - name: cpu
       rules:
-        - alert: cpu-80perc-{{ .Values.namspace }}
+        - alert: cpu-80perc  # NOTE(review): usage side switched from max() to min() in this change; confirm that is intended
           annotations:
             description: >-
-              '{{ $labels.container }} has high CPU.'
+              {{ "{{ $labels.container }}" }} has high CPU for over 3 minutes.
           expr: >-
-            max(rate (container_cpu_usage_seconds_total {image!="", namespace=~"serval|nlp", container!="POD" } [3m]))
-            by (container, namespace) 
-            / on (container, namespace) 
-            min(kube_pod_container_resource_limits{resource="cpu", namespace=~"serval|nlp", container!="POD"}) 
-            by (container, namespace) * 100 >= 80
+            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m]))
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"})
+            by (container) * 100 >= 80
           for: 0s
           labels:
             severity: warning
+    - name: cpu-job
+      rules:
+        - alert: cpu-long-job  # machine-job only (excluded from cpu-80perc); same CPU/limit ratio over a 3h window
+          annotations:  # the nested string literal makes Helm emit the $labels placeholder verbatim for Prometheus to expand
+            description: >-
+              {{ "{{ $labels.container }}" }} has a job running over 3 hours.
+          expr: >-
+            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h]))
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"})
+            by (container) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning
+    - name: memory
+      rules:
+        - alert: memory-near-limit  # working-set bytes as a percentage of the container memory limit
+          annotations:
+            description: >-
+              {{ "{{ $labels.container }}" }} is over 80% memory.
+          expr: >-
+            min (container_memory_working_set_bytes{image!="", namespace="{{ .Values.namespace }}", container!="POD" })
+            by (container) / on (container)
+            min (kube_pod_container_resource_limits{resource="memory", namespace="{{ .Values.namespace }}", container!="POD"})
+            by (container) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning
+    - name: disk
+      rules:
+        - alert: disk-near-limit  # result is keyed by persistentvolumeclaim only, so the message uses that label
+          annotations:
+            description: >-
+              {{ "{{ $labels.persistentvolumeclaim }}" }} is over 80% disk space utilization.
+          expr: >-
+            min (kubelet_volume_stats_used_bytes{namespace="{{ .Values.namespace }}"})
+            by (persistentvolumeclaim) / on (persistentvolumeclaim)
+            min (kube_persistentvolumeclaim_resource_requests_storage_bytes{namespace="{{ .Values.namespace }}"})
+            by (persistentvolumeclaim) * 100 > 80
+          for: 0s
+          labels:
+            severity: warning
\ No newline at end of file