diff --git a/deploy/serval/templates/alert-manager-config.yaml b/deploy/serval/templates/alert-manager-config.yaml
new file mode 100644
index 00000000..b1a933e2
--- /dev/null
+++ b/deploy/serval/templates/alert-manager-config.yaml
@@ -0,0 +1,28 @@
+# AlertmanagerConfig: routes alerts to the `alert-nlp` e-mail receiver.
+# TODO: replace the '?????' placeholders (secret ref, username, smarthost) before deploying.
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: limit-alerts
+  namespace: {{ .Values.namespace | quote }}
+spec:
+  receivers:
+    - emailConfigs:
+        - authPassword:
+            key: '?????'
+            name: '?????'
+          authUsername: '??????'
+          from: serval-dallas@languagetechnology.org
+          requireTLS: true
+          sendResolved: true
+          smarthost: '?????'
+          tlsConfig: {}
+          to: 'john_lambert@sil.org, eli_lowry@sil.org'
+      name: alert-nlp
+  route:
+    # '...' is Alertmanager's special value meaning "group by all labels".
+    groupBy: ['...']
+    groupInterval: 5m
+    groupWait: 10s
+    receiver: alert-nlp
+    repeatInterval: 4h
diff --git a/deploy/serval/templates/prometheus-rules.yaml b/deploy/serval/templates/prometheus-rules.yaml
new file mode 100644
index 00000000..5cccbf5e
--- /dev/null
+++ b/deploy/serval/templates/prometheus-rules.yaml
@@ -0,0 +1,26 @@
+# PrometheusRule: warns when, per (container, namespace) in serval|nlp, the highest
+# 3m CPU usage rate reaches 80% of the lowest configured CPU limit.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  # NOTE(review): metadata.name was missing (Kubernetes requires it); confirm this name.
+  name: limit-alerts
+  namespace: {{ .Values.namespace | quote }}
+spec:
+  groups:
+    - name: cpu
+      rules:
+        - alert: cpu-80perc-{{ .Values.namespace }}
+          annotations:
+            # Backtick raw string keeps Helm from evaluating $labels; Prometheus expands it.
+            description: >-
+              '{{`{{ $labels.container }}`}} has high CPU.'
+          expr: >-
+            max(rate (container_cpu_usage_seconds_total {image!="", namespace=~"serval|nlp", container!="POD" } [3m]))
+            by (container, namespace)
+            / on (container, namespace)
+            min(kube_pod_container_resource_limits{resource="cpu", namespace=~"serval|nlp", container!="POD"})
+            by (container, namespace) * 100 >= 80
+          for: 0s
+          labels:
+            severity: warning