diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
index 25e555db..6442b13d 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
@@ -11,3 +11,12 @@ groups:
     annotations:
       summary: "Instance {{ $labels.instance }} down"
       description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
+
+  - alert: PrometheusConfigFailed
+    expr: prometheus_config_last_reload_successful == 0
+    for: 0m
+    labels:
+      severity: page
+    annotations:
+      summary: "Prometheus config reload in pod {{ $labels.kubernetes_pod_name }} has failed"
+      description: "Prometheus instance {{ $labels.kubernetes_pod_name }} (`{{ $labels.instance }}`) has failed to reload its config."
diff --git a/kubernetes/namespaces/monitoring/prometheus/deployment.yaml b/kubernetes/namespaces/monitoring/prometheus/deployment.yaml
index 2dd1e2dc..e13f92fe 100644
--- a/kubernetes/namespaces/monitoring/prometheus/deployment.yaml
+++ b/kubernetes/namespaces/monitoring/prometheus/deployment.yaml
@@ -13,6 +13,9 @@ spec:
     metadata:
       labels:
         app: prometheus
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: prometheus
     spec:
       serviceAccountName: prometheus
       containers:
@@ -41,6 +44,32 @@ spec:
               mountPath: /etc/prometheus
             - name: prometheus-alerts
               mountPath: /opt/pydis/prometheus/alerts.d
+        - image: ghcr.io/owl-corp/inotify-base:latest
+          imagePullPolicy: Always
+          name: prometheus-reloader
+          securityContext:
+            readOnlyRootFilesystem: true
+          volumeMounts:
+            - name: prometheus-config
+              mountPath: /opt/monitor/prom-config
+            - name: prometheus-alerts
+              mountPath: /opt/monitor/prom-alerts
+            - name: reloader-hook
+              mountPath: /opt/pydis
+            - name: reloader-tmpfs
+              mountPath: /tmp
+          env:
+            - name: INOTIFY_HOOK_SCRIPT
+              value: /opt/pydis/hook.sh
+            # When a ConfigMap volume updates we see a delete event for the old
+            # container timestamp
+            - name: INOTIFY_WATCH_EVENTS
+              value: delete
+            - name: INOTIFY_HOOK_DELAY
+              value: "5"
+          envFrom:
+            - secretRef:
+                name: prometheus-reloader-env
       restartPolicy: Always
       securityContext:
         fsGroup: 2000
@@ -56,3 +85,11 @@ spec:
         - name: prometheus-alerts
           configMap:
             name: prometheus-alert-rules
+        - name: reloader-hook
+          configMap:
+            name: prometheus-reloader-script
+            defaultMode: 0555  # octal: read + execute only, the hook never needs write bits
+        - name: reloader-tmpfs
+          emptyDir:
+            medium: Memory
+            sizeLimit: 50Mi
diff --git a/kubernetes/namespaces/monitoring/prometheus/reloader-script.yaml b/kubernetes/namespaces/monitoring/prometheus/reloader-script.yaml
new file mode 100644
index 00000000..6aae9b30
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/prometheus/reloader-script.yaml
@@ -0,0 +1,52 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-reloader-script
+  namespace: monitoring
+data:
+  hook.sh: |-
+    #!/bin/sh
+    #
+    # Asks Prometheus to reload its configuration and reports the outcome to the
+    # webhook URL found in the RELOADER_DISCORD_HOOK environment variable.
+
+    set -exo pipefail
+
+    # Endpoint to call to reload Prometheus
+    RELOAD_URL="http://localhost:9090/-/reload"
+    # Icon for the webhook
+    PROMETHEUS_ICON_URL="https://static-00.iconduck.com/assets.00/prometheus-icon-511x512-1vmxbcxr.png"
+
+    echo "Detected change in mounted configmaps, reloading Prometheus..."
+
+    # Make a temporary store to keep any errors, and remove it again on exit so
+    # repeated reloads do not fill up /tmp
+    RESPONSE_STORE="$(mktemp)"
+    trap 'rm -f "$RESPONSE_STORE"' EXIT
+
+    # Attempt the reload, writing the response to the tempfile and the reload HTTP
+    # code to the variable. A curl failure (e.g. connection refused) must not
+    # abort the script under `set -e`: curl still prints "000" as the code, so
+    # the failure notification below is sent.
+    RELOAD_RESULT="$(curl -o "$RESPONSE_STORE" -X POST "$RELOAD_URL" -s -w "%{http_code}")" || true
+
+    # Parse and filter the response body into a JSON string, falling back to the
+    # HTTP code when there is no response body to show
+    RESPONSE_CONTENT="$(cat "$RESPONSE_STORE")"
+    if [ -z "$RESPONSE_CONTENT" ]; then
+      RESPONSE_CONTENT="No response body (HTTP code: $RELOAD_RESULT)"
+    fi
+    FILTERED_BODY="$(printf '%s' "$RESPONSE_CONTENT" | jq -Rsa .)"
+
+    # Send a notification based on pass/failure
+    if [ "$RELOAD_RESULT" = "200" ]; then
+      BODY='{"username": "Prometheus Reloader", "embeds": [{ "title": "Prometheus Config Reload Succeeded", "description": "No errors.", "color": 6663286 } ], "avatar_url": "'"$PROMETHEUS_ICON_URL"'" }'
+    else
+      BODY='{"username": "Prometheus Reloader", "embeds": [{ "title": "Prometheus Config Reload Failed", "description": '"$FILTERED_BODY"', "color": 12799052 } ], "avatar_url": "'"$PROMETHEUS_ICON_URL"'" }'
+    fi
+
+    # The webhook URL is a secret: stop tracing so it is not written to the logs
+    set +x
+
+    # Send the webhook
+    curl -X POST -H "Content-Type: application/json" "$RELOADER_DISCORD_HOOK" -d "$BODY"
diff --git a/kubernetes/namespaces/monitoring/prometheus/secrets.yaml b/kubernetes/namespaces/monitoring/prometheus/secrets.yaml
new file mode 100644
index 00000000..875ab138
Binary files /dev/null and b/kubernetes/namespaces/monitoring/prometheus/secrets.yaml differ