From 9adeeaf34af86e104d10b67ce63fb6caaeaed89b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20Alejandro=20Marug=C3=A1n?= Date: Fri, 20 Oct 2023 10:08:04 +0200 Subject: [PATCH] feat: add default PrometheusRules --- charts/prometheus-prefect-exporter/Chart.yaml | 4 +- .../prometheus-prefect-exporter/values.yaml | 65 ++++++++++++++++++- 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/charts/prometheus-prefect-exporter/Chart.yaml b/charts/prometheus-prefect-exporter/Chart.yaml index f84cee9..46f6496 100644 --- a/charts/prometheus-prefect-exporter/Chart.yaml +++ b/charts/prometheus-prefect-exporter/Chart.yaml @@ -8,8 +8,8 @@ maintainers: name: prometheus-prefect-exporter sources: - https://github.com/devops-ia/prometheus-prefect-exporter -version: 0.1.1 -appVersion: "0.2.0" +version: 1.0.0 +appVersion: "1.2.4" home: https://github.com/devops-ia/helm-charts/tree/main/charts/prometheus-prefect-exporter keywords: - prometheus-prefect-exporter diff --git a/charts/prometheus-prefect-exporter/values.yaml b/charts/prometheus-prefect-exporter/values.yaml index 8fe3301..c520ee2 100644 --- a/charts/prometheus-prefect-exporter/values.yaml +++ b/charts/prometheus-prefect-exporter/values.yaml @@ -77,11 +77,74 @@ prometheusRule: enabled: false additionalLabels: {} rules: [] + # - alert: PrefectServerDown + # expr: absent(kube_pod_info{namespace="prefect", pod=~"prefect-server.*"}) + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Prefect server is down + # description: "Prefect server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectDeploymentsAllPaused + # expr: (sum by (namespace) (prefect_deployments_total) == sum by (namespace) (prefect_info_deployment{is_schedule_active="False"})) + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: All deployments are paused + # description: "All 
deployments are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectAgentWorkersNotRunning + # expr: sum(kube_deployment_status_replicas{deployment=~".*agent|.*worker"}) == 0 + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: No agents or workers are running + # description: "No agents or workers are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectWorkPoolNotAllow + # expr: prefect_work_pools_total == 0 + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: No work pools available + # description: "No work pools available\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectWorkPoolsAllPaused + # expr: (sum by (namespace) (prefect_work_pools_total) == sum by (namespace) (prefect_info_work_pools{is_paused="True"})) + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: All work pools are paused + # description: "All work pools are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectWorkQueueNotAllow + # expr: prefect_work_queues_total == 0 + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: No work queues available + # description: "No work queues available\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectWorkQueuesAllPaused + # expr: (sum by (namespace) (prefect_work_queues_total) == sum by (namespace) (prefect_info_work_queues{is_paused="True"})) + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: All work queues are paused + # description: "All work queues are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectFlowRunCrashedOrFailed + # expr: group by (flow_name, flow_run_name, namespace) (count_over_time(prefect_info_flow_runs{state_name=~"Failed|Crashed"}[1m])) + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Flow Run {{ $labels.flow_run_name }} failed or crashed
(Flow {{ $labels.flow_name }}) + # description: "Flow Run failed or crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # -- Enable livenessProbe and readinessProbe testConnection: false - # -- Ingress configuration to expose app ingress: enabled: false