diff --git a/charts/prometheus-prefect-exporter/Chart.yaml b/charts/prometheus-prefect-exporter/Chart.yaml index f84cee9..46f6496 100644 --- a/charts/prometheus-prefect-exporter/Chart.yaml +++ b/charts/prometheus-prefect-exporter/Chart.yaml @@ -8,8 +8,8 @@ maintainers: name: prometheus-prefect-exporter sources: - https://github.com/devops-ia/prometheus-prefect-exporter -version: 0.1.1 -appVersion: "0.2.0" +version: 1.0.0 +appVersion: "1.2.4" home: https://github.com/devops-ia/helm-charts/tree/main/charts/prometheus-prefect-exporter keywords: - prometheus-prefect-exporter diff --git a/charts/prometheus-prefect-exporter/values.yaml b/charts/prometheus-prefect-exporter/values.yaml index 8fe3301..c520ee2 100644 --- a/charts/prometheus-prefect-exporter/values.yaml +++ b/charts/prometheus-prefect-exporter/values.yaml @@ -77,11 +77,74 @@ prometheusRule: enabled: false additionalLabels: {} rules: [] + # - alert: PrefectServerDown + # expr: sum by (pod, namespace) (kube_pod_info{namespace="prefect", pod=~"prefect-server.*"}) == 0 + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Prefect server is down + # description: "Prefect server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectDeploymentsAllPaused + # expr: (sum by (namespace) (prefect_deployments_total) == sum by (namespace) (prefect_info_deployment{is_schedule_active="False"})) + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: All deployments are paused + # description: "All deployments are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectAgentWorkersNotRunning + # expr: sum(kube_deployment_status_replicas{deployment=~".*agent|.*worker"}) == 0 + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: No agents or workers are running + # description: "No agents or workers are running\n VALUE = {{ $value }}\n 
LABELS = {{ $labels }}" + # - alert: PrefectWorkPoolNotAllow + # expr: prefect_work_pools_total == 0 + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: No work pools found + # description: "No work pools found\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectWorkPoolsAllPaused + # expr: (sum by (namespace) (prefect_work_pools_total) == sum by (namespace) (prefect_info_work_pools{is_paused="True"})) + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: All work pools are paused + # description: "All work pools are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectWorkQueueNotAllow + # expr: prefect_work_queues_total == 0 + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: No work queues found + # description: "No work queues found\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectWorkQueuesAllPaused + # expr: (sum by (namespace) (prefect_work_queues_total) == sum by (namespace) (prefect_info_work_queues{is_paused="True"})) + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: All work queues are paused + # description: "All work queues are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # - alert: PrefectFlowRunCrashedOrFailed + # expr: group by (flow_name, flow_run_name, namespace) (count_over_time(prefect_info_flow_runs{state_name=~"Failed|Crashed"}[1m])) + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Flow Run {{ $labels.flow_run_name }} failed or crashed (Flow {{ $labels.flow_name }}) + # description: "Flow Run failed or crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # -- Enable livenessProbe and readinessProbe testConnection: false - # -- Ingress configuration to expose app ingress: enabled: false