Skip to content

Commit

Permalink
feat: add default PrometheusRules
Browse files Browse the repository at this point in the history
  • Loading branch information
ialejandro committed Oct 20, 2023
1 parent 0877f85 commit 9adeeaf
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 3 deletions.
4 changes: 2 additions & 2 deletions charts/prometheus-prefect-exporter/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ maintainers:
name: prometheus-prefect-exporter
sources:
- https://github.com/devops-ia/prometheus-prefect-exporter
version: 0.1.1
appVersion: "0.2.0"
version: 1.0.0
appVersion: "1.2.4"
home: https://github.com/devops-ia/helm-charts/tree/main/charts/prometheus-prefect-exporter
keywords:
- prometheus-prefect-exporter
Expand Down
65 changes: 64 additions & 1 deletion charts/prometheus-prefect-exporter/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,74 @@ prometheusRule:
enabled: false
additionalLabels: {}
rules: []
# - alert: PrefectServerDown
# expr: sum by (pod, namespace) (kube_pod_info{namespace="prefect", pod=~"prefect-server.*"}) == 0
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: Prefect server is down
# description: "Prefect server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrefectDeploymentsAllPaused
# expr: (sum by (namespace) (prefect_deployments_total) == bool sum by (namespace) (prefect_info_deployment{is_schedule_active="False"}))
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: All deployments are paused
# description: "All deployments are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrefectAgentWorkersNotRunning
# expr: sum(kube_deployment_status_replicas{deployment=~".*agent|.*worker"}) == 0
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: No agents or workers are running
# description: "No agents or workers are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrefectWorkPoolNotAllow
# expr: prefect_work_pools_total == 0
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: No work pools exist
# description: "No work pools exist\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrefectWorkPoolsAllPaused
# expr: (sum by (namespace) (prefect_work_pools_total) == bool sum by (namespace) (prefect_info_work_pools{is_paused="True"}))
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: All work pools are paused
# description: "All work pools are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrefectWorkQueueNotAllow
# expr: prefect_work_queues_total == 0
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: No work queues exist
# description: "No work queues exist\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrefectWorkQueuesAllPaused
# expr: (sum by (namespace) (prefect_work_queues_total) == bool sum by (namespace) (prefect_info_work_queues{is_paused="True"}))
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: All work queues are paused
# description: "All work queues are paused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# - alert: PrefectFlowRunCrashedOrFailed
# expr: group by (flow_name, flow_run_name, namespace) (count_over_time(prefect_info_flow_runs{state_name=~"Failed|Crashed"}[1m]))
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: Flow Run {{ $labels.flow_run_name }} failed or crashed (Flow {{ $labels.flow_name }})
# description: "Flow Run failed or crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# -- Enable livenessProbe and readinessProbe
testConnection: false


# -- Ingress configuration to expose app
ingress:
enabled: false
Expand Down

0 comments on commit 9adeeaf

Please sign in to comment.