Commit ba5208b

Unify alertmanager naming
Closes #451.
jchristgit authored and jb3 committed Aug 25, 2024
1 parent 134a90f commit ba5208b
Showing 14 changed files with 31 additions and 31 deletions.
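Every rename below follows the same scheme: CamelCase alert names become lowercase component/kebab-case names, as in the first hunk:

# before
- alert: AlertManagerClusterFailedPeers
# after
- alert: alert-manager/cluster-failed-peers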
@@ -2,7 +2,7 @@ groups:
- name: alertmanager
rules:

- alert: AlertManagerClusterFailedPeers  # before
- alert: alert-manager/cluster-failed-peers  # after
expr: alertmanager_cluster_failed_peers > 0
for: 1m
labels:
@@ -11,7 +11,7 @@ groups:
summary: "An Alertmanager node is reporting failed peers"
description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."

- alert: AlertManagerHealthScore  # before
- alert: alert-manager/health-score  # after
expr: alertmanager_cluster_health_score > 0
for: 1m
labels:
@@ -3,7 +3,7 @@ groups:
interval: 1d
rules:

- alert: CertificateExpiringSoon  # before
- alert: cert-manager/certificate-expiring-soon  # after
expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
for: 0m
labels:
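The expression converts the certificate's remaining lifetime from seconds to days before comparing it with the 7-day threshold; a worked example, assuming a certificate that expires 3 days from now:

# certmanager_certificate_expiration_timestamp_seconds - time() = 259200 seconds
# 259200 / 60 / 60 / 24 = 3  →  3 < 7, so cert-manager/certificate-expiring-soon fires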
4 changes: 2 additions & 2 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
@@ -2,7 +2,7 @@ groups:
- name: coredns
rules:

- alert: CoreDNSPanics  # before
- alert: core-dns/panics  # after
expr: increase(coredns_panics_total[1m]) > 0
for: 0m
labels:
@@ -11,7 +11,7 @@ groups:
summary: "CoreDNS is experiencing panic"
description: "Number of CoreDNS panics encountered: {{ $value }}"

- alert: CoreDNSCacheMisses  # before
- alert: core-dns/cache-misses  # after
expr: rate(coredns_cache_misses_total{}[10m]) / rate(coredns_cache_misses_total{}[10m] offset 10m) > 5.00
labels:
severity: page
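The cache-miss rule compares the current 10-minute miss rate with the same rate ten minutes earlier (offset 10m), so it pages on a more-than-5x jump rather than on any absolute level; a worked example with assumed rates:

# rate now = 12 misses/s, rate ten minutes ago = 2 misses/s
# 12 / 2 = 6  →  6 > 5.00, so core-dns/cache-misses fires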
4 changes: 2 additions & 2 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
@@ -2,7 +2,7 @@ groups:
- name: cpu
rules:

- alert: HighCPUThrottling  # before
- alert: containers/high-cpu-throttling  # after
expr: rate(container_cpu_cfs_throttled_seconds_total{pod=~".+", container_name!="POD", image!=""}[5m]) > 1
for: 5m
labels:
@@ -11,7 +11,7 @@ groups:
summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} high throttling "
description: "{{ $labels.container_name }} inside {{ $labels.pod }} is at {{ $value }}"

- alert: HighNodeCPU  # before
- alert: kubernetes/high-node-cpu  # after
expr: 100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
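The node rule derives utilisation by subtracting the node's averaged idle fraction from 100; for example, assuming 15% average idle over the window:

# 100 - 15 = 85  →  85 > 80, so kubernetes/high-node-cpu fires after the 5m hold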
6 changes: 3 additions & 3 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
@@ -1,7 +1,7 @@
groups:
- name: django
rules:
- alert: DjangoErrors  # before
- alert: django/errors  # after
expr: increase(django_http_responses_total_by_status_total{status=~"5.."}[5m]) > 0
for: 5m
labels:
@@ -10,7 +10,7 @@ groups:
summary: "Django is experiencing 5xx errors"
description: "Django is experiencing 5xx errors on {{ $labels.namespace }}/{{ $labels.job }}"

- alert: DjangoLatencyElevated  # before
- alert: django/latency-elevated  # after
expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts", view!="api:github-webhook-filter", view!="home:home", view!="content:tag"}[5m])) > 1.0
for: 15m
labels:
@@ -19,7 +19,7 @@ groups:
summary: "Django route is experiencing high latency"
description: "Django route {{ $labels.method }} {{ $labels.view }} has raised latency"

- alert: DjangoLatencyHigh  # before
- alert: django/latency-high  # after
expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts", view!="api:github-webhook-filter", view!="home:home", view!="content:tag"}[5m])) > 10.0
for: 15m
labels:
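Both latency rules estimate the 95th percentile from the same histogram buckets and differ only in threshold (1 s for elevated, 10 s for high); a sketch of the underlying query for a single view, where the view label value is a hypothetical example:

histogram_quantile(
  0.95,
  rate(django_http_requests_latency_seconds_by_view_method_bucket{view="home:faq"}[5m])
)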
2 changes: 1 addition & 1 deletion kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
@@ -1,7 +1,7 @@
groups:
- name: etcd
rules:
- alert: EtcdErrorsSpike  # before
- alert: etcd/error-spike  # after
expr: rate(etcd_request_error_total[5m]) > 0
for: 5m
labels:
4 changes: 2 additions & 2 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
@@ -1,7 +1,7 @@
groups:
- name: jobs
rules:
- alert: KubernetesCronjobSuspended  # before
- alert: kubernetes/cronjob-suspended  # after
expr: kube_cronjob_spec_suspend != 0
for: 0m
labels:
@@ -10,7 +10,7 @@ groups:
summary: "Kubernetes CronJob suspended: {{ $labels.cronjob }}"
description: "CronJob {{ $labels.kubernetes_namespace }}/{{ $labels.cronjob }} is suspended"

- alert: KubernetesJobFailed  # before
- alert: kubernetes/jobs-failed  # after
expr: kube_job_status_failed > 0
for: 0m
labels:
4 changes: 2 additions & 2 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
@@ -2,7 +2,7 @@ groups:
- name: memory
rules:

- alert: NodeHighMemoryUsage  # before
- alert: node/high-memory-usage  # after
expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8
for: 30s
labels:
@@ -11,7 +11,7 @@ groups:
summary: "Node {{ $labels.kubernetes_node }} has RAM usage >80% for 5 minutes"
description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}'

- alert: ContainerOOMEvent  # before
- alert: container/oom  # after
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
for: 0m
labels:
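The OOM rule joins two conditions; an annotated paraphrase of the expression above:

# left side:  the restart counter rose by at least 1 over the last 10 minutes
# right side: the container's last termination reason in that window was OOMKilled
# "and ignoring (reason)" matches the two sides on their remaining shared labels,
# so only containers that restarted because of an OOM kill fire the alert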
6 changes: 3 additions & 3 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
@@ -2,7 +2,7 @@ groups:
- name: nginx
rules:

- alert: NGINX4XXRequests  # before
- alert: nginx/4xx-requests  # after
expr: sum by (service) (rate(nginx_ingress_controller_requests{service!="pixels",status!~"404|444",status=~"^4.."}[1m])) / sum by (service) (rate(nginx_ingress_controller_requests[1m])) > 0.5
for: 1m
labels:
@@ -11,7 +11,7 @@ groups:
summary: "High rate of 4XX requests for inbound requests"
description: "Rate of 4XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"

- alert: NGINX5XXRequests  # before
- alert: nginx/5xx-requests  # after
expr: sum(rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])) by (service) / sum(rate(nginx_ingress_controller_requests{}[1m])) by (service) > 0.5
for: 1m
labels:
@@ -20,7 +20,7 @@ groups:
summary: "High rate of 5XX requests for inbound requests"
description: "Rate of 5XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"

- alert: NGINXP99Timing  # before
- alert: nginx/p99-timing  # after
expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prestashop-svc)"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10
for: 5m
labels:
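The "and on(service)" clause gates the p99 alert on traffic volume, so sparse services cannot page off a handful of slow requests; a worked example with assumed numbers:

# p99 = 4.2s, but only 3 requests in 5m  →  the traffic guard fails, no alert
# p99 = 4.2s with 250 requests in 5m     →  both sides match, nginx/p99-timing fires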
12 changes: 6 additions & 6 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
@@ -2,7 +2,7 @@ groups:
- name: nodes
rules:

- alert: KubernetesNodeDiskPressure  # before
- alert: kubernetes/node-disk-pressure  # after
expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 1m
labels:
@@ -11,7 +11,7 @@ groups:
summary: Node {{ $labels.kubernetes_node }} is experiencing disk pressure
description: "{{ $labels.kubernetes_node }} does not have adequate space to work with."

- alert: KubernetesNodeMemoryPressure  # before
- alert: kubernetes/node-memory-pressure  # after
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 15s
labels:
@@ -20,7 +20,7 @@ groups:
summary: Node {{ $labels.kubernetes_node }} is experiencing memory pressure
description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with."

- alert: KubernetesNodeNetworkUnavailable  # before
- alert: kubernetes/node-network-unavailable  # after
expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
for: 15s
labels:
@@ -30,7 +30,7 @@
description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections"


- alert: KubernetesNodePIDPressure  # before
- alert: kubernetes/node-pid-pressure  # after
expr: kube_node_status_condition{condition="PIDPressure",status="true"} == 1
for: 15s
labels:
@@ -39,7 +39,7 @@ groups:
summary: Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion
description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with."

- alert: KubernetesNodeReady  # before
- alert: kubernetes/node-not-ready  # after
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1m
labels:
@@ -48,7 +48,7 @@ groups:
summary: Kubernetes node ({{ $labels.kubernetes_node }}) is marked as unready
description: "Node {{ $labels.kubernetes_node }} has been unready for more than a minute"

- alert: KubernetesNodeCordoned  # before
- alert: kubernetes/node-cordoned  # after
expr: kube_node_spec_unschedulable == 1
for: 30m
labels:
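All six rules read kube_node_status_condition with a different condition label; an ad-hoc query (not part of the rule set) that surfaces every abnormal condition at once could look like:

kube_node_status_condition{condition!="Ready",status="true"} == 1
  or kube_node_status_condition{condition="Ready",status="true"} == 0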
4 changes: 2 additions & 2 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
@@ -1,7 +1,7 @@
groups:
- name: pods
rules:
- alert: KubernetesPodNotHealthy  # before
- alert: kubernetes/pod-not-healthy  # after
expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[3m:1m]) > 0
for: 3m
labels:
@@ -10,7 +10,7 @@ groups:
summary: "Kubernetes Pod not healthy: {{ $labels.namespace }}/{{ $labels.pod }}"
description: "Pod has been in a non-ready state for longer than 3 minutes."

- alert: KubernetesPodCrashLooping  # before
- alert: kubernetes/pod-crash-looping  # after
expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
for: 1m
labels:
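The [3m:1m] subquery samples the phase check once a minute over three minutes, and min_over_time keeps only pods that failed the check at every sample:

# min_over_time(...[3m:1m]) > 0  →  the pod was Pending, Unknown, or Failed at
# each of the last three 1-minute samples, not just at a single scrape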
@@ -3,7 +3,7 @@ groups:
rules:

# Alert for any instance that is unreachable for >5 minutes.
- alert: InstanceDown  # before
- alert: prometheus/instance-down  # after
expr: up == 0
for: 5m
labels:
@@ -12,7 +12,7 @@ groups:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

- alert: PrometheusConfigFailed  # before
- alert: prometheus/config-failed  # after
expr: prometheus_config_last_reload_successful == 0
for: 0m
labels:
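prometheus_config_last_reload_successful flips to 0 when Prometheus rejects a reloaded configuration; the previously loaded config stays active in that state, so this page signals a stuck rollout rather than an outage:

# prometheus_config_last_reload_successful == 1  →  last reload applied cleanly
# prometheus_config_last_reload_successful == 0  →  new config rejected, old one still serving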
4 changes: 2 additions & 2 deletions kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
@@ -1,7 +1,7 @@
groups:
- name: redis
rules:
- alert: RedisDown  # before
- alert: redis/down  # after
expr: redis_up == 0
for: 1m
labels:
@@ -10,7 +10,7 @@ groups:
summary: "Redis is offline"
description: "Redis Exporter cannot connect to Redis."

- alert: RedisOutOfMemory  # before
- alert: redis/oom  # after
expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
for: 0m
labels:
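The memory rule is a plain used-to-maxmemory ratio; a worked example, assuming 950 MiB used against a 1024 MiB maxmemory:

# 950 / 1024 ≈ 0.93  →  0.93 > 0.9, so redis/oom fires immediately (for: 0m)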
@@ -1,7 +1,7 @@
groups:
- name: volumes
rules:
- alert: KubernetesVolumeOutOfDiskSpace  # before
- alert: kubernetes/volume-out-of-space  # after
expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim!="prometheus-storage"} / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 2m
labels:
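The volume rule expresses free space as a percentage of capacity, skipping the prometheus-storage claim; a worked example, assuming 5 GiB available on a 100 GiB PersistentVolumeClaim:

# 5 / 100 * 100 = 5  →  5 < 10, so kubernetes/volume-out-of-space fires after 2m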
