From e3f38a86ad0a20972bca642b87aa0ff48402774d Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Wed, 20 Nov 2024 18:40:31 -0300 Subject: [PATCH] Change duplicated rules names (#504) ## Issue Some rules have duplicated names, which can be a little bit confusing. ## Solution This PR gives the duplicated rules distinct names that reflect each alert rule's expression, and adds blank lines between alerts to increase readability. --- .../prometheus/prometheus_alerts.yaml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index 6e651108a..7da5e0449 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -1,7 +1,7 @@ "groups": - "name": "opensearch.alerts" "rules": - - "alert": "OpenSearchClusterNotHealthy" + - "alert": "OpenSearchClusterRed" "annotations": "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." "summary": "Cluster health status is RED" @@ -10,15 +10,17 @@ "for": "2m" "labels": "severity": "critical" - - "alert": "OpenSearchClusterNotHealthy" + + - "alert": "OpenSearchClusterYellow" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m." + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some cluster replicas shards are not allocated." "summary": "Cluster health status is YELLOW" "expr": | sum by (cluster) (opensearch_cluster_status == 1) "for": "20m" "labels": "severity": "warning" + - "alert": "OpenSearchBulkRequestsRejectionJumps" "annotations": "message": "High Bulk Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." 
@@ -28,7 +30,8 @@ "for": "10m" "labels": "severity": "warning" - - "alert": "OpenSearchNodeDiskWatermarkReached" + + - "alert": "OpenSearchNodeDiskLowWatermarkReached" "annotations": "message": "Disk Low Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" @@ -44,7 +47,8 @@ "for": "5m" "labels": "severity": "alert" - - "alert": "OpenSearchNodeDiskWatermarkReached" + + - "alert": "OpenSearchNodeDiskHighWatermarkReached" "annotations": "message": "Disk High Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" @@ -60,6 +64,7 @@ "for": "5m" "labels": "severity": "high" + - "alert": "OpenSearchJVMHeapUseHigh" "annotations": "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." @@ -69,6 +74,7 @@ "for": "10m" "labels": "severity": "alert" + - "alert": "OpenSearchHostSystemCPUHigh" "annotations": "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%" @@ -78,6 +84,7 @@ "for": "1m" "labels": "severity": "alert" + - "alert": "OpenSearchProcessCPUHigh" "annotations": "message": "OSE process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"