From a1e39700445565e3a689f7cd7df8d809cfc55144 Mon Sep 17 00:00:00 2001
From: Gabriel Cocenza
Date: Thu, 14 Nov 2024 19:34:45 -0300
Subject: [PATCH] Add CI and unit tests for prometheus rules using promtool

- It's good to have unit tests for the prometheus alerts and also check if
  the prometheus_alerts.yaml is a valid file.
- Using promtool it is possible to check the rules and also run unit tests
  for the alerts.
---
 .github/workflows/test_prometheus_rules.yaml  |  36 +++++
 .../test_opensearch_rules.yaml                | 142 ++++++++++++++++++
 2 files changed, 178 insertions(+)
 create mode 100644 .github/workflows/test_prometheus_rules.yaml
 create mode 100644 tests/unit/test_alert_rules/test_opensearch_rules.yaml

diff --git a/.github/workflows/test_prometheus_rules.yaml b/.github/workflows/test_prometheus_rules.yaml
new file mode 100644
index 000000000..9b231520b
--- /dev/null
+++ b/.github/workflows/test_prometheus_rules.yaml
@@ -0,0 +1,36 @@
+# Validate the Prometheus alert rules and run their promtool unit tests.
+name: Test prometheus rules
+
+on:
+  workflow_call:
+  workflow_dispatch:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches: [main]
+    paths-ignore:
+      - "**.md"
+      - "**.rst"
+
+# Cancel an in-progress run when a newer one starts for the same PR or ref.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  promtool:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+
+      # prometheus snap includes promtool
+      - name: Install prometheus snap
+        run: sudo snap install prometheus
+
+      - name: Check validity of prometheus alert rules
+        run: |
+          promtool check rules src/alert_rules/prometheus/*.yaml
+
+      - name: Run unit tests for prometheus alert rules
+        run: |
+          promtool test rules tests/unit/test_alert_rules/*.yaml
diff --git a/tests/unit/test_alert_rules/test_opensearch_rules.yaml b/tests/unit/test_alert_rules/test_opensearch_rules.yaml
new file mode 100644
index 000000000..3d26e839e
--- /dev/null
+++ b/tests/unit/test_alert_rules/test_opensearch_rules.yaml
@@ -0,0 +1,142 @@
+# promtool unit tests for the OpenSearch alert rules.
+# Run with: promtool test rules tests/unit/test_alert_rules/*.yaml
+rule_files:
+  - ../../../src/alert_rules/prometheus/prometheus_alerts.yaml
+
+evaluation_interval: 1m
+
+tests:
+  # Cluster status 2, evaluated at 2m: expects the critical (RED) alert.
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
+        values: '2x3'
+    alert_rule_test:
+      - eval_time: 2m
+        alertname: OpenSearchClusterNotHealthy
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
+              summary: "Cluster health status is RED"
+
+  # Cluster status 1, evaluated at 20m: expects the warning (YELLOW) alert.
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_cluster_status{cluster="opensearch-x7zb"}'
+        values: '1x21'
+    alert_rule_test:
+      - eval_time: 20m
+        alertname: OpenSearchClusterNotHealthy
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              cluster: opensearch-x7zb
+            exp_annotations:
+              message: "Cluster opensearch-x7zb health status has been YELLOW for at least 20m."
+              summary: "Cluster health status is YELLOW"
+
+  # About 14% of the disk free (86% used): expects only the low watermark alert.
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '69802552852x10'  # just 70 GB available
+      - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '498589663232x10'  # HD with 500 GB
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchNodeDiskWatermarkReached
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "Disk Low Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
+              summary: "Disk Low Watermark Reached - disk saturation is 86%"
+
+  # About 9% of the disk free (91% used): expects low and high watermark alerts.
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_fs_path_available_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '44873069690x10'  # just 45 GB available
+      - series: 'opensearch_fs_path_total_bytes{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '498589663232x10'  # HD with 500 GB
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: OpenSearchNodeDiskWatermarkReached
+        # both low and high water mark alerts are triggered
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "Disk Low Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
+              summary: "Disk Low Watermark Reached - disk saturation is 91%"
+          - exp_labels:
+              severity: high
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "Disk High Watermark Reached at opensearch-0.fa9 node in opensearch-x7zb cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
+              summary: "Disk High Watermark Reached - disk saturation is 91%"
+
+  # JVM heap usage at 76%, evaluated at 10m: expects the heap usage alert.
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_jvm_mem_heap_used_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '76x60'
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: OpenSearchJVMHeapUseHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "JVM Heap usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 76%."
+              summary: "JVM Heap usage on the node is high"
+
+  # Host CPU at 91%, evaluated at 1m: expects the system CPU alert.
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_os_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '91x6'
+    alert_rule_test:
+      - eval_time: 1m
+        alertname: OpenSearchHostSystemCPUHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "System CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%"
+              summary: "System CPU usage is high"
+
+  # OpenSearch process CPU at 91%, evaluated at 1m: expects the process CPU alert.
+  - interval: 1m
+    input_series:
+      - series: 'opensearch_process_cpu_percent{cluster="opensearch-x7zb", instance="10.1.156.70:9200", node="opensearch-0.fa9"}'
+        values: '91x6'
+    alert_rule_test:
+      - eval_time: 1m
+        alertname: OpenSearchProcessCPUHigh
+        exp_alerts:
+          - exp_labels:
+              severity: alert
+              cluster: opensearch-x7zb
+              instance: 10.1.156.70:9200
+              node: opensearch-0.fa9
+            exp_annotations:
+              message: "OSE process CPU usage on the node opensearch-0.fa9 in opensearch-x7zb cluster is 91%"
+              summary: "OSE process CPU usage is high"