From d623aae43e6b71f0ccbcce502b1ee0feb5ed4780 Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Thu, 12 Dec 2024 04:57:37 +0000 Subject: [PATCH 1/5] Boilerplate for the SLI dashboard --- infra/terraform/monitoring.tf | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 infra/terraform/monitoring.tf diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf new file mode 100644 index 0000000000..753ea99bf6 --- /dev/null +++ b/infra/terraform/monitoring.tf @@ -0,0 +1,22 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { + dashboard_id = "clusterfuzz_sli_dashboard" + display_name = "ClusterFuzz General Health" + dashboard_json = < Date: Fri, 13 Dec 2024 02:54:46 +0000 Subject: [PATCH 2/5] Adding the dashboard contents --- infra/terraform/monitoring.tf | 1605 ++++++++++++++++++++++++++++++++- 1 file changed, 1603 insertions(+), 2 deletions(-) diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf index 753ea99bf6..4edc6f437c 100644 --- a/infra/terraform/monitoring.tf +++ b/infra/terraform/monitoring.tf @@ -16,7 +16,1608 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { dashboard_id = "clusterfuzz_sli_dashboard" display_name = "ClusterFuzz General Health" dashboard_json = < Date: Fri, 13 Dec 2024 03:04:03 +0000 Subject: [PATCH 3/5] Update dashboard with TASK_OUTCOME_COUNT widgets --- infra/terraform/monitoring.tf | 73 +++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 8 deletions(-) diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf index 4edc6f437c..ba7c90e11d 100644 --- a/infra/terraform/monitoring.tf +++ b/infra/terraform/monitoring.tf @@ -17,7 +17,7 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { display_name = "ClusterFuzz General Health" dashboard_json = < Date: Mon, 16 Dec 2024 16:53:13 +0000 Subject: [PATCH 4/5] Replace task count metric from histogram to counter, sort untriaged testcase count by status --- infra/terraform/monitoring.tf | 166 ++++++++++++++++++++++------------ 1 file changed, 107 insertions(+), 59 deletions(-) diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf index ba7c90e11d..e3192888cc 100644 --- a/infra/terraform/monitoring.tf +++ b/infra/terraform/monitoring.tf @@ -31,7 +31,7 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "dataSets": [ { "timeSeriesQuery": { - "prometheusQuery": "sum by (task) (rate(custom_googleapis_com:utask_subtask_duration_secs_count{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[1h]))", + "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))\n", "unitOverride": "", "outputFullDuration": false }, @@ -67,7 +67,7 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "dataSets": [ { "timeSeriesQuery": { - "prometheusQuery": "sum by (task) (rate(custom_googleapis_com:utask_subtask_duration_secs_count{monitored_resource=\"gce_instance\",subtask=\"preprocess\"}[1h]))", + "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"preprocess\"}[${__interval}]))\n", "unitOverride": "", "outputFullDuration": false }, @@ -104,7 +104,7 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "dataSets": [ { "timeSeriesQuery": { - "prometheusQuery": "sum by (task) (rate(custom_googleapis_com:utask_subtask_duration_secs_count{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[1h]))", + "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"postprocess\"}[${__interval}]))\n", "unitOverride": "", "outputFullDuration": false }, @@ -1468,27 +1468,35 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "width": 16, "height": 16, "widget": { - "title": "preprocess unhandled exception rate (by task)", "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, "dataSets": [ { - "plotType": "LINE", - "targetAxis": "Y1", "timeSeriesQuery": { "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"preprocess\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"preprocess\"}[${__interval}]))", - "unitOverride": "%" - } + "unitOverride": "%", + "outputFullDuration": false + }, + "plotType": "LINE", + "legendTemplate": "", + "targetAxis": "Y1", + "dimensions": [], + "measures": [], + "breakdowns": [] } ], "thresholds": [], "yAxis": { "label": "", "scale": "LINEAR" + }, + "chartOptions": { + "mode": "COLOR", + "showLegend": false, + "displayHorizontal": false } - } + }, + "title": "preprocess unhandled exception rate (by task)", + "id": "" } }, { @@ -1497,27 +1505,35 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "width": 16, "height": 16, "widget": { - "title": "uworker_main unhandled exception rate (by task)", "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, "dataSets": [ { - "plotType": "LINE", - "targetAxis": "Y1", "timeSeriesQuery": { "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))", - "unitOverride": "%" - } + "unitOverride": "%", + "outputFullDuration": false + }, + "plotType": "LINE", + "legendTemplate": "", + "targetAxis": "Y1", + "dimensions": [], + "measures": [], + "breakdowns": [] } ], "thresholds": [], "yAxis": { "label": "", "scale": "LINEAR" + }, + "chartOptions": { + "mode": "COLOR", + "showLegend": false, + "displayHorizontal": false } - } + }, + "title": "uworker_main unhandled exception rate (by task)", + "id": "" } }, { @@ -1526,27 +1542,35 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "width": 16, "height": 16, "widget": { - "title": "postprocess unhandled exception rate (by task)", "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, "dataSets": [ { - "plotType": "LINE", - "targetAxis": "Y1", "timeSeriesQuery": { "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\", subtask=\"postprocess\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\", subtask=\"postprocess\"}[${__interval}]))", - "unitOverride": "%" - } + "unitOverride": "%", + "outputFullDuration": false + }, + "plotType": "LINE", + "legendTemplate": "", + "targetAxis": "Y1", + "dimensions": [], + "measures": [], + "breakdowns": [] } ], "thresholds": [], "yAxis": { "label": "", "scale": "LINEAR" + }, + "chartOptions": { + "mode": "COLOR", + "showLegend": false, + "displayHorizontal": false } - } + }, + "title": "postprocess unhandled exception rate (by task)", + "id": "" } }, { @@ -1558,7 +1582,7 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "dataSets": [ { "timeSeriesQuery": { - "prometheusQuery": "last_over_time(custom_googleapis_com:issues_untriaged_testcase_count{monitored_resource=\"gce_instance\"}[2h])\n", + "prometheusQuery": "sum by (status)(last_over_time((custom_googleapis_com:issues_untriaged_testcase_count{monitored_resource=\"gce_instance\"}[2h])))\n", "unitOverride": "", "outputFullDuration": false }, @@ -1581,7 +1605,7 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "displayHorizontal": false } }, - "title": "Untriaged testcase count", + "title": "Untriaged testcase count (by status)", "id": "" } }, @@ -1591,27 +1615,35 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "width": 16, "height": 16, "widget": { - "title": "uworker_main overall failure rate (by task) - CAN BE DRILLED BY JOB", "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, "dataSets": [ { - "plotType": "LINE", - "targetAxis": "Y1", "timeSeriesQuery": { "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", outcome=\"error\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))", - "unitOverride": "%" - } + "unitOverride": "%", + "outputFullDuration": false + }, + "plotType": "LINE", + "legendTemplate": "", + "targetAxis": "Y1", + "dimensions": [], + "measures": [], + "breakdowns": [] } ], "thresholds": [], "yAxis": { "label": "", "scale": "LINEAR" + }, + "chartOptions": { + "mode": "COLOR", + "showLegend": false, + "displayHorizontal": false } - } + }, + "title": "uworker_main overall failure rate (by task) - CAN BE DRILLED BY JOB", + "id": "" } }, { @@ -1620,27 +1652,35 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "width": 16, "height": 16, "widget": { - "title": "uworker_main overall retry rate (by task) - CAN BE DRILLED BY JOB", "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, "dataSets": [ { - "plotType": "LINE", - "targetAxis": "Y1", "timeSeriesQuery": { "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", outcome=\"maybe_retry\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))", - "unitOverride": "%" - } + "unitOverride": "%", + "outputFullDuration": false + }, + "plotType": "LINE", + "legendTemplate": "", + "targetAxis": "Y1", + "dimensions": [], + "measures": [], + "breakdowns": [] } ], "thresholds": [], "yAxis": { "label": "", "scale": "LINEAR" + }, + "chartOptions": { + "mode": "COLOR", + "showLegend": false, + "displayHorizontal": false } - } + }, + "title": "uworker_main overall retry rate (by task) - CAN BE DRILLED BY JOB", + "id": "" } }, { @@ -1648,27 +1688,35 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { "width": 16, "height": 16, "widget": { - "title": "uworker_main overall success rate (by task) - CAN BE DRILLED BY JOB", "xyChart": { - "chartOptions": { - "mode": "COLOR" - }, "dataSets": [ { - "plotType": "LINE", - "targetAxis": "Y1", "timeSeriesQuery": { "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", outcome=\"success\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))", - "unitOverride": "%" - } + "unitOverride": "%", + "outputFullDuration": false + }, + "plotType": "LINE", + "legendTemplate": "", + "targetAxis": "Y1", + "dimensions": [], + "measures": [], + "breakdowns": [] } ], "thresholds": [], "yAxis": { "label": "", "scale": "LINEAR" + }, + "chartOptions": { + "mode": "COLOR", + "showLegend": false, + "displayHorizontal": false } - } + }, + "title": "uworker_main overall success rate (by task) - CAN BE DRILLED BY JOB", + "id": "" } } ] From afdc43d6cbdf358ac2d401872c0dda79158906b8 Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Thu, 26 Dec 2024 18:31:03 +0000 Subject: [PATCH 5/5] Address nits, update metric names and labels --- infra/terraform/monitoring.tf | 316 ++++++++++++---------------------- 1 file changed, 106 insertions(+), 210 deletions(-) diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf index e3192888cc..a725e4ea07 100644 --- a/infra/terraform/monitoring.tf +++ b/infra/terraform/monitoring.tf @@ -17,12 +17,11 @@ resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { display_name = "ClusterFuzz General Health" dashboard_json = <