diff --git a/serverlessworkflow/modules/ROOT/assets/images/cloud/operator/monitoring/grafana-dashboard-example.png b/serverlessworkflow/modules/ROOT/assets/images/cloud/operator/monitoring/grafana-dashboard-example.png new file mode 100644 index 000000000..ea7ac4c8a Binary files /dev/null and b/serverlessworkflow/modules/ROOT/assets/images/cloud/operator/monitoring/grafana-dashboard-example.png differ diff --git a/serverlessworkflow/modules/ROOT/assets/images/cloud/operator/monitoring/grafana-data-source-test.png b/serverlessworkflow/modules/ROOT/assets/images/cloud/operator/monitoring/grafana-data-source-test.png new file mode 100644 index 000000000..3acc2d854 Binary files /dev/null and b/serverlessworkflow/modules/ROOT/assets/images/cloud/operator/monitoring/grafana-data-source-test.png differ diff --git a/serverlessworkflow/modules/ROOT/nav.adoc b/serverlessworkflow/modules/ROOT/nav.adoc index 68e118c32..48d8510d0 100644 --- a/serverlessworkflow/modules/ROOT/nav.adoc +++ b/serverlessworkflow/modules/ROOT/nav.adoc @@ -83,6 +83,7 @@ *** xref:cloud/operator/service-discovery.adoc[Service Discovery] *** xref:cloud/operator/using-persistence.adoc[Workflow Persistence] *** xref:cloud/operator/configuring-workflow-eventing-system.adoc[Workflow Eventing System] +*** xref:cloud/operator/monitoring-workflows.adoc[Workflow Monitoring] // *** xref:cloud/operator/configuring-knative-eventing-resources.adoc[Knative Eventing] *** xref:cloud/operator/known-issues.adoc[Roadmap and Known Issues] *** xref:cloud/operator/add-custom-ca-to-a-workflow-pod.adoc[Add Custom CA to Workflow Pod] diff --git a/serverlessworkflow/modules/ROOT/pages/cloud/index.adoc b/serverlessworkflow/modules/ROOT/pages/cloud/index.adoc index e8cf3bbd8..f1752ecc9 100644 --- a/serverlessworkflow/modules/ROOT/pages/cloud/index.adoc +++ b/serverlessworkflow/modules/ROOT/pages/cloud/index.adoc @@ -128,6 +128,14 @@ xref:cloud/operator/using-persistence.adoc[] Learn how to define the workflow `Persistence` field to allow the workflow to store its context -- +[.card] +-- +[.card-title] +xref:cloud/operator/monitoring-workflows.adoc[] +[.card-description] +Learn how to configure Prometheus, Grafana and Grafana Dashboard for monitoring of workflow instances +-- + [.card] -- [.card-title] diff --git a/serverlessworkflow/modules/ROOT/pages/cloud/operator/grafana-dashbord-example.adoc b/serverlessworkflow/modules/ROOT/pages/cloud/operator/grafana-dashbord-example.adoc new file mode 100644 index 000000000..8b6fb2be8 --- /dev/null +++ b/serverlessworkflow/modules/ROOT/pages/cloud/operator/grafana-dashbord-example.adoc @@ -0,0 +1,1502 @@ +[source, json] +---- +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.4.3" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 3, + "panels": [], + "title": "Summary All", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 4, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_started_total{service=~\"$workflow\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Total", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 5, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_completed_total{service=~\"$workflow\",process_state=\"Completed\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Completed", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 6, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_running_total{service=~\"$workflow\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Running", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 7, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_completed_total{service=~\"$workflow\",process_state=\"Aborted\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Aborted", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 10, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_error{service=~\"$workflow\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Error", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 15, + "x": 0, + "y": 5 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kogito_process_instance_duration_seconds_sum)/sum(kogito_process_instance_duration_seconds_count)", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Average Duration (s)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 2, + "panels": [], + "repeat": "workflow", + "repeatDirection": "h", + "title": "Summary: $workflow", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 11 + }, + "id": 1, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(service) (kogito_process_instance_started_total{service=~\"$workflow\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Total", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 11 + }, + "id": 12, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_completed_total{service=~\"$workflow\",process_state=\"Completed\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Completed", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 11 + }, + "id": 16, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_running_total{service=~\"$workflow\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Running", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 11 + }, + "id": 17, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_completed_total{service=~\"$workflow\",process_state=\"Aborted\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Aborted", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 11 + }, + "id": 19, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum (kogito_process_instance_error{service=~\"$workflow\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Error", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 47, + "panels": [], + "repeat": "workflow", + "repeatDirection": "h", + "title": "Average Duration: $workflow", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 15, + "x": 0, + "y": 15 + }, + "id": 30, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(kogito_process_instance_duration_seconds_sum{service=~\"$workflow\"})/sum(kogito_process_instance_duration_seconds_count{service=~\"$workflow\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 86, + "panels": [], + "repeat": "workflow", + "repeatDirection": "h", + "title": "Functions and States Average Duration (ms): $workflow", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1000 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 15, + "x": 0, + "y": 20 + }, + "id": 38, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "text" + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (node_name) (kogito_node_instance_duration_milliseconds_sum{service=~\"$workflow\"})/sum by (node_name) (kogito_node_instance_duration_milliseconds_count{service=~\"$workflow\"})", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 57, + "panels": [], + "repeat": "workflow", + "repeatDirection": "h", + "title": "Input Parameters: $workflow", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.width", + "value": 309 + }, + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "app_id" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "artifactId" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "container" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "endpoint" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "namespace" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "process_id" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "prometheus" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "service" + }, + "properties": [ + { + "id": "displayName", + "value": "workflow" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "version" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 4, + "w": 15, + "x": 0, + "y": 25 + }, + "id": 67, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "workflow" + } + ] + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sonataflow_input_parameters_counter_total{service=~\"$workflow\"}", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "type": "table" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "allValue": "", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(kogito_process_instance_started_total,service)", + "description": "workflow", + "hide": 0, + "includeAll": true, + "label": "Workflows", + "multi": true, + "name": "workflow", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kogito_process_instance_started_total,service)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "093c6eb7-1dcd-4b66-afa2-68691888f1d8" + }, + "filters": [ + { + "key": "service", + "operator": "=", + "value": "greeting" + } + ], + "hide": 0, + "name": "Filters", + "skipUrlSync": false, + "type": "adhoc" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Workflows", + "uid": "ae4jtpwnm76rka", + "version": 3, + "weekStart": "" +} +---- \ No newline at end of file diff --git a/serverlessworkflow/modules/ROOT/pages/cloud/operator/monitoring-workflows.adoc b/serverlessworkflow/modules/ROOT/pages/cloud/operator/monitoring-workflows.adoc new file mode 100644 index 000000000..246e31078 --- /dev/null +++ b/serverlessworkflow/modules/ROOT/pages/cloud/operator/monitoring-workflows.adoc @@ -0,0 +1,337 @@ += Monitoring Workflows +:compat-mode!: +// Metadata: +:description: Workflows monitoring configuration configuration +:keywords: kogito, sonataflow, workflow, operator, kubernetes, prometheus, grafana + +// External pages +:openshift_enable_monitoring_user_defined_projects_url: https://docs.openshift.com/container-platform/4.17/observability/monitoring/enabling-monitoring-for-user-defined-projects.html + +:dev_services_url: https://quarkus.io/guides/dev-services +:test_containers_url: https://www.testcontainers.org/ + +This document describes how to deploy and configure Prometheus and Grafana components for monitoring of {product_name} workflows. + +[NOTE] +==== +Currently, only those {product_name} workflows deployed as Kubernetes deployments have workflow related metrics exposed to Prometheus and are hence available for monitoring by Grafana Dashboards. Monitoring of {product_name} workflows deployed as Knative services is not supported and such serverless workflows are not included in the Grafana Dashboards. +==== + +[#deploy-monitoring] +== Deploy Prometheus and Grafana +=== Deploy Prometheus and Grafana on OpenShift Container Platform +==== Deploy Prometheus +OpenShift Container Platform includes a preconfigured, preinstalled, and self-updating monitoring stack that provides monitoring for core platform components. As such the Prometheus Operator is already installed on the cluster. To monitor SonataFlow workflows, you shall enable monitoring for user-defined projects. This is achieved by updating `cluster-monitoring-config` ConfigMap in the openshift-monitoring namespace. Create a new one if the ConfigMap does not exist. +[source, yaml] +---- +cat << EOF | oc apply -f - +apiVersion: v1 +kind: ConfigMap +metadata: + name: cluster-monitoring-config + namespace: openshift-monitoring +data: + config.yaml: | + enableUserWorkload: true +EOF +---- +A new Prometheus server pod will be started and running in the namespace `openshift-user-workload-monitoring`. + +==== Deploy Grafana +===== Deploy Grafana Operator +Create a namespace for the Grafana Operator to be installed in +[source,shell,subs="attributes+"] +---- +oc new-project grafana-operator +---- +Deploy the Grafana Operator using command line. You can also deploy the operator through OperatorHub. +[source,shell,subs="attributes+"] +---- +cat << EOF | oc create -f - +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + generateName: grafana-operator- + namespace: grafana-operator +spec: + targetNamespaces: + - grafana-operator +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + generateName: grafana-operator- + namespace: grafana-operator +spec: + channel: v5 + name: grafana-operator + installPlanApproval: Automatic + source: community-operators + sourceNamespace: openshift-marketplace +EOF +---- +Wait for the Operator to be ready +[source,shell,subs="attributes+"] +---- +oc -n grafana-operator rollout status \ + deployment grafana-operator-controller-manager-v5 +---- +===== Deploy Grafana Instance +[source, yaml] +---- +cat << EOF | oc create -f - +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: grafana + labels: + dashboards: "grafana" +spec: + config: + security: + admin_user: root + admin_password: secret +EOF +---- +===== Give the Grafana service account the cluster-monitoring-view role +[source, yaml] +---- +oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana-sa +---- +===== Generate a bearer token for the grafana service account +[source, yaml] +---- +TOKEN=`oc sa new-token grafana-sa` +---- +===== Deploy the Prometheus Data Source +[source, yaml] +---- +cat << EOF | oc create -f - +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: example-grafanadatasource +spec: + datasource: + access: proxy + isDefault: true + type: prometheus + jsonData: + httpHeaderName1: 'Authorization' + timeInterval: 5s + tlsSkipVerify: true + secureJsonData: + httpHeaderValue1: 'Bearer ${TOKEN}' + name: Prometheus + url: https://thanos-querier.openshift-monitoring.svc.cluster.local:9091 + instanceSelector: + matchLabels: + dashboards: grafana +EOF +---- +Wait until the Grafana server is ready. +[source,shell,subs="attributes+"] +---- +oc wait --for=condition=Available=True deployment/grafana-deployment +---- +===== Create a route for Grafana service +[source,shell,subs="attributes+"] +---- +oc expose service grafana-service +---- +===== Get the URL for Grafana +[source,shell,subs="attributes+"] +---- +oc get route grafana-service -o jsonpath='{"http://"}{.spec.host}{"\n"}' +---- +===== Open Grafana Dashboard UI +Open Grafana Dashboard UI in your web browser with the URL found. Log in using with admin user name `root` and passward `secret`. + +=== Deploy Prometheus and Grafana on Kubernetes +==== Deploy Prometheus +===== Deploy Prometheus Operator +[source,shell,subs="attributes+"] +---- +PROMETHEUS_VERSION=v0.70.0 +kubectl create -f https://github.com/prometheus-operator/prometheus-operator/releases/download/${PROMETHEUS_VERSION}/bundle.yaml -n default +---- +Wait until the operator is ready. +[source,shell,subs="attributes+"] +---- +PROMETHEUS_VERSION=v0.70.0 +kubectl wait --for=condition=Available=True deploy/prometheus-operator -n default +---- +===== Deploy Prometheus Instance +[source, yaml] +---- +cat << EOF | kubectl create -n default -f - +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: prometheus +spec: + serviceAccountName: prometheus + serviceMonitorNamespaceSelector: {} + serviceMonitorSelector: {} + podMonitorSelector: {} + resources: + requests: + memory: 400Mi +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: default +EOF +---- +Wait until the Prometheus server is ready. +[source,shell,subs="attributes+"] +---- +kubectl apply -f ./test/testdata/prometheus.yaml -n default +kubectl wait --for=condition=Available=True prometheus/prometheus -n default +---- + +==== Deploy Grafana +===== Deploy Grafana Operator +[source,shell,subs="attributes+"] +---- +GRAFANA_VERSION=v5.13.0 +kubectl create -f https://github.com/grafana/grafana-operator/releases/download/${GRAFANA_VERSION}/kustomize-cluster_scoped.yaml +---- +Wait until Grafana Operator is ready. +[source,shell,subs="attributes+"] +---- +kubectl wait --for=condition=Available=True deploy/grafana-operator-controller-manager -n grafana +---- +===== Deploy Grafana Instance +[source, yaml] +---- +cat << EOF | kubectl create -n default -f - +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: grafana + labels: + dashboards: "grafana" +spec: + config: + security: + admin_user: root + admin_password: secret +EOF +---- +===== Create a Grafana Datasource for Prometheus +[source, yaml] +---- +cat << EOF | kubectl create -n default -f - +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: example-grafanadatasource +spec: + datasource: + access: proxy + type: prometheus + jsonData: + timeInterval: 5s + tlsSkipVerify: true + name: Prometheus + url: http://prometheus-operated.default.svc.cluster.local:9090 + instanceSelector: + matchLabels: + dashboards: grafana +EOF +---- + +Wait until the Grafana server is ready. +[source,shell,subs="attributes+"] +---- +kubectl wait --for=condition=Available=True deployment/grafana-deployment -n default +---- + +===== Open Grafana Dashboard UI +Now you can forward local port number `3000` to the Grafana service. +[source,shell,subs="attributes+"] +---- +kubectl port-forward svc/grafana-service -n default 3000:3000 +---- +Open Grafana Dashboard UI in your web browser with the URL `http://localhost:30000`. Log in using with admin user name `root` and passward `secret`. + +[#workflow-monitoring] +== Workflows Monitoring +=== Enable monitoring in `SonataFlowPlatform` CR +When `SonataFlowPlatform` CR has `spec.monitoring.enabled` set, and Prometheus has been deployed in the cluster, {operator_name} will automatically create a service monitor for each workflow that is deployed as a Kubernetes Deployment object. The service monitor allows Prometheus to scrape the workflow related metrics from the workflow pod. +[source, yaml] +---- +apiVersion: sonataflow.org/v1alpha08 +kind: SonataFlowPlatform +metadata: + name: sonataflow-platform +spec: + monitoring: + enabled: true +---- + +[IMPORTANT] +==== +Currently, only those {product_name} workflows deployed as Kubernetes deployments have workflow related metrics exposed to Prometheus and are hence available for monitoring by Grafana Dashboards. Monitoring of {product_name} workflows deployed as Knative services is not supported and such serverless workflows are not included in the Grafana Dashboards. +==== + +=== Test Data Source Connection +In the Grafana UI, click `Connections` -> `Data sources`, and open the data source. Then click `Save & test` button to test the data source to make sure it can connect to the Prometheus server successfully. + +image::cloud/operator/monitoring/grafana-data-source-test.png[] + +=== Import the sample dashboard + +Click `+` -> `Import dashboard`, copy the json model data for xref::cloud/operator/grafana-dashbord-example.adoc[the sample dashboard] and then paste the data in the `Import via dashboard JSON model` text box, and then click `Load`. The sample dashboard is loaded. + +image::cloud/operator/monitoring/grafana-dashboard-example.png[] + +=== Customize or build your own dashboard +You can customize or build your own dashboard. For more information, see xref:https://grafana.com/docs/grafana/latest/dashboards[Grafana Dashboards] and xref:cloud/operator/sonataflow-metrics.adoc[SonataFlow Metrics]. + +== Additional resources + +* xref:cloud/operator/sonataflow-metrics.adoc[SonataFlow Metrics] +* xref:https://grafana.com/docs/grafana/latest/dashboards[Grafana Dashboards] + +include::../../../pages/_common-content/report-issue.adoc[] \ No newline at end of file diff --git a/serverlessworkflow/modules/ROOT/pages/cloud/operator/sonataflow-metrics.adoc b/serverlessworkflow/modules/ROOT/pages/cloud/operator/sonataflow-metrics.adoc new file mode 100644 index 000000000..25baa3f55 --- /dev/null +++ b/serverlessworkflow/modules/ROOT/pages/cloud/operator/sonataflow-metrics.adoc @@ -0,0 +1,99 @@ += SonataFlow Metrics + +== kogito_process_instance_completed_total +Workflow instances that have reached a terminal status, “Aborted” or “Completed”, and thus are +considered as completed. + +Note: These are the only two terminal status. The “Error” state is not terminal. + + +Additionally, the metric has the process_state="Completed", or could be "Aborted", to register exactly which of the two terminal status were reached. + +[source, yaml] +---- +# HELP kogito_process_instance_completed_total Completed Process Instances +# TYPE kogito_process_instance_completed_total counter +kogito_process_instance_completed_total{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",process_id="callbackstatetimeouts",process_state="Completed",version="1.0.0-SNAPSHOT",} 3.0 +---- + +== kogito_process_instance_started_total +Count the number of started workflow instances. + +[source, yaml] +---- +# HELP kogito_process_instance_started_total Started Process Instances +# TYPE kogito_process_instance_started_total counter +kogito_process_instance_started_total{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 7.0 +---- + +== kogito_process_instance_running_total +Records the number of running workflow instances. + +Note: This includes workflow instances that are in the "Error" state, since the error state is not a terminal state. + + +Only the process instances that has reached a terminal status, i.e. "Completed" or "Aborted", are not present in this metric. + +[source, yaml] +---- +# HELP kogito_process_instance_running_total Running Process Instances +# TYPE kogito_process_instance_running_total gauge +kogito_process_instance_running_total{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 4.0 +---- + +== kogito_process_instance_duration_seconds +Calculates duration of a workflow instance that has reached a terminal state,, i.e. "Aborted" or "Completed". This metric is registered when the process reaches the terminal state. + +[source, yaml] +---- +# HELP kogito_process_instance_duration_seconds_max Process Instances Duration +# TYPE kogito_process_instance_duration_seconds_max gauge + kogito_process_instance_duration_seconds_max{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 30.0 + + +# HELP kogito_process_instance_duration_seconds Process Instances Duration +# TYPE kogito_process_instance_duration_seconds summary + kogito_process_instance_duration_seconds_count{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 3.0 + kogito_process_instance_duration_seconds_sum{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 90.0 +---- + +== kogito_node_instance_duration_milliseconds +Records the duration of the execution for nodes “relevant” to the workflows. The metric is calculated when a given node has finished executing. + +[source, yaml] +---- +# HELP kogito_node_instance_duration_milliseconds_max Relevant nodes duration in milliseconds +# TYPE kogito_node_instance_duration_milliseconds_max gauge +kogito_node_instance_duration_milliseconds_max{artifactId="serverless-workflow-project",node_name="CallbackState",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 30014.0 + + +# HELP kogito_node_instance_duration_milliseconds Relevant nodes duration in milliseconds +# TYPE kogito_node_instance_duration_milliseconds summary +kogito_node_instance_duration_milliseconds_count{artifactId="serverless-workflow-project",node_name="CallbackState",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 3.0 +kogito_node_instance_duration_milliseconds_sum{artifactId="serverless-workflow-project",node_name="CallbackState",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 90128.0 +---- + +== kogito_process_instance_error +Records the number of errors that have occurred per processId and error, including the error message. + +[source, yaml] +---- +# HELP kogito_process_instance_error Number of errors that has occurred +# TYPE kogito_process_instance_error counter +---- + +== sonataflow_input_parameters_counter_total + +Records the occurrences of <"param_name", "param_value"> per processId. + +Note: parameters that are json values, or arrays are flattened. + +[source, yaml] +---- +# HELP sonataflow_input_parameters_counter_total Input parameters +# TYPE sonataflow_input_parameters_counter_total counter +sonataflow_input_parameters_counter_total{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",param_name="name",param_value="walter",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 1.0 +sonataflow_input_parameters_counter_total{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",param_name="surname.sur1",param_value="Medvedeo",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 1.0 +sonataflow_input_parameters_counter_total{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",param_name="name",param_value="bob",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 5.0 +sonataflow_input_parameters_counter_total{app_id="sonataflow-process-monitoring-listener",artifactId="serverless-workflow-project",param_name="surname",param_value="esponja",process_id="callbackstatetimeouts",version="1.0.0-SNAPSHOT",} 5.0 +----