Skip to content

Commit

Permalink
Use stacked panels in OTel dashboards (#6413)
Browse files Browse the repository at this point in the history
* Set explicit units in histograms

* Use stacked panels.
They give us more information about the total data for all pods.

* Fix units of batch histogram
  • Loading branch information
ptodev authored Mar 13, 2024
1 parent 40ba987 commit 4fd88f1
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 15 deletions.
4 changes: 2 additions & 2 deletions operations/agent-flow-mixin/dashboards/controller.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ local filename = 'agent-flow-controller.json';
//
// This panel supports both native and classic histograms, though it only shows one at a time.
(
panel.newNativeHistogramHeatmap('Component evaluation histogram') +
panel.newNativeHistogramHeatmap('Component evaluation histogram', 's') +
panel.withDescription(|||
Detailed histogram view of how long component evaluations take.
Expand All @@ -301,7 +301,7 @@ local filename = 'agent-flow-controller.json';
//
// This panel supports both native and classic histograms, though it only shows one at a time.
(
panel.newNativeHistogramHeatmap('Component dependency wait histogram') +
panel.newNativeHistogramHeatmap('Component dependency wait histogram', 's') +
panel.withDescription(|||
Detailed histogram of how long components wait to be evaluated after their dependency is updated.
Expand Down
27 changes: 17 additions & 10 deletions operations/agent-flow-mixin/dashboards/opentelemetry.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ local stackedPanelMixin = {
(
panel.new(title='Accepted spans', type='timeseries') +
panel.withDescription(|||
Number of spans successfully pushed into the pipeline.
|||) +
Number of spans successfully pushed into the pipeline.
|||) +
stackedPanelMixin +
panel.withPosition({ x: 0, y: 0, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
Expand All @@ -58,19 +59,19 @@ local stackedPanelMixin = {
panel.withDescription(|||
Number of spans that could not be pushed into the pipeline.
|||) +
stackedPanelMixin +
panel.withPosition({ x: 8, y: 0, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr=|||
rate(receiver_refused_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval])
rate(receiver_refused_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval])
|||,
legendFormat='{{ pod }} / {{ transport }}',
),
])
),
(
panel.newHeatmap('RPC server duration (traces)') +
panel.withUnit('milliseconds') +
panel.newHeatmap('RPC server duration', 'ms') +
panel.withDescription(|||
The duration of inbound RPCs.
|||) +
Expand All @@ -86,13 +87,14 @@ local stackedPanelMixin = {

// "Batching" row
(
panel.new('Batching [otelcol.processor.batch]', 'row') +
panel.new('Batching of logs, metrics, and traces [otelcol.processor.batch]', 'row') +
panel.withPosition({ h: 1, w: 24, x: 0, y: 10 })
),
(
panel.newHeatmap('Number of units in the batch') +
panel.newHeatmap('Number of units in the batch', 'short') +
panel.withUnit('short') +
panel.withDescription(|||
Number of units in the batch
Number of spans, metric datapoints, or log lines in a batch
|||) +
panel.withPosition({ x: 0, y: 10, w: 8, h: 10 }) +
panel.withQueries([
Expand All @@ -105,14 +107,17 @@ local stackedPanelMixin = {
),
(
panel.new(title='Distinct metadata values', type='timeseries') +
//TODO: Clarify what metadata means. I think it's the metadata in the HTTP headers?
//TODO: Mention that if this metric is too high, it could hit the metadata_cardinality_limit
//TODO: Make a metric for the current value of metadata_cardinality_limit and create an alert if the actual cardinality reaches it?
panel.withDescription(|||
Number of distinct metadata value combinations being processed
|||) +
panel.withPosition({ x: 8, y: 10, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr=|||
processor_batch_metadata_cardinality_ratio{cluster="$cluster", namespace="$namespace", instance=~"$instance"}
processor_batch_metadata_cardinality_ratio{cluster="$cluster", namespace="$namespace", instance=~"$instance"}
|||,
legendFormat='{{ pod }}',
),
Expand All @@ -127,7 +132,7 @@ local stackedPanelMixin = {
panel.withQueries([
panel.newQuery(
expr=|||
rate(processor_batch_timeout_trigger_send_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval])
rate(processor_batch_timeout_trigger_send_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval])
|||,
legendFormat='{{ pod }}',
),
Expand All @@ -144,6 +149,7 @@ local stackedPanelMixin = {
panel.withDescription(|||
Number of spans successfully sent to destination.
|||) +
stackedPanelMixin +
panel.withPosition({ x: 0, y: 20, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
Expand All @@ -159,6 +165,7 @@ local stackedPanelMixin = {
panel.withDescription(|||
Number of spans in failed attempts to send to destination.
|||) +
stackedPanelMixin +
panel.withPosition({ x: 8, y: 20, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
Expand Down
6 changes: 3 additions & 3 deletions operations/agent-flow-mixin/dashboards/utils/panel.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
},
},

newHeatmap(title=''):: $.new(title, 'heatmap') {
newHeatmap(title='', unit=''):: $.new(title, 'heatmap') {
maxDataPoints: 30,
options: {
calculate: false,
Expand All @@ -53,13 +53,13 @@
yHistogram: true,
},
yAxis: {
unit: 's',
unit: unit,
},
},
pluginVersion: '9.0.6',
},

newNativeHistogramHeatmap(title=''):: $.newHeatmap(title) {
newNativeHistogramHeatmap(title='', unit=''):: $.newHeatmap(title, unit) {
options+: {
cellGap: 0,
color: {
Expand Down

0 comments on commit 4fd88f1

Please sign in to comment.