Skip to content

Commit

Permalink
feat: update metrics (#1560)
Browse files Browse the repository at this point in the history
* Use Prometheus [naming best
  practices](https://prometheus.io/docs/practices/naming/).
* Enable native histograms.
* Update dashboards to support new and old names.

Signed-off-by: SuperQ <[email protected]>
Co-authored-by: Dimitri Herzog <[email protected]>
  • Loading branch information
SuperQ and 0xERR0R authored Aug 19, 2024
1 parent 02e3828 commit 57b1bdb
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 50 deletions.
28 changes: 15 additions & 13 deletions docs/blocky-grafana.json
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": false,
"expr": "blocky_build_info ",
"expr": "blocky_build_info",
"format": "table",
"instant": true,
"interval": "",
Expand Down Expand Up @@ -378,7 +378,7 @@
}
]
},
"unit": "ms"
"unit": "s"
},
"overrides": []
},
Expand Down Expand Up @@ -413,7 +413,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "sum(increase(blocky_request_duration_ms_sum[$__range])) / sum(increase(blocky_request_duration_ms_count[$__range]))",
"expr": "(histogram_avg(rate(blocky_blocky_request_duration_seconds[$__rate_interval]))\nor\n(sum(rate(blocky_blocky_request_duration_seconds_sum[$__rate_interval])) / sum(rate(blocky_blocky_request_duration_seconds_count[$__rate_interval])))\nor\n(sum(rate(blocky_request_duration_ms_sum[$__rate_interval])) / sum(rate(blocky_request_duration_ms_count[$__rate_interval])) / 1000)",
"format": "table",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -487,7 +487,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "sum(blocky_denylist_cache) / sum(up{job=~\"$job\"})",
"expr": "(sum(blocky_denylist_cache_entries) or sum(blocky_denylist_cache)) / sum(up{job=~\"$job\"})",
"format": "table",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -642,7 +642,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "sum(increase(blocky_response_total{response_type=\"BLOCKED\"}[$__range])) / sum(increase(blocky_query_total[$__range])) ",
"expr": "sum(rate(blocky_response_total{response_type=\"BLOCKED\"}[$__rate_interval])) / sum(rate(blocky_query_total[$__rate_interval])) ",
"format": "table",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -717,9 +717,9 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "ceil(sum(increase(blocky_query_total[$__range]))) ",
"expr": "ceil(sum(increase(blocky_query_total[$__range])))",
"format": "table",
"instant": false,
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
Expand Down Expand Up @@ -778,7 +778,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "sum(blocky_cache_entry_count)/ sum(up{job=~\"$job\"})",
"expr": "(sum(blocky_cache_entries) or sum(blocky_cache_entry_count)) / sum(up{job=~\"$job\"})",
"format": "table",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -845,7 +845,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "sum(increase(blocky_cache_hit_count[$__range])) / (sum(increase(blocky_cache_hit_count[$__range])) + sum(increase(blocky_cache_miss_count[$__range])))",
"expr": "(\n sum(rate(blocky_cache_hits_total[$__rate_interval]))\n /\n (sum(rate(blocky_cache_hits_total[$__rate_interval])) + sum(rate(blocky_cache_misses_total[$__rate_interval]))\n)\nor\n(\n sum(rate(blocky_cache_hit_count[$__rate_interval]))\n /\n (sum(rate(blocky_cache_hit_count[$__rate_interval])) + sum(rate(blocky_cache_miss_count[$__rate_interval]))\n)",
"format": "table",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -929,7 +929,7 @@
"exemplar": true,
"expr": "sum(increase(blocky_error_total[$__range]))",
"format": "table",
"instant": false,
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
Expand Down Expand Up @@ -988,8 +988,9 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "ceil(sum(increase(blocky_prefetch_count[$__range])))",
"expr": "ceil(sum(increase(blocky_prefetches_total[$__range]) or sum(increase(blocky_prefetch_count[$__range])))",
"format": "table",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
Expand Down Expand Up @@ -1052,8 +1053,9 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "sum(rate(blocky_prefetch_count[5m])) * 60",
"expr": "(sum(rate(blocky_prefetchs_total[$__range])) or sum(rate(blocky_prefetch_count[$__range]))) * 60",
"format": "table",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
Expand Down Expand Up @@ -1978,4 +1980,4 @@
"uid": "JvOqE4gRk",
"version": 1,
"weekStart": ""
}
}
26 changes: 15 additions & 11 deletions docs/prometheus_grafana.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,21 @@ Following metrics will be exported:

| name | Description |
| ------------------------------------------------ | -------------------------------------------------------- |
| blocky_denylist_cache / blocky_allowlist_cache | Number of entries in denylist/allowlist cache, partitioned by group |
| blocky_error_total | Number of total queries that ended in error for any reason |
| blocky_query_total | Number of total queries, partitioned by client and DNS request type (A, AAAA, PTR, etc) |
| blocky_request_duration_ms_bucket | Request duration histogram, partitioned by response type (Blocked, cached, etc) |
| blocky_response_total | Number of responses, partitioned by response type (Blocked, cached, etc), DNS response code, and reason |
| blocky_blocking_enabled | 1 if blocking is enabled, 0 otherwise |
| blocky_cache_entry_count | Number of entries in cache |
| blocky_cache_hit_count / blocky_cache_miss_count | Cache hit/miss counters |
| blocky_prefetch_count | Amount of prefetched DNS responses |
| blocky_prefetch_domain_name_cache_count | Amount of domain names being prefetched |
| blocky_failed_download_count | Number of failed list downloads |
| blocky_denylist_cache_entries | Gauge of entries in the denylist cache, partitioned by group |
| blocky_allowlist_cache_entries | Gauge of entries in the allowlist cache, partitioned by group |
| blocky_error_total | Counter of total queries that ended in error for any reason |
| blocky_query_total | Counter of total queries, partitioned by client and DNS request type (A, AAAA, PTR, etc) |
| blocky_blocky_request_duration_seconds | Histogram of request duration, partitioned by response type (Blocked, cached, etc) |
| blocky_response_total | Counter of responses, partitioned by response type (Blocked, cached, etc), DNS response code, and reason |
| blocky_blocking_enabled | Boolean 1 if blocking is enabled, 0 otherwise |
| blocky_cache_entries | Gauge of entries in cache |
| blocky_cache_hits_total | Counter of the number of cache hits |
| blocky_cache_misses_total                        | Counter of the number of cache misses |
| blocky_last_list_group_refresh_timestamp_seconds | Timestamp of last list refresh |
| blocky_prefetches_total | Counter of prefetched DNS responses |
| blocky_prefetch_hits_total | Counter of requests that hit the prefetch cache |
| blocky_prefetch_domain_name_cache_entries | Gauge of domain names being prefetched |
| blocky_failed_downloads_total | Counter of failed list downloads |

### Grafana dashboard

Expand Down
22 changes: 11 additions & 11 deletions e2e/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,9 @@ var _ = Describe("Metrics functional tests", func() {
BeforeEach(func(ctx context.Context) {
Eventually(fetchBlockyMetrics).WithArguments(ctx, metricsURL).
Should(ContainElements(
"blocky_cache_entry_count 0",
"blocky_cache_hit_count 0",
"blocky_cache_miss_count 0",
"blocky_cache_entries 0",
"blocky_cache_hits_total 0",
"blocky_cache_misses_total 0",
))
})

Expand All @@ -101,9 +101,9 @@ var _ = Describe("Metrics functional tests", func() {

Eventually(fetchBlockyMetrics).WithArguments(ctx, metricsURL).
Should(ContainElements(
"blocky_cache_entry_count 1",
"blocky_cache_hit_count 0",
"blocky_cache_miss_count 1",
"blocky_cache_entries 1",
"blocky_cache_hits_total 0",
"blocky_cache_misses_total 1",
))
})

Expand All @@ -117,9 +117,9 @@ var _ = Describe("Metrics functional tests", func() {

Eventually(fetchBlockyMetrics).WithArguments(ctx, metricsURL).
Should(ContainElements(
"blocky_cache_entry_count 1",
"blocky_cache_hit_count 1",
"blocky_cache_miss_count 1",
"blocky_cache_entries 1",
"blocky_cache_hits_total 1",
"blocky_cache_misses_total 1",
))
})
})
Expand All @@ -129,8 +129,8 @@ var _ = Describe("Metrics functional tests", func() {
It("Should expose list cache sizes per group as metrics", func(ctx context.Context) {
Eventually(fetchBlockyMetrics).WithArguments(ctx, metricsURL).
Should(ContainElements(
"blocky_denylist_cache{group=\"group1\"} 1",
"blocky_denylist_cache{group=\"group2\"} 3",
"blocky_denylist_cache_entries{group=\"group1\"} 1",
"blocky_denylist_cache_entries{group=\"group2\"} 3",
))
})
})
Expand Down
20 changes: 10 additions & 10 deletions metrics/metrics_event_publisher.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func enabledGauge() prometheus.Gauge {
func denylistGauge() *prometheus.GaugeVec {
denylistCnt := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "blocky_denylist_cache",
Name: "blocky_denylist_cache_entries",
Help: "Number of entries in the denylist cache",
}, []string{"group"},
)
Expand All @@ -97,7 +97,7 @@ func denylistGauge() *prometheus.GaugeVec {
func allowlistGauge() *prometheus.GaugeVec {
allowlistCnt := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "blocky_allowlist_cache",
Name: "blocky_allowlist_cache_entries",
Help: "Number of entries in the allowlist cache",
}, []string{"group"},
)
Expand All @@ -108,7 +108,7 @@ func allowlistGauge() *prometheus.GaugeVec {
func lastListGroupRefresh() prometheus.Gauge {
return prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "blocky_last_list_group_refresh",
Name: "blocky_last_list_group_refresh_timestamp_seconds",
Help: "Timestamp of last list refresh",
},
)
Expand Down Expand Up @@ -162,15 +162,15 @@ func registerCachingEventListeners() {

func failedDownloadCount() prometheus.Counter {
return prometheus.NewCounter(prometheus.CounterOpts{
Name: "blocky_failed_download_count",
Name: "blocky_failed_downloads_total",
Help: "Failed download counter",
})
}

func cacheHitCount() prometheus.Counter {
return prometheus.NewCounter(
prometheus.CounterOpts{
Name: "blocky_cache_hit_count",
Name: "blocky_cache_hits_total",
Help: "Cache hit counter",
},
)
Expand All @@ -179,7 +179,7 @@ func cacheHitCount() prometheus.Counter {
func cacheMissCount() prometheus.Counter {
return prometheus.NewCounter(
prometheus.CounterOpts{
Name: "blocky_cache_miss_count",
Name: "blocky_cache_misses_total",
Help: "Cache miss counter",
},
)
Expand All @@ -188,7 +188,7 @@ func cacheMissCount() prometheus.Counter {
func domainPrefetchCount() prometheus.Counter {
return prometheus.NewCounter(
prometheus.CounterOpts{
Name: "blocky_prefetch_count",
Name: "blocky_prefetches_total",
Help: "Prefetch counter",
},
)
Expand All @@ -197,7 +197,7 @@ func domainPrefetchCount() prometheus.Counter {
func domainPrefetchHitCount() prometheus.Counter {
return prometheus.NewCounter(
prometheus.CounterOpts{
Name: "blocky_prefetch_hit_count",
Name: "blocky_prefetch_hits_total",
Help: "Prefetch hit counter",
},
)
Expand All @@ -206,7 +206,7 @@ func domainPrefetchHitCount() prometheus.Counter {
func cacheEntryCount() prometheus.Gauge {
return prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "blocky_cache_entry_count",
Name: "blocky_cache_entries",
Help: "Number of entries in cache",
},
)
Expand All @@ -215,7 +215,7 @@ func cacheEntryCount() prometheus.Gauge {
func prefetchDomainCacheCount() prometheus.Gauge {
return prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "blocky_prefetch_domain_name_cache_count",
Name: "blocky_prefetch_domain_name_cache_entries",
Help: "Number of entries in domain cache",
},
)
Expand Down
15 changes: 10 additions & 5 deletions resolver/metrics_resolver.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ import (
"github.com/prometheus/client_golang/prometheus"
)

// nativeHistogramBucketFactor controls the resolution of native histograms.
// The value of 1.05 is slightly higher accuracy than the default of 1.1.
const nativeHistogramBucketFactor = 1.05

// MetricsResolver resolver that records metrics about requests/response
type MetricsResolver struct {
configurable[*config.Metrics]
Expand All @@ -35,14 +39,14 @@ func (r *MetricsResolver) Resolve(ctx context.Context, request *model.Request) (
"type": dns.TypeToString[request.Req.Question[0].Qtype],
}).Inc()

reqDurationMs := float64(time.Since(request.RequestTS).Milliseconds())
reqDuration := time.Since(request.RequestTS)
responseType := "err"

if response != nil {
responseType = response.RType.String()
}

r.durationHistogram.WithLabelValues(responseType).Observe(reqDurationMs)
r.durationHistogram.WithLabelValues(responseType).Observe(reqDuration.Seconds())

if err != nil {
r.totalErrors.Inc()
Expand Down Expand Up @@ -103,9 +107,10 @@ func totalErrorMetric() prometheus.Counter {
func durationHistogram() *prometheus.HistogramVec {
return prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "blocky_request_duration_ms",
Help: "Request duration distribution",
Buckets: []float64{5, 10, 20, 30, 50, 75, 100, 200, 500, 1000, 2000},
Name: "blocky_request_duration_seconds",
Help: "Request duration distribution",
Buckets: []float64{0.005, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1, 0.2, 0.5, 1.0, 2.0},
NativeHistogramBucketFactor: nativeHistogramBucketFactor,
},
[]string{"response_type"},
)
Expand Down

0 comments on commit 57b1bdb

Please sign in to comment.