-
Notifications
You must be signed in to change notification settings - Fork 9
/
checker.example.yml
182 lines (182 loc) · 8.93 KB
/
checker.example.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# The Redis connection mode is selected by the fields set in this section:
# 1. Set `master_name` to enable Redis Sentinel support
# 2. Specify two or more `addrs` to enable cluster support
# 3. Otherwise, a standalone configuration is used
redis:
# Sentinel master name
master_name: ""
# Comma-separated address list, format: {host1_name:port},{ip:port}
addrs: "localhost:6379"
# Redis username
username: "username"
# Redis password
password: "password"
# Redis Sentinel username
sentinel_username: "sentinel_username"
# Redis Sentinel password
sentinel_password: "sentinel_password"
# Moira deletes metrics older than this value from Redis. Large values will cause various problems across the whole system.
# See https://github.com/moira-alert/moira/pull/519
metrics_ttl: 3h
# Dial timeout for establishing new connections.
# Default is 500 milliseconds.
dial_timeout: 1s
# Timeout for socket reads. If it is reached, commands fail
# with a timeout instead of blocking. Default is 3 seconds.
# Omit this setting or set it to 0 to use the default.
read_timeout: 5s
# Timeout for socket writes. If it is reached, commands fail
# with a timeout instead of blocking. Default is ReadTimeout.
# Omit this setting or set it to 0 to use the default.
write_timeout: 5s
# Enables read-only commands on slave nodes.
# This flag has no effect unless `route_randomly` or `route_by_latency` is set to true.
read_only: true
# Allows routing read-only commands to a **random** master or slave node.
# Setting it automatically enables ReadOnly.
route_randomly: true
# Allows routing read-only commands to the **closest** master or slave node.
# Setting it automatically enables ReadOnly.
route_by_latency: false
# Minimum backoff between retries. Used to calculate exponential backoff. Default value is 0.
min_retry_backoff: 1s
# Maximum backoff between retries. Used to calculate exponential backoff. Default value is 0.
max_retry_backoff: 10s
# Maximum number of attempts to find an alive node (in Redis cluster)
max_redirects: 4
# Telemetry settings: the shared listen address, pprof, and the graphite sender.
telemetry:
# Common port for all telemetry data: Prometheus scraping, pprof, etc.
listen: ":8091"
pprof:
# If true, pprof is enabled on the common telemetry port.
enabled: false
graphite:
# If true, the graphite sender is enabled.
enabled: true
# If true, runtime stats are captured and sent to graphite.
# Note: capturing runtime stats requires a stoptheworld() call at every configured
# "graphite.interval" (https://golang.org/src/runtime/mstats.go)
runtime_stats: false
# Graphite relay URI, format: ip:port
uri: "graphite-relay:2003"
# Moira metrics prefix. Use 'prefix: {hostname}' to use the hostname autoresolver.
prefix: DevOps.moira
# Interval between metric sends
interval: 60s
# Trigger checking intervals and thresholds.
checker:
# Minimum period between trigger re-checks. Note: reducing this value increases CPU and memory usage.
# Used to be `checker.check_interval` before v2.10
metric_event_trigger_check_interval: 30s
# Moira 2.4 introduced a new entity: the Lazy Trigger. It is a regular trigger that has no subscriptions.
# By default Moira treats all triggers equally, regardless of how many subscriptions they have.
# You can change this behaviour with the option below. This can reduce CPU usage on your server.
# The lazy triggers checker works if lazy_triggers_check_interval > check_interval. We recommend setting it to 10m.
lazy_triggers_check_interval: 10m
# Period after which forced checks for all triggers are cancelled if no metrics were received (a value greater than 'local.check_interval')
stop_checking_interval: 3600s
# Time after which a check is considered critically slow. Such checks are logged for debugging purposes.
# Default: 1h
critical_time_of_check: 10m
# Settings for checks against the local metric source.
local:
# Period at which every non-lazy trigger is checked
# Used to be `checker.nodata_check_interval` before v2.10
check_interval: 60s
# Defaults to the number of processor cores found on the Moira host; the same value is used when this is set to 0.
max_parallel_checks: 512
# This section configures the list of graphite remote trigger sources.
# See https://moira.readthedocs.io/en/latest/installation/configuration.html#graphite-remote-triggers-checker for further information
# Used to be `remote` and contained only one source before v2.10
graphite_remote:
- # Unique cluster id (no other graphite_remote cluster may use the same one)
# Use `default` for compatibility with old triggers that have no cluster_id
# Should not be changed if any triggers are using it
cluster_id: default
# Cluster name to be displayed in the UI
cluster_name: Graphite Remote
# URL of the Graphite HTTP API: graphite-web, carbonapi, etc.
# Specify the full URL, including '/render'
url: "http://graphite.example.com/render"
# Auth username. Only Basic auth is supported
user: graphite_admin
# Auth password. Only Basic auth is supported
password: verySecurePassword
# Minimum period between trigger re-checks.
# Note: reducing this value increases CPU and memory usage and puts extra load on the Graphite HTTP API
check_interval: 60s
# Don't fetch metrics older than this value from remote storage
metrics_ttl: 168h
# Maximum timeout for an HTTP request made to the Graphite HTTP API
timeout: 60s
# Defaults to the number of processor cores found on the Moira host; the same value is used when this is set to 0.
max_parallel_checks: 0
# From 2.14.0
# Retry configuration for requests that fetch data.
# Library used for calculating exponential backoff retries: https://pkg.go.dev/github.com/cenkalti/backoff/v4
retries:
# Initial interval before the first retry attempt
initial_interval: 60s
# Used to calculate the interval between retries: RandomizedRetryInterval = RetryInterval *
#   (random value in the range [1 - randomization_factor, 1 + randomization_factor])
randomization_factor: 0.5
# Used to calculate the next interval: RetryInterval = RetryInterval * multiplier
multiplier: 1.5
# Caps RetryInterval (NOT RandomizedRetryInterval)
max_interval: 120s
# Stop retrying once max_retries_count retries have been performed.
# At least one of (max_retries_count, max_elapsed_time) should be specified
max_retries_count: 3
# Stop retrying once the time elapsed since the first try exceeds max_elapsed_time.
# At least one of (max_retries_count, max_elapsed_time) should be specified
max_elapsed_time: 360s
# From 2.14.0
# Maximum timeout for healthcheck requests
# NOTE(review): the key is spelled `heathcheck_timeout` (no "l" in "health") — confirm it matches the key Moira actually reads
heathcheck_timeout: 60s
# From 2.14.0
# Retry configuration for healthcheck requests (same fields as for retries)
# NOTE(review): the key is spelled `heathcheck_retries` (no "l" in "health") — confirm it matches the key Moira actually reads
heathcheck_retries:
# Initial interval before the first retry attempt
initial_interval: 10s
# Used to calculate the interval between retries: RandomizedRetryInterval = RetryInterval *
#   (random value in the range [1 - randomization_factor, 1 + randomization_factor])
randomization_factor: 0.5
# Used to calculate the next interval: RetryInterval = RetryInterval * multiplier
multiplier: 1.5
# Caps RetryInterval (NOT RandomizedRetryInterval)
max_interval: 60s
# Stop retrying once max_retries_count retries have been performed.
# At least one of (max_retries_count, max_elapsed_time) should be specified
max_retries_count: 3
# Stop retrying once the time elapsed since the first try exceeds max_elapsed_time.
# At least one of (max_retries_count, max_elapsed_time) should be specified
max_elapsed_time: 200s
# This section configures the list of prometheus remote trigger sources.
# See https://moira.readthedocs.io/en/latest/installation/configuration.html#prometheus-remote-triggers-checker for further information
# Used to be `prometheus` and contained only one source before v2.10
prometheus_remote:
- # Unique cluster id (no other prometheus_remote cluster may use the same one)
# Use `default` for compatibility with old triggers that have no cluster_id
# Should not be changed if any triggers are using it
cluster_id: default
# Cluster name to be displayed in the UI
cluster_name: Prometheus Remote
# URL of the Prometheus HTTP API: Prometheus or VMSelect
# Specify only the scheme and domain name, without a URL path
url: https://prometheus.example.com
# Auth username. Only Basic auth is supported
user: prometheus_admin
# Auth password. Only Basic auth is supported
password: verySecurePassword
# Minimum period between trigger re-checks.
# Note: reducing this value increases CPU and memory usage and puts extra load on the Prometheus HTTP API
check_interval: "60s"
# Maximum timeout for an HTTP request made to the Prometheus HTTP API
timeout: "10s"
# Number of times a failed request is retried
retries: 3
# Delay between retries
retry_timeout: "3s"
# Don't fetch metrics older than this value from remote storage
metrics_ttl: "168h"
# Defaults to the number of processor cores found on the Moira host; the same value is used when this is set to 0.
max_parallel_checks: 0
# Logging settings.
log:
# Log destination; this example uses `stdout`. Presumably a file path is also accepted — TODO confirm.
log_file: stdout
# Log verbosity level; this example uses `info`.
log_level: info