Skip to content

Commit

Permalink
Merge pull request #19 from w3f/substrate-alertrules
Browse files (browse the repository at this point in the history)
Substrate alertrules
  • Loading branch information
ironoa authored Sep 2, 2020
2 parents 674b8b5 + cb71b25 commit fef4726
Show file tree
Hide file tree
Showing 7 changed files with 8 additions and 213 deletions.
20 changes: 5 additions & 15 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ version: 2
jobs:
buildImage:
docker:
- image: web3f/ci-commons:v2.1.3
- image: web3f/ci-commons:v2.4.0
steps:
- checkout
- setup_remote_docker
Expand All @@ -13,7 +13,7 @@ jobs:
publishChart:
docker:
- image: web3f/ci-commons:v2.1.3
- image: web3f/ci-commons:v2.4.0
steps:
- checkout
- run:
Expand All @@ -22,20 +22,9 @@ jobs:
testPrometheusRules:
docker:
- image: web3f/ci-commons:v2.1.3
- image: web3f/ci-commons:v2.4.0
steps:
- checkout
- run:
name: Install missing dependencies
command: |
YQ_VER=3.3.0
wget -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/${YQ_VER}/yq_linux_amd64
chmod +x /usr/local/bin/yq
PROM_VER=2.18.1
wget -O /tmp/prometheus.tgz https://github.com/prometheus/prometheus/releases/download/v${PROM_VER}/prometheus-${PROM_VER}.linux-amd64.tar.gz
tar -xvf /tmp/prometheus.tgz prometheus-${PROM_VER}.linux-amd64/promtool -C /tmp
mv /tmp/prometheus-$PROM_VER.linux-amd64/promtool /usr/local/bin/
- run:
command: |
scripts/test_prometheus_rules.sh
Expand All @@ -54,11 +43,12 @@ workflows:
tags:
only: /.*/
- publishChart:
context: github-bot
requires:
- buildImage
- testPrometheusRules
filters:
branches:
ignore: /.*/
tags:
only: /v[0-9]+(\.[0-9]+)*/
only: /^v(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$/
Empty file added .gitignore
Empty file.
2 changes: 1 addition & 1 deletion charts/polkadot-base-services/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
description: Base services for Polkadot Node chart.
name: polkadot-base-services
version: v0.31.0
version: v0.32.0
apiVersion: v2
2 changes: 1 addition & 1 deletion charts/polkadot-secrets/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
description: Polkadot secrets
name: polkadot-secrets
version: v0.31.0
version: v0.32.0
apiVersion: v2
2 changes: 1 addition & 1 deletion charts/polkadot/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
description: Polkadot Node chart.
name: polkadot
version: v0.31.0
version: v0.32.0
apiVersion: v2
72 changes: 0 additions & 72 deletions charts/polkadot/templates/alertrules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,76 +9,4 @@ spec:
groups:
- name: polkadot.rules
rules:
- alert: PolkadotLowNumberOfPeersShort
annotations:
message: 'The node has less than 3 peers for 3 minutes'
# runbook_url: "https://github.com/w3f/infrastructure/wiki/<LINK>"
expr: polkadot_sub_libp2p_peers_count < 3
for: 3m
labels:
severity: warning
origin: {{ .Values.deploymentName }}
- alert: PolkadotLowNumberOfPeersLong
annotations:
message: 'The node has less than 3 peers for 15 minutes'
# runbook_url: "https://github.com/w3f/infrastructure/wiki/<LINK>"
expr: polkadot_sub_libp2p_peers_count < 3
for: 15m
labels:
severity: critical
origin: {{ .Values.deploymentName }}
- alert: PolkadotTransactionQueueSizeShort
annotations:
message: 'The node has more than 10 transactions in the queue for more than 10 minutes'
# runbook_url: "https://github.com/w3f/infrastructure/wiki/<LINK>"
expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
for: 10m
labels:
severity: warning
origin: {{ .Values.deploymentName }}
- alert: PolkadotTransactionQueueSizeLong
annotations:
message: 'The node has more than 10 transactions in the queue for more than 30 minutes'
# runbook_url: "https://github.com/w3f/infrastructure/wiki/<LINK>"
expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
for: 30m
labels:
severity: critical
origin: {{ .Values.deploymentName }}
- alert: PolkadotLowNumberOfNewBlocksShort
annotations:
message: 'The number of new blocks has not increased for the last 3 minutes'
# runbook_url: "https://github.com/w3f/infrastructure/wiki/<LINK>"
expr: increase(polkadot_block_height{status="best"}[1m]) == 0
for: 3m
labels:
severity: warning
origin: {{ .Values.deploymentName }}
- alert: PolkadotLowNumberOfNewBlocksLong
annotations:
message: 'The number of new blocks has not increased for the last 10 minutes'
# runbook_url: "https://github.com/w3f/infrastructure/wiki/<LINK>"
expr: increase(polkadot_block_height{status="best"}[1m]) == 0
for: 10m
labels:
severity: critical
origin: {{ .Values.deploymentName }}
- alert: PolkadotLowNumberOfFinalizedBlocksShort
annotations:
message: 'The number of finalized blocks has not increased for the last 3 minutes'
# runbook_url: "https://github.com/w3f/infrastructure/wiki/<LINK>"
expr: increase(polkadot_block_height{status="finalized"}[1m]) == 0
for: 3m
labels:
severity: warning
origin: {{ .Values.deploymentName }}
- alert: PolkadotLowNumberOfFinalizedBlocksLong
annotations:
message: 'The number of finalized blocks has not increased for the last 10 minutes'
# runbook_url: "https://github.com/w3f/infrastructure/wiki/<LINK>"
expr: increase(polkadot_block_height{status="finalized"}[1m]) == 0
for: 10m
labels:
severity: critical
origin: {{ .Values.deploymentName }}
{{ end }}
123 changes: 0 additions & 123 deletions tests/prometheus/polkadot/alertrules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,126 +20,3 @@ tests:
values: '0+20x5 100+0x5' # 0 20 40 60 80 100 100 100 100 100 100

alert_rule_test:
# Test LowNumberOfPeersShort alert
- eval_time: 3m # Values: 3 2 2
alertname: PolkadotLowNumberOfPeersShort
exp_alerts:
- eval_time: 4m # Values: 2 2 2
alertname: PolkadotLowNumberOfPeersShort
exp_alerts:
- exp_labels:
severity: warning
origin: testnet-0
pod: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The node has less than 3 peers for 3 minutes"

# Test LowNumberOfPeersLong alert
- eval_time: 15m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
alertname: PolkadotLowNumberOfPeersLong
exp_alerts:
- eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
alertname: PolkadotLowNumberOfPeersLong
exp_alerts:
- exp_labels:
severity: critical
origin: testnet-0
pod: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The node has less than 3 peers for 15 minutes"

# Test TransactionQueueSizeShort alert
- eval_time: 10m
alertname: PolkadotTransactionQueueSizeShort
exp_alerts:
- eval_time: 11m
alertname: PolkadotTransactionQueueSizeShort
exp_alerts:
- exp_labels:
severity: warning
origin: testnet-0
pod: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The node has more than 10 transactions in the queue for more than 10 minutes"

# Test TransactionQueueSizeLong alert
- eval_time: 30m
alertname: PolkadotTransactionQueueSizeLong
exp_alerts:
- eval_time: 31m
alertname: PolkadotTransactionQueueSizeLong
exp_alerts:
- exp_labels:
severity: critical
origin: testnet-0
pod: polkadot-abcdef01234-abcdef
job: polkadot
exp_annotations:
message: "The node has more than 10 transactions in the queue for more than 30 minutes"

# Test LowNumberOfNewBlocksShort alert
- eval_time: 6m
alertname: PolkadotLowNumberOfNewBlocksShort
exp_alerts:
- eval_time: 7m
alertname: PolkadotLowNumberOfNewBlocksShort
exp_alerts:
- exp_labels:
severity: warning
origin: testnet-0
pod: polkadot-abcdef01234-abcdef
job: polkadot
status: best
exp_annotations:
message: "The number of new blocks has not increased for the last 3 minutes"

# Test LowNumberOfNewBlocksLong alert
- eval_time: 13m
alertname: PolkadotLowNumberOfNewBlocksLong
exp_alerts:
- eval_time: 14m
alertname: PolkadotLowNumberOfNewBlocksLong
exp_alerts:
- exp_labels:
severity: critical
origin: testnet-0
pod: polkadot-abcdef01234-abcdef
job: polkadot
status: best
exp_annotations:
message: "The number of new blocks has not increased for the last 10 minutes"

# Test LowNumberOfFinalizedBlocksShort alert
- eval_time: 6m
alertname: PolkadotLowNumberOfFinalizedBlocksShort
exp_alerts:
- eval_time: 7m
alertname: PolkadotLowNumberOfFinalizedBlocksShort
exp_alerts:
- exp_labels:
severity: warning
origin: testnet-0
pod: polkadot-abcdef01234-abcdef
job: polkadot
status: finalized
exp_annotations:
message: "The number of finalized blocks has not increased for the last 3 minutes"

# Test LowNumberOfFinalizedBlocksLong alert
- eval_time: 13m
alertname: PolkadotLowNumberOfFinalizedBlocksLong
exp_alerts:
- eval_time: 14m
alertname: PolkadotLowNumberOfFinalizedBlocksLong
exp_alerts:
- exp_labels:
severity: critical
origin: testnet-0
pod: polkadot-abcdef01234-abcdef
job: polkadot
status: finalized
exp_annotations:
message: "The number of finalized blocks has not increased for the last 10 minutes"

0 comments on commit fef4726

Please sign in to comment.