From 8498b76b238e6cdf08cebf6be55c3c621c7ca94f Mon Sep 17 00:00:00 2001 From: kogeler Date: Fri, 6 Oct 2023 14:19:38 +0300 Subject: [PATCH] node_backup and state_exporter roles were added Signed-off-by: kogeler --- .github/workflows/pr-node-backup.yml | 18 ++ galaxy.yml | 2 +- roles/node_backup/.yamllint | 33 +++ roles/node_backup/README.md | 8 + roles/node_backup/defaults/main.yml | 40 ++++ roles/node_backup/files/exporter.py | 202 ++++++++++++++++ roles/node_backup/handlers/main.yml | 15 ++ roles/node_backup/molecule/default/README.md | 25 ++ .../molecule/default/collections.yml | 4 + .../node_backup/molecule/default/converge.yml | 7 + .../molecule/default/group_vars/all.yml | 35 +++ .../node_backup/molecule/default/molecule.yml | 31 +++ .../node_backup/molecule/default/prepare.yml | 43 ++++ roles/node_backup/molecule/default/verify.yml | 47 ++++ roles/node_backup/tasks/exporter.yml | 37 +++ roles/node_backup/tasks/job.yml | 51 ++++ roles/node_backup/tasks/main.yml | 49 ++++ roles/node_backup/tasks/requirements.yml | 48 ++++ roles/node_backup/tasks/tests.yml | 31 +++ .../node_backup/templates/common-backup.sh.j2 | 6 + .../templates/node-backup-exporter.service.j2 | 12 + .../templates/node-backup.service.j2 | 6 + .../templates/node-backup.timer.j2 | 11 + .../templates/rclone/rclone.conf.j2 | 19 ++ .../node_backup/templates/single-backup.sh.j2 | 218 ++++++++++++++++++ roles/node_backup/vars/main.yml | 17 ++ roles/state_exporter/defaults/main.yml | 6 + roles/state_exporter/files/exporter.py | 170 ++++++++++++++ roles/state_exporter/handlers/main.yml | 8 + roles/state_exporter/tasks/main.yml | 53 +++++ roles/state_exporter/templates/.service.j2 | 13 ++ 31 files changed, 1264 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pr-node-backup.yml create mode 100644 roles/node_backup/.yamllint create mode 100644 roles/node_backup/README.md create mode 100644 roles/node_backup/defaults/main.yml create mode 100644 roles/node_backup/files/exporter.py create mode 100644 roles/node_backup/handlers/main.yml create mode 100644 roles/node_backup/molecule/default/README.md create mode 100644 roles/node_backup/molecule/default/collections.yml create mode 100644 roles/node_backup/molecule/default/converge.yml create mode 100644 roles/node_backup/molecule/default/group_vars/all.yml create mode 100644 roles/node_backup/molecule/default/molecule.yml create mode 100644 roles/node_backup/molecule/default/prepare.yml create mode 100644 roles/node_backup/molecule/default/verify.yml create mode 100644 roles/node_backup/tasks/exporter.yml create mode 100644 roles/node_backup/tasks/job.yml create mode 100644 roles/node_backup/tasks/main.yml create mode 100644 roles/node_backup/tasks/requirements.yml create mode 100644 roles/node_backup/tasks/tests.yml create mode 100644 roles/node_backup/templates/common-backup.sh.j2 create mode 100644 roles/node_backup/templates/node-backup-exporter.service.j2 create mode 100644 roles/node_backup/templates/node-backup.service.j2 create mode 100644 roles/node_backup/templates/node-backup.timer.j2 create mode 100644 roles/node_backup/templates/rclone/rclone.conf.j2 create mode 100644 roles/node_backup/templates/single-backup.sh.j2 create mode 100644 roles/node_backup/vars/main.yml create mode 100644 roles/state_exporter/defaults/main.yml create mode 100644 roles/state_exporter/files/exporter.py create mode 100644 roles/state_exporter/handlers/main.yml create mode 100644 roles/state_exporter/tasks/main.yml create mode 100644 roles/state_exporter/templates/.service.j2 
diff --git a/.github/workflows/pr-node-backup.yml b/.github/workflows/pr-node-backup.yml new file mode 100644 index 0000000..76a9157 --- /dev/null +++ b/.github/workflows/pr-node-backup.yml @@ -0,0 +1,18 @@ +name: check PR (node_backup) + +on: + pull_request: + paths: + - roles/node_backup/** + - .github/** + +jobs: + run-molecule-tests: + strategy: + fail-fast: false + matrix: + molecule-driver: [lxd, docker] + uses: ./.github/workflows/reusable-molecule.yml + with: + role-name: node_backup + molecule-driver: ${{ matrix.molecule-driver }} \ No newline at end of file diff --git a/galaxy.yml b/galaxy.yml index 183fe0c..6bcca8d 100644 --- a/galaxy.yml +++ b/galaxy.yml @@ -8,7 +8,7 @@ namespace: paritytech name: chain # The version of the collection. Must be compatible with semantic versioning -version: 1.5.1 +version: 1.6.0 # The path to the Markdown (.md) readme file. This path is relative to the root of the collection readme: README.md diff --git a/roles/node_backup/.yamllint b/roles/node_backup/.yamllint new file mode 100644 index 0000000..8827676 --- /dev/null +++ b/roles/node_backup/.yamllint @@ -0,0 +1,33 @@ +--- +# Based on ansible-lint config +extends: default + +rules: + braces: + max-spaces-inside: 1 + level: error + brackets: + max-spaces-inside: 1 + level: error + colons: + max-spaces-after: -1 + level: error + commas: + max-spaces-after: -1 + level: error + comments: disable + comments-indentation: disable + document-start: disable + empty-lines: + max: 3 + level: error + hyphens: + level: error + indentation: disable + key-duplicates: enable + line-length: disable + new-line-at-end-of-file: disable + new-lines: + type: unix + trailing-spaces: disable + truthy: disable diff --git a/roles/node_backup/README.md b/roles/node_backup/README.md new file mode 100644 index 0000000..7b3855b --- /dev/null +++ b/roles/node_backup/README.md @@ -0,0 +1,8 @@ +node_backup +========= +This role templates out the backup script and the backup Prometheus exporter. It also creates the relevant systemd units.
+The nodes that we deploy on the same instance are normal Substrate nodes that sync the chain. +The backup is made from the local database. These nodes don't have to do any work other than synchronization.
+Nodes are stopped during the backup of the given chain because otherwise the database would keep changing during +the backup and corrupt it. +
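+A minimal usage sketch (all values are illustrative; the `parity.chain` collection namespace is assumed, matching the molecule setup):
+```yaml
+- hosts: backup_hosts
+  become: true
+  tasks:
+    - name: Deploy backup jobs
+      ansible.builtin.include_role:
+        name: parity.chain.node_backup
+      vars:
+        node_backup_user: "polkadot"
+        node_backup_targets:
+          - service_name: polkadot-rocksdb-prune
+            local_path: /opt/polkadot-rocksdb-prune/chains/polkadot/db
+            rpc_port: 9934
+            type: gcp-rclone
+            bucket_name: "backup"
+```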

diff --git a/roles/node_backup/defaults/main.yml b/roles/node_backup/defaults/main.yml new file mode 100644 index 0000000..d04cd27 --- /dev/null +++ b/roles/node_backup/defaults/main.yml @@ -0,0 +1,40 @@ +--- + + +# R2 configuration +node_backup_r2_access_key_id: "" +node_backup_r2_secret_access_key: "" +node_backup_r2_api_url: "" + +node_backup_max_concurrent_requests: 50 + +node_backup_schedule: + - "*-*-* 01:00:00" + +node_backup_user: "polkadot" + +node_backup_base_path: "/opt/node_backup" +node_backup_tmp_path: "/tmp" + +# It wipes the local cache of the node-backup exporter. +# It's useful if you rename or remove some backups from the 'node_backup_targets' variable +node_backup_wipe_cache_enable: false + +# List of the nodes deployed to the host +# service_name is used to extract information about the DB type and should follow: +# <node_chain>-[paritydb|rocksdb]-[prune|archive] +# where `node_chain` is the value of the `node_chain` variable from the `node` role. +node_backup_targets: [] +# - service_name: polkadot-rocksdb-prune +# local_path: /opt/polkadot-rocksdb-prune/chains/polkadot/db +# rpc_port: 9934 +# # the old way of making backups. It takes more time to back up and restore +# # it's true by default +# tar: false +# # type of backup. Can be 'gcp-native', 'gcp-rclone' or 'r2-rclone' +# type: 'gcp-rclone' +# # name of the bucket +# bucket_name: "backup" +# # the public domain name of the bucket +# # it's empty by default +# bucket_domain: "backup.polkadot.io" \ No newline at end of file diff --git a/roles/node_backup/files/exporter.py b/roles/node_backup/files/exporter.py new file mode 100644 index 0000000..338263f --- /dev/null +++ b/roles/node_backup/files/exporter.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import sys +import pickle +import json +import logging +import threading +import traceback +import io +from http.server import BaseHTTPRequestHandler, HTTPServer +from prometheus_client import start_http_server, Gauge + + +LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + + +cache_filename = os.path.dirname(__file__) + '/exporter.cache' + +backup_labels = ['id', 'storage', 'bucket_name', 'service_name', 'version'] +backup_metrics = { + "timestamp": Gauge('node_backup_timestamp', + 'Time of the last backup (unix timestamp)', + backup_labels), + "size": Gauge('node_backup_size', + 'Size of the last backup (byte)', + backup_labels), + "last_block": Gauge('node_backup_last_block', + 'Last block in the last backup (block number)', + backup_labels), + "last_backup": Gauge('node_backup_last_backup', + 'Last backup', + backup_labels + ['backup_name', 'tar_backup_path', 'backup_path']), + "total_size": Gauge('node_backup_total_size', + 'Size of all backups (byte)', + ['storage', 'bucket_name']) +} + + +def update_cache(key, value): + if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 0: + with open(cache_filename, 'rb') as f: + data = pickle.load(f) + else: + data = {} + data[key] = value + with open(cache_filename, 'wb') as f: + pickle.dump(data, f) + + +def fetch_cache(): + if os.path.exists(cache_filename) and os.path.getsize(cache_filename) > 0: + with open(cache_filename, 'rb') as f: + data = pickle.load(f) + logging.info(f"Fetched from cache: {data}") + return data + else: + return {} + + +def clean_metrics(id, backup_name, version): + """ + Purge records with old versions + """ + + def check_record(key_value) -> bool: + return ( + id in key_value['labels'] and + key_value['name'] != 'node_backup_total_size' and + ( 
(key_value['name'] == 'node_backup_last_backup' and backup_name not in key_value['labels']) or + version not in key_value['labels'] + ) + ) + + for metric in backup_metrics.items(): + current_metrics=[{'name': i.name, 'labels': list(i.labels.values()), 'value': i.value} for i in metric[1].collect()[0].samples] + old_metrics = list(filter(check_record, current_metrics)) + for old_metric in old_metrics: + logging.info(f"clean {old_metric['name']} metric with label set: {str(old_metric['labels'])}") + metric[1].remove(*old_metric['labels']) + + +def set_metrics(data): + id = f"{data['storage']}-{data['bucketName']}-{data['serviceName']}" + common_labels={'id': id, + 'storage': data['storage'], + 'bucket_name': data['bucketName'], + 'service_name': data['serviceName'], + 'version': data['version']} + if data['bucketDomain'] != '': + backup_path=f"https://{data['bucketDomain']}/{data['serviceName']}/{data['backupName']}" + tar_backup_path=f"https://{data['bucketDomain']}/tar/{data['serviceName']}/{data['backupName']}.tar" + elif data['bucketDomain'] == '' and data['storage'] == 'gcp': + backup_path=f"gs://{data['bucketName']}/{data['serviceName']}/{data['backupName']}" + tar_backup_path=f"https://storage.googleapis.com/{data['bucketName']}/tar/{data['serviceName']}/{data['backupName']}.tar" + else: + raise Exception("'bucketDomain' has to be defined") + clean_metrics(id, data['backupName'], data['version']) + backup_metrics['timestamp'].labels(**common_labels).set(int(data['timeStamp'])) + backup_metrics['size'].labels(**common_labels).set(int(data['size'])) + backup_metrics['last_block'].labels(**common_labels).set(int(data['lastBlock'])) + backup_metrics['last_backup'].labels(**common_labels, + backup_name=data['backupName'], + backup_path=backup_path, + tar_backup_path=tar_backup_path).set(1) + backup_metrics['total_size'].labels(storage=data['storage'], + bucket_name=data['bucketName']).set(int(data['totalSize'])) + update_cache((data['storage'], data['bucketName'], data['serviceName']), data) + logging.info(f"request was processed successfully. data: {data}") + + +class HttpProcessor(BaseHTTPRequestHandler): + """ + HTTP Server + """ + BaseHTTPRequestHandler.server_version = 'Python API' + + def log_message(self, format, *args): + message = f"{self.address_string()} {format % args}" + logging.info(message) + + def _set_headers(self): + self.send_response(200) + self.send_header('Content-type', 'application/json; charset=utf-8') + self.end_headers() + + + def do_POST(self): + if self.headers.get('Content-Type') != 'application/json': + self.send_error(400, "Only application/json supported") + return + data = "" + try: + # read the message and convert it into a python dictionary + length = int(self.headers['content-length']) + data = self.rfile.read(length) + + set_metrics(json.loads(data)) + + # _set_headers sends the 200 status line and the JSON content type + self._set_headers() + self.wfile.write(json.dumps({"status": "OK"}).encode("utf8")) + except json.decoder.JSONDecodeError as e: + tb_output = io.StringIO() + traceback.print_tb(e.__traceback__, file=tb_output) + logging.error(f"JSON decoding error. error: '{e}', JSON: '{data}'") + logging.error(f"JSON decoding error. traceback:\n{tb_output.getvalue()}") + tb_output.close() + self.send_error(400, 'JSONDecodeError') + return + except Exception as e: + tb_output = io.StringIO() + traceback.print_tb(e.__traceback__, file=tb_output) + logging.error(f"request processing error. error: '{e}'") + logging.error(f"request processing error. 
traceback:\n{tb_output.getvalue()}") + tb_output.close() + self.send_error(500) + return + + +def start_servers(): + """ + Start HTTP Servers + """ + # Start up the server to expose the metrics. + start_http_server(9109) # Metrics server + server_address = ('127.0.0.1', 60101) # Data reception server + server = HTTPServer(server_address, HttpProcessor) + server.serve_forever() + + +if __name__ == '__main__': + + # set up console log handler + console = logging.StreamHandler() + console.setLevel(logging.INFO) + formatter = logging.Formatter(LOGGING_FORMAT) + console.setFormatter(formatter) + # set up basic logging config + logging.basicConfig(format=LOGGING_FORMAT, level=logging.INFO, handlers=[console]) + + + for backup in fetch_cache().items(): + try: + set_metrics(backup[1]) + except KeyError as e: + logging.error(f"cache fetching error. error: {e}, key: {backup[0]}, value: {backup[1]}") + except Exception as e: + tb_output = io.StringIO() + traceback.print_tb(e.__traceback__, file=tb_output) + logging.error(f"cache fetching error. error: '{e}'") + logging.error(f"cache fetching error. traceback:\n{tb_output.getvalue()}") + tb_output.close() + sys.exit(1) + + thread = threading.Thread(target=start_servers, args=()) + thread.daemon = True + thread.start() + thread.join() diff --git a/roles/node_backup/handlers/main.yml b/roles/node_backup/handlers/main.yml new file mode 100644 index 0000000..129b520 --- /dev/null +++ b/roles/node_backup/handlers/main.yml @@ -0,0 +1,15 @@ +--- + +- name: restart node-backup exporter + ansible.builtin.systemd: + name: "node-backup-exporter" + state: restarted + enabled: true + daemon_reload: true + +- name: restart node-backup timer + ansible.builtin.systemd: + name: "node-backup.timer" + state: restarted + enabled: true + daemon_reload: true \ No newline at end of file diff --git a/roles/node_backup/molecule/default/README.md b/roles/node_backup/molecule/default/README.md new file mode 100644 index 0000000..cde3444 --- /dev/null +++ b/roles/node_backup/molecule/default/README.md @@ -0,0 +1,25 @@ +### Collection + +Molecule should install the collection automatically. If it did not happen, run: +```commandline +mkdir molecule/default/collections +ansible-galaxy collection install -f -r molecule/default/collections.yml -p ./molecule/default/collections +``` + +### Molecule +#### Docker +Test the role with the Docker driver +```shell +molecule create +molecule converge +molecule destroy +``` + +#### LXD +Test the role with the LXD driver +```shell +DRIVER=lxd molecule create +DRIVER=lxd molecule converge +DRIVER=lxd molecule destroy +``` + diff --git a/roles/node_backup/molecule/default/collections.yml b/roles/node_backup/molecule/default/collections.yml new file mode 100644 index 0000000..88bc7f3 --- /dev/null +++ b/roles/node_backup/molecule/default/collections.yml @@ -0,0 +1,4 @@ +collections: + - name: https://github.com/paritytech/ansible-galaxy.git + type: git + version: main \ No newline at end of file diff --git a/roles/node_backup/molecule/default/converge.yml b/roles/node_backup/molecule/default/converge.yml new file mode 100644 index 0000000..e860493 --- /dev/null +++ b/roles/node_backup/molecule/default/converge.yml @@ -0,0 +1,7 @@ +--- +- name: Converge + hosts: all + tasks: + - name: "Include node backup" + ansible.builtin.include_role: + name: "node_backup" diff --git a/roles/node_backup/molecule/default/group_vars/all.yml b/roles/node_backup/molecule/default/group_vars/all.yml new file mode 100644 index 0000000..578dd8b --- /dev/null +++ 
b/roles/node_backup/molecule/default/group_vars/all.yml @@ -0,0 +1,35 @@ +## Molecule +ansible_user: root + +## prepare.yml +#node_legacy_rpc_flags: false +node_binary: "https://github.com/paritytech/polkadot/releases/download/v0.9.42/polkadot" +node_chain: "rococo-local" +node_data_root_path: "/opt/{{node_app_name}}" +node_chain_backup_restoring_type: "none" +node_pruning: 256 +# node_binary_deployment: false + +# node_backup +_gcp_bucket: test-blockstore-backups +node_backup_user: "parity" +node_backup_r2_access_key_id: "abc" +node_backup_r2_secret_access_key: "cba" +node_backup_r2_api_url: "https://a.b" +node_backup_targets: + - service_name: rococo-alice-rocksdb-prune + local_path: /opt/rococo-alice-rocksdb-prune/chains/rococo_local_testnet/db + rpc_port: 9933 + bucket_name: "{{ _gcp_bucket }}" + type: "gcp-native" + - service_name: rococo-bob-paritydb-prune + local_path: /opt/rococo-bob-paritydb-prune/chains/rococo_local_testnet/paritydb + rpc_port: 9934 + bucket_name: "{{ _gcp_bucket }}" + type: "gcp-rclone" + - service_name: rococo-bob-paritydb-prune + local_path: /opt/rococo-bob-paritydb-prune/chains/rococo_local_testnet/paritydb + rpc_port: 9934 + bucket_name: "{{ _gcp_bucket }}" + type: "r2-rclone" + bucket_domain: "c.d" \ No newline at end of file diff --git a/roles/node_backup/molecule/default/molecule.yml b/roles/node_backup/molecule/default/molecule.yml new file mode 100644 index 0000000..4e44ecf --- /dev/null +++ b/roles/node_backup/molecule/default/molecule.yml @@ -0,0 +1,31 @@ +--- +dependency: + name: galaxy +driver: + name: ${DRIVER:-docker} +platforms: + - name: molecule-instance-node-backup + # LXD + source: + alias: debian/bullseye/amd64 + # DOCKER + image: "paritytech/debian11:latest" + command: ${MOLECULE_DOCKER_COMMAND:-""} + privileged: true + pre_build_image: true + +provisioner: + name: ansible + options: + diff: True + config_options: + defaults: + callbacks_enabled: timer +verifier: + name: ansible + options: + diff: True +lint: | + set -e + yamllint . 
+ ansible-lint diff --git a/roles/node_backup/molecule/default/prepare.yml b/roles/node_backup/molecule/default/prepare.yml new file mode 100644 index 0000000..15e45c7 --- /dev/null +++ b/roles/node_backup/molecule/default/prepare.yml @@ -0,0 +1,43 @@ +- name: Prepare + hosts: all + gather_facts: false + pre_tasks: + - name: Install Python3 + ansible.builtin.raw: apt -y update && apt install -y python3 + changed_when: false + - name: Prepare | create user parity + ansible.builtin.user: + name: parity + tasks: + - name: "rococo-alice local" + ansible.builtin.include_role: + name: parity.chain.node + vars: + node_rpc_port: 9933 + node_app_name: "rococo-alice-rocksdb-prune" + node_custom_options: + - "--alice" + - name: "rococo-bob local" + ansible.builtin.include_role: + name: parity.chain.node + vars: + node_rpc_port: 9934 + node_paritydb_enable: true + node_app_name: "rococo-bob-paritydb-prune" + node_custom_options: + - "--bob" + - name: Pretend we are in gcp | Install cron, gnupg + ansible.builtin.package: + name: + - cron + - gnupg + state: present + update_cache: true + - name: Pretend we are in gcp | Add an Apt signing key + ansible.builtin.apt_key: + url: https://packages.cloud.google.com/apt/doc/apt-key.gpg + state: present + - name: Pretend we are in gcp | Add apt repository into sources list + ansible.builtin.apt_repository: + repo: deb https://packages.cloud.google.com/apt cloud-sdk main + state: present diff --git a/roles/node_backup/molecule/default/verify.yml b/roles/node_backup/molecule/default/verify.yml new file mode 100644 index 0000000..8161fd5 --- /dev/null +++ b/roles/node_backup/molecule/default/verify.yml @@ -0,0 +1,47 @@ +--- +- name: Verify + hosts: all + gather_facts: false + tasks: + - name: wait until ~10 blocks are created + ansible.builtin.uri: + url: "http://127.0.0.1:9933" + method: "POST" + body_format: "json" + body: + id: 1 + jsonrpc: "2.0" + method: "chain_getHeader" + params: [] + return_content: true + register: _node_backup_register_header + until: _node_backup_register_header.json.result.number | int(base=16) > 10 + retries: 10 + delay: 10 + + - name: Print current block + ansible.builtin.debug: + var: _node_backup_register_header.json.result.number | int(base=16) +# # todo add tests +# +## a) upload to gcp +# A GCP storage emulator is not available yet (https://github.com/googleapis/google-cloud-python/issues/10300), +# there are third-party emulators, but their gsutil support is broken (https://github.com/oittaa/gcp-storage-emulator/issues/186) +# when an emulator becomes available: +# 1. run and configure the emulator +# 2. run the script: +# - name: run backup script +# ansible.builtin.command: /home/parity/bin/node_backup.sh +# 3. +# - name: "rococo-bob local" +# ansible.builtin.include_role: +# name: parity.chain.node +# vars: +# node_rpc_port: 9935 +# node_paritydb_enable: true +# node_app_name: "rococo-local-rpc" +# +## b) Test backup-exporter: +# We can push fake data to the backup exporter (e.g., from a small script). +# Then we can query the Prometheus endpoint and match the results. +# This will allow checking the code of the exporter (see the sketch below). 
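A minimal sketch of the fake-data test idea described in the comments above (it assumes the exporter from `roles/node_backup/files/exporter.py` is running locally on its default ports; all payload values are illustrative):

```python
#!/usr/bin/env python3
"""Push a fake backup record to the node-backup exporter and read it back."""
import json
import urllib.request

# Fake record shaped like the payload sent by single-backup.sh.j2.
payload = {
    "serviceName": "rococo-alice-rocksdb-prune",
    "backupName": "20231006-010000",
    "timeStamp": "1696550400",
    "size": "123456789",
    "totalSize": "987654321",
    "lastBlock": "42",
    "version": "0.9.42",
    "storage": "gcp",
    "bucketName": "test-blockstore-backups",
    "bucketDomain": "",
}

req = urllib.request.Request(
    "http://127.0.0.1:60101",  # the exporter's data reception server
    data=json.dumps(payload).encode("utf8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    assert json.loads(resp.read())["status"] == "OK"

# The fake record should now be visible on the Prometheus endpoint.
metrics = urllib.request.urlopen("http://127.0.0.1:9109/metrics").read().decode()
assert "node_backup_last_block" in metrics
print("\n".join(l for l in metrics.splitlines() if l.startswith("node_backup_")))
```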
diff --git a/roles/node_backup/tasks/exporter.yml b/roles/node_backup/tasks/exporter.yml new file mode 100644 index 0000000..8e1e659 --- /dev/null +++ b/roles/node_backup/tasks/exporter.yml @@ -0,0 +1,37 @@ +--- + +- name: node-backup | exporter | remove the cache file + ansible.builtin.file: + path: "{{ _node_backup_exporter_cache_file }}" + state: absent + notify: restart node-backup exporter + when: node_backup_wipe_cache_enable | bool + +- name: node-backup | exporter | copy exporter file + ansible.builtin.copy: + src: "exporter.py" + dest: "{{ _node_backup_exporter_file }}" + mode: 0755 + owner: "{{ node_backup_user }}" + group: "{{ node_backup_user }}" + notify: restart node-backup exporter + +- name: node-backup | exporter | copy exporter systemd unit file + ansible.builtin.template: + src: "node-backup-exporter.service.j2" + dest: "/etc/systemd/system/node-backup-exporter.service" + owner: "root" + group: "root" + mode: "0644" + notify: restart node-backup exporter + + # to avoid 2 restarts during the first deploy +- name: node-backup | exporter | flush handlers + ansible.builtin.meta: flush_handlers + +- name: node-backup | exporter | start exporter service + ansible.builtin.systemd: + name: "node-backup-exporter" + state: started + enabled: true + daemon_reload: true diff --git a/roles/node_backup/tasks/job.yml b/roles/node_backup/tasks/job.yml new file mode 100644 index 0000000..f0379e4 --- /dev/null +++ b/roles/node_backup/tasks/job.yml @@ -0,0 +1,51 @@ +--- + +- name: node-backup | job | set _node_backup_targets variable 1 + ansible.builtin.set_fact: + _node_backup_targets: [] + +- name: node-backup | job | set _node_backup_targets variable 2 + ansible.builtin.set_fact: + _node_backup_targets: "{{ _node_backup_targets + + [ item | combine({'id': _node_backup_id}, recursive=True) ] }}" + vars: + _node_backup_id: "{{ (_node_backup_storages[item.type] + '-' + item.bucket_name + '-' + item.service_name) | regex_replace('[^0-9a-zA-Z]+', '-') }}" + loop: "{{ node_backup_targets }}" + +- name: node-backup | job | copy single backup scripts + ansible.builtin.template: + src: "single-backup.sh.j2" + dest: "{{ _node_backup_scripts_path }}/{{ item.id }}.sh" + mode: 0755 + owner: "root" + group: "root" + loop: "{{ _node_backup_targets }}" + tags: ['node-backup-test'] + +- name: node-backup | job | copy common backup script + ansible.builtin.template: + src: "common-backup.sh.j2" + dest: "{{ _node_backup_scripts_path }}/common.sh" + mode: 0755 + owner: "root" + group: "root" + tags: ['node-backup-test'] + +- name: node-backup | job | copy backup systemd unit files + ansible.builtin.template: + src: "{{ item }}.j2" + dest: "/etc/systemd/system/{{ item }}" + owner: "root" + group: "root" + mode: "0644" + loop: + - "node-backup.service" + - "node-backup.timer" + notify: restart node-backup timer + +- name: node-backup | job | enable timer + ansible.builtin.systemd: + name: "node-backup.timer" + state: started + enabled: true + daemon_reload: true diff --git a/roles/node_backup/tasks/main.yml b/roles/node_backup/tasks/main.yml new file mode 100644 index 0000000..38a617d --- /dev/null +++ b/roles/node_backup/tasks/main.yml @@ -0,0 +1,49 @@ +--- + +- name: node-backup | tests + ansible.builtin.include_tasks: + file: tests.yml + apply: + tags: ['node-backup', 'node-backup-tests'] + tags: ['node-backup', 'node-backup-tests'] + +- name: node-backup | create directories + ansible.builtin.file: + path: "{{ item.path }}" + state: directory + mode: "0755" + owner: "{{ item.user }}" + group: "{{ 
item.user }}" + loop: + - path: "{{ node_backup_base_path }}" + user: root + - path: "{{ _node_backup_scripts_path }}" + user: root + - path: "{{ _node_backup_exporter_path }}" + user: "{{ node_backup_user }}" + - path: "{{ _node_backup_log_path }}" + user: root + - path: "{{ _node_backup_venv_path }}" + user: root + tags: [ 'node-backup' ] + +- name: node-backup | requirements + ansible.builtin.include_tasks: + file: requirements.yml + apply: + tags: [ 'node-backup', 'node-backup-requirements' ] + tags: [ 'node-backup', 'node-backup-requirements' ] + +- name: node-backup | job + ansible.builtin.include_tasks: + file: job.yml + apply: + tags: [ 'node-backup', 'node-backup-job' ] + tags: [ 'node-backup', 'node-backup-job' ] + +- name: node-backup | exporter + ansible.builtin.include_tasks: + file: exporter.yml + apply: + tags: [ 'node-backup', 'node-backup-exporter' ] + tags: [ 'node-backup', 'node-backup-exporter' ] diff --git a/roles/node_backup/tasks/requirements.yml b/roles/node_backup/tasks/requirements.yml new file mode 100644 index 0000000..9207623 --- /dev/null +++ b/roles/node_backup/tasks/requirements.yml @@ -0,0 +1,48 @@ +--- + +- name: node-backup | requirements | install packages + ansible.builtin.package: + name: "{{ packages }}" + state: present + update_cache: true + vars: + packages: + - "curl" + - "jq" + - "expect" + - "moreutils" + - "python3-venv" + - "python3-setuptools" + + +- name: node-backup | requirements | install Python modules + ansible.builtin.pip: + name: + - "prometheus-client==0.17.0" + virtualenv: "{{ _node_backup_venv_path }}" + virtualenv_command: "python3 -m venv" + notify: restart node-backup exporter + +- name: node-backup | requirements | configure rclone + block: + + - name: node-backup | requirements | install rclone + ansible.builtin.apt: + deb: "{{ _node_backup_rclone_deb }}" + + - name: node backup | requirements | create rclone config directory + ansible.builtin.file: + path: "/root/.config/rclone" + state: directory + mode: 0700 + owner: "root" + group: "root" + + - name: node-backup | requirements | copy R2 config + ansible.builtin.template: + src: "rclone/rclone.conf.j2" + dest: "/root/.config/rclone/rclone.conf" + owner: "root" + group: "root" + mode: 0600 + when: node_backup_targets | json_query('[].type') | intersect(_node_backup_rclone_types) | length > 0 diff --git a/roles/node_backup/tasks/tests.yml b/roles/node_backup/tasks/tests.yml new file mode 100644 index 0000000..167d119 --- /dev/null +++ b/roles/node_backup/tasks/tests.yml @@ -0,0 +1,31 @@ +--- + +- name: node-backup | test | check R2 configuration + ansible.builtin.fail: + msg: "If the R2 backups are used, 'node_backup_r2_access_key_id', 'node_backup_r2_secret_access_key' and 'node_backup_r2_api_url' variables have to be specified" + when: node_backup_targets | json_query('[].type') | intersect(_node_backup_r2_types) | length > 0 and + ( node_backup_r2_access_key_id == '' or + node_backup_r2_secret_access_key == '' or + node_backup_r2_api_url == '' + ) + +- name: node-backup | test | check variables + ansible.builtin.fail: + msg: "'service_name', 'rpc_port', 'type' and 'bucket_name' fields have to be specified for each item in 'node_backup_targets'" + when: item.service_name == '' or + item.rpc_port == '' or + item.type == '' or + item.bucket_name == '' + loop: "{{ node_backup_targets }}" + +- name: node-backup | test | check R2 backups + ansible.builtin.fail: + msg: "the 'bucket_domain' field has to be specified for R2 backups" + when: item.type in _node_backup_r2_types and 
item.bucket_domain == '' + loop: "{{ node_backup_targets }}" + +- name: node-backup | test | check backup types + ansible.builtin.fail: + msg: "{{ item.type }} is not a valid backup type" + when: item.type not in (_node_backup_gcp_types + _node_backup_r2_types) + loop: "{{ node_backup_targets }}" diff --git a/roles/node_backup/templates/common-backup.sh.j2 b/roles/node_backup/templates/common-backup.sh.j2 new file mode 100644 index 0000000..c0eafb3 --- /dev/null +++ b/roles/node_backup/templates/common-backup.sh.j2 @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +{% for target in _node_backup_targets %} +now=$(date +"%Y%m%d-%H%M%S") +unbuffer bash {{ _node_backup_scripts_path }}/{{ target.id }}.sh "${now}" 2>&1 | tee "{{ _node_backup_log_path }}/{{ target.service_name }}-${now}.txt" +{% endfor %} \ No newline at end of file diff --git a/roles/node_backup/templates/node-backup-exporter.service.j2 b/roles/node_backup/templates/node-backup-exporter.service.j2 new file mode 100644 index 0000000..81df6d3 --- /dev/null +++ b/roles/node_backup/templates/node-backup-exporter.service.j2 @@ -0,0 +1,12 @@ +[Unit] +Description=Node backup exporter systemd service + +[Service] +Environment=PYTHONUNBUFFERED=True +ExecStart={{ _node_backup_venv_path }}/bin/python3 {{ _node_backup_exporter_file }} +Restart=always +User={{ node_backup_user }} +Group={{ node_backup_user }} + +[Install] +WantedBy=multi-user.target diff --git a/roles/node_backup/templates/node-backup.service.j2 b/roles/node_backup/templates/node-backup.service.j2 new file mode 100644 index 0000000..8c06940 --- /dev/null +++ b/roles/node_backup/templates/node-backup.service.j2 @@ -0,0 +1,6 @@ +[Unit] +Description=Node backup systemd service + +[Service] +Type=oneshot +ExecStart={{ _node_backup_scripts_path }}/common.sh diff --git a/roles/node_backup/templates/node-backup.timer.j2 b/roles/node_backup/templates/node-backup.timer.j2 new file mode 100644 index 0000000..c1b5167 --- /dev/null +++ b/roles/node_backup/templates/node-backup.timer.j2 @@ -0,0 +1,11 @@ +[Unit] +Description=Node backup systemd timer + +[Timer] +{% for time in node_backup_schedule %} +OnCalendar={{ time }} +{% endfor %} +Persistent=true + +[Install] +WantedBy=timers.target diff --git a/roles/node_backup/templates/rclone/rclone.conf.j2 b/roles/node_backup/templates/rclone/rclone.conf.j2 new file mode 100644 index 0000000..9d704f9 --- /dev/null +++ b/roles/node_backup/templates/rclone/rclone.conf.j2 @@ -0,0 +1,19 @@ +{% if node_backup_targets | json_query('[].type') | intersect(_node_backup_r2_types) | length > 0 %} +[R2backups] +type = s3 +provider = Cloudflare +access_key_id = {{ node_backup_r2_access_key_id }} +secret_access_key = {{ node_backup_r2_secret_access_key }} +endpoint = {{ node_backup_r2_api_url }} +acl = private +upload_cutoff = 1024M +upload_concurrency = {{ node_backup_max_concurrent_requests }} +chunk_size = 256M +{% endif %} + +{% if node_backup_targets | json_query('[].type') | intersect(_node_backup_gcp_types) | length > 0 %} +[GCPbackups] +type = google cloud storage +bucket_policy_only = true +{% endif %} + diff --git a/roles/node_backup/templates/single-backup.sh.j2 b/roles/node_backup/templates/single-backup.sh.j2 new file mode 100644 index 0000000..f5a78a7 --- /dev/null +++ b/roles/node_backup/templates/single-backup.sh.j2 @@ -0,0 +1,218 @@ +#!/usr/bin/env bash + +# We mustn't remove this setting: any failed command could leave the backup inconsistent. 
+set -eu -o pipefail + +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Backup $0 Started!\n---\n" + +tmp_meta_file="{{ node_backup_tmp_path }}/{{ item.service_name }}.meta.txt" +tmp_latest_version_file="{{ node_backup_tmp_path }}/{{ item.service_name }}_latest_version.meta.txt" + +set -x +systemctl start {{ item.service_name }} +set +x + +counter=1 +curl_result="" + +until echo ${curl_result} | grep 'false' +do + if [ $counter -gt 20 ];then + echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) the health check failed for the '{{ item.service_name }}' service. The backup will be skipped!\n---\n" + false + fi + echo -e "Run health-check ${counter}..." + set -x + curl_result=$(curl --retry 3 --retry-delay 60 --retry-connrefused -s -X POST -H "Content-Type: application/json" \ + -d '{"id":1, "jsonrpc":"2.0", "method": "system_health", "params":[]}' \ + http://127.0.0.1:{{ item.rpc_port }} | jq '.["result"]["isSyncing"]') + set +x + if [ $counter -gt 1 ];then + sleep 60 + fi + let "counter+=1" +done + +set -x +last_block=$(curl --retry 3 --retry-connrefused --retry-delay 60 -X POST -H "Content-Type: application/json" \ + -d '{"id":1, "jsonrpc":"2.0", "method": "system_syncState", "params":[]}' \ + http://127.0.0.1:{{ item.rpc_port }} \ + | jq '.["result"]["currentBlock"]') + +version=$(curl --retry 3 --retry-connrefused --retry-delay 60 -X POST -H "Content-Type: application/json" \ + -d '{"id":1, "jsonrpc":"2.0", "method": "system_version", "params":[]}' \ + http://127.0.0.1:{{ item.rpc_port }} \ + | jq '.["result"]') +set +x +version=${version%\"} +version=${version#\"} +time_stamp=$(date +"%s") + +SECONDS=0 + +# The database would be modified during the backup, which could corrupt it. So we'll +# need to stop the unit and start it again after the backup. +set -x +systemctl stop {{ item.service_name }} +set +x + +# Get the list of local files +local_files=/tmp/local-files-{{ item.service_name }}-${1} +remote_files=/tmp/remote-files-{{ item.service_name }}-${1} +find {{ item.local_path }} -mindepth 1 -type f | sed "s|{{ item.local_path }}||g" | sed 's/^\/*//' | sort > ${local_files} + +{% if item.type == 'gcp-native' %} +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Start the '{{ item.id }}' backup\n---\n" +set -x +gcloud storage \ + cp -r {{ item.local_path }} gs://{{ item.bucket_name }}/{{ item.service_name }}/${1} + +# Get the list of files in the bucket +gcloud storage ls -r gs://{{ item.bucket_name }}/{{ item.service_name }}/${1} | grep -vF '/:' \ + | sed "s|gs://{{ item.bucket_name }}/{{ item.service_name }}/${1}/||g" | grep . | sort > ${remote_files} +set +x + +# Check if remote version matches the local one +if ! diff ${remote_files} ${local_files} -q; then + echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) The contents of the remote bucket do not match the local copy for the '{{ item.id }}' backup. 
Cleaning the remote backup...\n---\n" + set -x + gcloud storage rm -r gs://{{ item.bucket_name }}/{{ item.service_name }}/${1} + set +x + echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Show diff and exit!\n---\n" + set -x + diff ${remote_files} ${local_files} + rm -f ${remote_files} ${local_files} + set +x + exit 1 +fi + +set -x +gcloud storage \ + cp ${remote_files} gs://{{ item.bucket_name }}/{{ item.service_name }}/${1}/files.txt +rm -f ${remote_files} +size=$(gsutil \ + du -s gs://{{ item.bucket_name }}/{{ item.service_name }}/${1} | awk '{ print $1 }' ) + +echo -e "size: ${size}\nlastBlock: ${last_block}\nversion: ${version}" > ${tmp_meta_file} +gcloud storage \ + cp ${tmp_meta_file} gs://{{ item.bucket_name }}/{{ item.service_name }}/${1}.meta.txt +rm -f ${tmp_meta_file} + +echo "${1}" > ${tmp_latest_version_file} +gcloud storage \ + cp ${tmp_latest_version_file} gs://{{ item.bucket_name }}/{{ item.service_name }}/latest_version.meta.txt +rm -f ${tmp_latest_version_file} +set +x +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed the '{{ item.id }}' backup in ${SECONDS} seconds\n---\n" + +{% if item.tar | default(true) %} +SECONDS=0 +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Start the '{{ item.id }}' TAR backup\n---\n" +set -x +tar -cf - {{ item.local_path }} | gcloud storage \ + cp - gs://{{ item.bucket_name }}/tar/{{ item.service_name }}/${1}.tar +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed the '{{ item.id }}' TAR backup in ${SECONDS} seconds\n---\n" +set +x +{% endif %} + +set -x +total_size=$(gsutil \ + du -s gs://{{ item.bucket_name }} | awk '{ print $1 }' ) +set +x +{% endif %} + + +{% if item.type in _node_backup_rclone_types %} +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Start the '{{ item.id }}' backup\n---\n" + +{% if item.type == 'gcp-rclone' %} +remote="GCPbackups" +{% elif item.type == 'r2-rclone' %} +remote="R2backups" +{% else %} +{{ "backup type must be defined."/0 }} +{% endif %} + +set -x +LATEST_BACKUP=$(rclone cat ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/latest_version.meta.txt) +if [ -n "$LATEST_BACKUP" ]; then + rclone copy -v --transfers={{ node_backup_max_concurrent_requests }} \ + --contimeout=10m --retries 10 --retries-sleep 60 --error-on-no-transfer --fast-list --checksum \ + ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${LATEST_BACKUP} \ + ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1} + echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed copying of the latest backup for the '{{ item.id }}' backup in ${SECONDS} seconds\n---\n" + SECONDS=0 +fi +rclone sync -v --transfers={{ node_backup_max_concurrent_requests }} \ + --contimeout=10m --retries 10 --retries-sleep 60 --error-on-no-transfer \ + --update --fast-list --delete-during --disable-http2 --no-gzip-encoding \ + {{ item.local_path }} ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1} + +# Get the list of files in the bucket +rclone lsf -R --fast-list --files-only \ + ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1} | sort > ${remote_files} +set +x + +# Check if remote version matches the local one +if ! diff ${remote_files} ${local_files} -q; then + echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) The contents of the remote bucket do not match the local copy for the '{{ item.id }}' backup. 
Cleaning the remote backup...\n---\n" + set -x + rclone purge -v --contimeout=10m --retries 10 --retries-sleep 60 --fast-list \ + ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1} + set +x + echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Show diff and exit!\n---\n" + set -x + diff ${remote_files} ${local_files} + rm -f ${remote_files} ${local_files} + set +x + exit 1 +fi + +set -x +rclone copyto -v \ + ${remote_files} ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1}/files.txt +rm -f ${remote_files} + +size=$(rclone size --json ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1} | jq '.bytes') + +echo -e "size: ${size}\nlastBlock: ${last_block}\nversion: ${version}" > ${tmp_meta_file} +rclone copyto -v \ + ${tmp_meta_file} ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/${1}.meta.txt +rm -f ${tmp_meta_file} + +echo "${1}" > ${tmp_latest_version_file} +rclone copyto -v \ + ${tmp_latest_version_file} ${remote}:{{ item.bucket_name }}/{{ item.service_name }}/latest_version.meta.txt +rm -f ${tmp_latest_version_file} +set +x +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed the '{{ item.id }}' backup in ${SECONDS} seconds\n---\n" + +{% if item.tar | default(true) %} +SECONDS=0 +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Start the '{{ item.id }}' TAR backup\n---\n" +set -x +tar -cf - {{ item.local_path }} | rclone rcat -v --contimeout=10m --retries 10 --retries-sleep 60 --error-on-no-transfer \ + --transfers=1 --disable-http2 \ + ${remote}:{{ item.bucket_name }}/tar/{{ item.service_name }}/${1}.tar +set +x +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Completed the '{{ item.id }}' TAR backup in ${SECONDS} seconds\n---\n" +{% endif %} + +set -x +total_size=$(rclone size --json ${remote}:{{ item.bucket_name }} | jq '.bytes') +set +x +{% endif %} + +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Notify the backup exporter about the latest successful backup\n---\n" +set -x +curl --retry 3 --retry-connrefused --retry-delay 60 -X POST -H "Content-Type: application/json" -d \ + '{"serviceName":"{{ item.service_name }}", "backupName": "'$1'", "timeStamp": "'$time_stamp'", + "size": "'$size'", "totalSize": "'$total_size'", "lastBlock": "'$last_block'", "version": "'$version'", + "storage": "{{ _node_backup_storages[item.type] }}", "bucketName": "{{ item.bucket_name }}", "bucketDomain": "{{ item.bucket_domain | default("") }}"}' \ + http://127.0.0.1:60101 + +rm -f ${local_files} +systemctl start {{ item.service_name }} +set +x + +echo -e "\n---\n$(date +%Y-%m-%d\ %H:%M:%S) Backup $0 Finished!\n---\n" diff --git a/roles/node_backup/vars/main.yml b/roles/node_backup/vars/main.yml new file mode 100644 index 0000000..6c7a881 --- /dev/null +++ b/roles/node_backup/vars/main.yml @@ -0,0 +1,17 @@ +--- + +_node_backup_scripts_path: "{{ node_backup_base_path }}/scripts" +_node_backup_log_path: "{{ node_backup_base_path }}/logs" +_node_backup_venv_path: "{{ node_backup_base_path }}/venv" +_node_backup_exporter_path: "{{ node_backup_base_path }}/exporter" +_node_backup_exporter_file: "{{ _node_backup_exporter_path }}/exporter.py" +_node_backup_exporter_cache_file: "{{ _node_backup_exporter_path }}/exporter.cache" +_node_backup_rclone_deb: "https://downloads.rclone.org/v1.63.1/rclone-v1.63.1-linux-amd64.deb" + +_node_backup_r2_types: ["r2-rclone"] +_node_backup_gcp_types: ["gcp-native", "gcp-rclone"] +_node_backup_rclone_types: ["gcp-rclone", "r2-rclone"] +_node_backup_storages: + r2-rclone: r2 + gcp-rclone: gcp + gcp-native: gcp \ No newline at end of file diff --git 
a/roles/state_exporter/defaults/main.yml b/roles/state_exporter/defaults/main.yml new file mode 100644 index 0000000..ba4a080 --- /dev/null +++ b/roles/state_exporter/defaults/main.yml @@ -0,0 +1,6 @@ +--- + +state_exporter_name: "state-exporter" +state_exporter_user: "parity" +state_exporter_file: "/home/{{ state_exporter_user }}/bin/{{ state_exporter_name }}.py" +state_exporter_debug: false diff --git a/roles/state_exporter/files/exporter.py b/roles/state_exporter/files/exporter.py new file mode 100644 index 0000000..22bc130 --- /dev/null +++ b/roles/state_exporter/files/exporter.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import schedule +import time +import sys +import os +import logging +import traceback +from prometheus_client import start_http_server, Gauge +import psutil + +LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + +node_chain_folders = { + 'polkadot': 'polkadot', + 'kusama': 'ksmcc3', + 'westend': 'westend2', + 'rococo': 'rococo_v1_12' +} + +process_metrics = { + 'polkadot_state_process_cmdline': Gauge( + 'polkadot_state_process_cmdline', + 'cmdline of a node process', + ['name', 'pid', 'cmd_line']), + 'polkadot_state_process_threads': Gauge( + 'polkadot_state_process_threads', + 'number of threads of a node process', + ['name', 'pid']), + 'polkadot_state_process_memory': Gauge( + 'polkadot_state_process_memory', + 'memory used by a node process', + ['name', 'pid']), + 'polkadot_state_process_cpu_percent': Gauge( + 'polkadot_state_process_cpu_percent', + 'CPU usage of a node process (percent)', + ['name', 'pid']) +} + +node_metrics = { + 'polkadot_state_node_session_key': Gauge( + 'polkadot_state_node_session_key', + 'session key of a node', + ['name', 'pid', 'session_key']) +} + +PORT = 9110 + + +def update_metrics(): + processes = {} + + for proc in psutil.process_iter(): + try: + process_cmdline = proc.cmdline() + if not (len(process_cmdline) > 1 and '--name' in process_cmdline and '--chain' in process_cmdline): + continue + process_chain = process_cmdline[::-1][process_cmdline[::-1].index('--chain') - 1] + process_name = process_cmdline[::-1][process_cmdline[::-1].index('--name') - 1] + process_pid = proc.pid + process_base_path = process_cmdline[::-1][process_cmdline[::-1].index('--base-path') - 1]\ + if '--base-path' in process_cmdline else None + # It will delete the previous process if + # it's the parent of the current process (it can be docker, bash, etc.) 
+ if process_name in processes and processes[process_name]['pid'] < process_pid: + del processes[process_name] + processes[process_name] = {'pid': process_pid, + 'chain': process_chain, + 'cmd_line': ' '.join(process_cmdline[1:]), + 'threads': proc.num_threads(), + 'memory': proc.memory_info().rss, + 'cpu_percent': proc.cpu_percent(), + 'base_path': process_base_path + } + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + pass + except Exception as e: + logger.error(e) + logger.error(''.join(traceback.format_tb(e.__traceback__))) + return + logger.debug('processes were found: ' + str(processes)) + + try: + # wipe metrics + for metric in {**process_metrics, **node_metrics}.items(): + for sample in metric[1].collect()[0].samples: + metric[1].remove(*list(sample.labels.values())) + + for proc in processes: + process_metrics['polkadot_state_process_cmdline'].labels( + name=proc, + pid=processes[proc]['pid'], + cmd_line=processes[proc]['cmd_line']).set(1) + process_metrics['polkadot_state_process_threads'].labels( + name=proc, + pid=processes[proc]['pid']).set(processes[proc]['threads']) + process_metrics['polkadot_state_process_memory'].labels( + name=proc, + pid=processes[proc]['pid']).set(processes[proc]['memory']) + process_metrics['polkadot_state_process_cpu_percent'].labels( + name=proc, + pid=processes[proc]['pid']).set(processes[proc]['cpu_percent']) + if processes[proc]['base_path']: + keystore_path = os.path.join( + processes[proc]['base_path'], + 'chains', + node_chain_folders[processes[proc]['chain']], + 'keystore') + node_session_key = parse_session_key(keystore_path) + if node_session_key: + node_metrics['polkadot_state_node_session_key'].labels( + name=proc, + pid=processes[proc]['pid'], + session_key=node_session_key).set(1) + except Exception as e: + logger.error(e) + logger.error(''.join(traceback.format_tb(e.__traceback__))) + return + + +def parse_session_key(dir): + # variants of key prefixes in the right order + key_formats = ( + ['6772616e', '62616265', '696d6f6e', '70617261', '61756469'], + ['6772616e', '62616265', '696d6f6e', '70617261', '6173676e', '61756469']) + possible_prefixes = list(set([j for i in key_formats for j in i])) + + if os.path.isdir(dir): + os.chdir(dir) + files = os.listdir('.') + files = [i for i in files if len(i) == 72 and i[0:8] in possible_prefixes] + if not files: + return None + # find the creation time of the newest key + time_of_last_key = sorted(list(set([int(os.path.getmtime(i)) for i in files])))[-1] + # parse the newest public keys and their prefixes from the file names. 
+ # creation time can have 1 second drift in theory + keys = {i[0:8]: i[8:] for i in files if int(os.path.getmtime(i)) in [time_of_last_key - 1, time_of_last_key, time_of_last_key + 1]} + logger.debug('keys were found: ' + str(keys) + ' in the keystore path: ' + dir) + for key_format in key_formats: + if set(keys.keys()) == set(key_format): + # build the session key + session_key = '0x' + ''.join([keys[i] for i in key_format]) + logger.debug('the session key was parsed: ' + session_key + ' in the keystore path: ' + dir) + return session_key + logger.error('failed to parse the session key') + return None + + +if __name__ == '__main__': + global logger + logger = logging.getLogger('state_exporter') + + # console handler + ch = logging.StreamHandler() + if len(sys.argv) > 1 and sys.argv[1] == 'debug': + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + formatter = logging.Formatter(LOGGING_FORMAT) + ch.setFormatter(formatter) + logger.addHandler(ch) + + # Start up the server to expose the metrics + start_http_server(PORT) # Metrics server + schedule.every(10).seconds.do(update_metrics) + while True: + schedule.run_pending() + time.sleep(1) diff --git a/roles/state_exporter/handlers/main.yml b/roles/state_exporter/handlers/main.yml new file mode 100644 index 0000000..199da3e --- /dev/null +++ b/roles/state_exporter/handlers/main.yml @@ -0,0 +1,8 @@ +--- + +- name: restart state-exporter + ansible.builtin.systemd: + name: "{{ state_exporter_name }}" + state: restarted + enabled: true + daemon_reload: true diff --git a/roles/state_exporter/tasks/main.yml b/roles/state_exporter/tasks/main.yml new file mode 100644 index 0000000..968a8a1 --- /dev/null +++ b/roles/state_exporter/tasks/main.yml @@ -0,0 +1,53 @@ +--- + +- block: + + - name: Exporter | Install apt packages + ansible.builtin.package: + name: "{{ packages }}" + state: present + update_cache: true + vars: + packages: + - "python3-prometheus-client" + - "python3-schedule" + - "python3-psutil" + + - name: Exporter | Create directory + ansible.builtin.file: + path: "{{ state_exporter_file | dirname }}" + state: directory + mode: 0755 + owner: "{{ state_exporter_user }}" + group: "{{ state_exporter_user }}" + + - name: Exporter | Copy exporter + ansible.builtin.copy: + src: "exporter.py" + dest: "{{ state_exporter_file }}" + mode: 0755 + owner: "{{ state_exporter_user }}" + group: "{{ state_exporter_user }}" + notify: restart state-exporter + + - name: Exporter | Copy exporter systemd unit file + ansible.builtin.template: + src: ".service.j2" + dest: "/etc/systemd/system/{{ state_exporter_name }}.service" + owner: "root" + group: "root" + mode: "0600" + notify: restart state-exporter + + # to avoid 2 restarts during the first deploy + - name: Exporter | Flush handlers + ansible.builtin.meta: flush_handlers + + - name: Exporter | Start exporter service + ansible.builtin.systemd: + name: "{{ state_exporter_name }}" + state: started + enabled: true + daemon_reload: true + + tags: ['state-exporter'] diff --git a/roles/state_exporter/templates/.service.j2 b/roles/state_exporter/templates/.service.j2 new file mode 100644 index 0000000..0e0fefc --- /dev/null +++ b/roles/state_exporter/templates/.service.j2 @@ -0,0 +1,13 @@ +[Unit] +Description=State exporter systemd service + +[Service] +Environment=PYTHONUNBUFFERED=True +ExecStart={{ state_exporter_file }}{% if state_exporter_debug %} debug{% endif %} + +Restart=always +User={{ state_exporter_user }} +Group={{ state_exporter_user }} + +[Install] +WantedBy=multi-user.target
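For reference, the key-type prefixes that `parse_session_key` matches are hex-encoded ASCII tags of Substrate session-key types, and the session key is the concatenation of the public keys in the listed order. A small sketch of the keystore naming scheme (the key values below are fabricated, not real keys):

```python
#!/usr/bin/env python3
"""Illustrate the keystore naming scheme used by parse_session_key."""
prefixes = ['6772616e', '62616265', '696d6f6e', '70617261', '6173676e', '61756469']

# Each prefix is hex-encoded ASCII: 'gran' (GRANDPA), 'babe', 'imon' (ImOnline),
# 'para', 'asgn', 'audi' (authority discovery).
for p in prefixes:
    print(p, '->', bytes.fromhex(p).decode('ascii'))

# A keystore file name is '<8 hex chars of key type><64 hex chars of public key>',
# 72 characters total, which is exactly the length filter in parse_session_key.
fake_keys = {p: ('%02x' % i) * 32 for i, p in enumerate(prefixes)}  # fabricated keys
session_key = '0x' + ''.join(fake_keys[p] for p in prefixes)
print(session_key)
```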