From 8bff34562b94bb8fc5c4c5c231694c3bf104cd0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 09:41:27 +0100 Subject: [PATCH 01/10] Improve monitoring comment --- temboardui/plugins/monitoring/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/temboardui/plugins/monitoring/__init__.py b/temboardui/plugins/monitoring/__init__.py index 0bd393ef0..b6b1585ed 100644 --- a/temboardui/plugins/monitoring/__init__.py +++ b/temboardui/plugins/monitoring/__init__.py @@ -10,6 +10,8 @@ # # Tasks: # +# - schedule_collector(): schedule a collector task for each agent in +# inventory. # - collector(host, port, key) inserts metrics history in metric_*_current # table. # - history_tables_worker() move data from metric_*_current to From 777638bb7e403a2251cca3d8131c02c6a3e446ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 11:45:10 +0100 Subject: [PATCH 02/10] Clean unused local variable declaration --- .../20211112_2d55ab971bde17_metrics_archive_deadlock.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/temboardui/model/alembic/versions/20211112_2d55ab971bde17_metrics_archive_deadlock.sql b/temboardui/model/alembic/versions/20211112_2d55ab971bde17_metrics_archive_deadlock.sql index ab72a8d66..1a71005d4 100644 --- a/temboardui/model/alembic/versions/20211112_2d55ab971bde17_metrics_archive_deadlock.sql +++ b/temboardui/model/alembic/versions/20211112_2d55ab971bde17_metrics_archive_deadlock.sql @@ -32,7 +32,6 @@ CREATE OR REPLACE FUNCTION aggregate_data_single(table_name TEXT, record_type TE LANGUAGE plpgsql AS $$ DECLARE - t JSON; v_agg_periods TEXT[] := array['30m', '6h']; v_agg_table TEXT; i_period TEXT; From 29ee48d7812c63f071e43f0cfb9abc7aac2e9bfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 09:41:42 +0100 Subject: [PATCH 03/10] perfui: Handle more logfmt metrics --- perfui/importlog.py | 56 ++++++++++++++++++++++++++++++++++++++------- 1 file 
changed, 48 insertions(+), 8 deletions(-) diff --git a/perfui/importlog.py b/perfui/importlog.py index 83623223c..1e57540f8 100755 --- a/perfui/importlog.py +++ b/perfui/importlog.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # # Documented in docs/howto-performances.md +# import logging import os @@ -15,9 +16,11 @@ import pytz -logger = logging.getLogger('logsender') +logger = logging.getLogger('importlog') LOCALTZ = pytz.timezone('UTC') -KNOWN_LABELS = ['pid', 'service', 'task'] +KNOWN_LABELS = [ + 'agent', 'pid', 'service', 'task', 'dbname', 'spcname', 'timestamp', +] KNOWN_METRICS = { 'fork': dict( type='counter', @@ -63,7 +66,18 @@ type='counter', unit='seconds', help='Total seconds spent by the process in userspace.', - ) + ), + # Agent metrics + 'xacts_n_commit': dict( + type='counter', + unit='nounit', + help='Total commit in database.', + ), + 'xacts_n_rollback': dict( + type='counter', + unit='nounit', + help='Total commit in database.', + ), } @@ -127,12 +141,13 @@ def main(logfile): log_count += len(loki_batch) loki_batch[:] = [] - if ' io_rchar=' in tail: + if ' io_rchar=' in tail or ' up=1 ' in tail: # métriques - _, message = tail.split(':', 2) + _, message = tail.split(':', 1) try: metrics = dict(parse_logfmt(message)) except Exception as e: + import pdb; pdb.set_trace() logger.warning("Failed to parse perf metrics: %s.", e) logger.warning("Malformed line: %s.", line) continue @@ -168,12 +183,12 @@ def main(logfile): "/import/data.txt", "/prometheus", ]) - from_ = 1000 * int(start.timestamp() - 60) - to = 1000 * int(end.timestamp() + 60) + from_ = 1000 * int(min(start.timestamp(), omw.start) - 60) + to = 1000 * int(max(end.timestamp(), omw.end) + 60) dashboard_url = ( "http://grafana.temboardperf.docker:3000" "/d/MkhXLKbnz/temboard-performance" - f"?orgId=1&from={from_}&to={to}&var-logfile={logfile}" + f"?orgId=1&from={from_}&to={to}&var-service=.%2B&var-logfile={logfile}" ) logger.info("View graph and messages at: %s.", dashboard_url) @@ -181,6 +196,7 
@@ def main(logfile): def send_log_batch_to_loki(lines, labels): # cf. https://grafana.com/docs/loki/latest/api/#post-lokiapiv1push logger.info("Sending %s lines to loki.", len(lines)) + return r = httpx.post( 'http://loki.temboardperf.docker:3100/loki/api/v1/push', json=dict(streams=[dict( @@ -239,12 +255,25 @@ def find_timezone(): class OpenMetricsWriter: + blacklist = ['up'] + def __init__(self, fo, known_metrics): self.fo = fo self.known_metrics = known_metrics + self.unknown = [] self.declared_metrics = set() + self.start = None + self.end = None def append(self, name, labels, value, epoch_s): + if name in self.blacklist: + return + if name in self.unknown: + return + if name not in self.known_metrics: + logger.debug("Unknown metric %s", name) + self.unknown.append(name) + return unit = self.known_metrics[name]['unit'] metric = name if unit != 'nounit': @@ -260,13 +289,24 @@ def append(self, name, labels, value, epoch_s): """)) self.declared_metrics.add(name) + if 'timestamp' in labels: + # Overwrite epoch from pseudo label timestamp + timestamp = labels.pop('timestamp') + date, time = timestamp.split('T') + timestamp = parse_datetime(date, time) + epoch_s = timestamp.timestamp() + self.start = min(self.start or epoch_s, epoch_s) + labels = ','.join('%s="%s"' % label for label in labels.items()) self.fo.write(dedent(f"""\ {metric}{{{labels}}} {value} {epoch_s} """)) + self.end = epoch_s def close(self): self.fo.write("# EOF\n") + for name in self.unknown: + logger.debug("Unknown metric %s.", name) logging.basicConfig( From d95b8cb70640f864e9378a7b81b6e788bce73fc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 11:44:31 +0100 Subject: [PATCH 04/10] perfui: New postgres dashboard Allows to present metrics from agent and collector in a Grafana. 
--- .../grafana/provisioning/dashboards/perf.yaml | 6 + .../rootfs/usr/share/temboard/postgres.json | 201 ++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 perfui/grafana/rootfs/usr/share/temboard/postgres.json diff --git a/perfui/grafana/rootfs/etc/grafana/provisioning/dashboards/perf.yaml b/perfui/grafana/rootfs/etc/grafana/provisioning/dashboards/perf.yaml index 067510dd3..b3a02d757 100644 --- a/perfui/grafana/rootfs/etc/grafana/provisioning/dashboards/perf.yaml +++ b/perfui/grafana/rootfs/etc/grafana/provisioning/dashboards/perf.yaml @@ -7,3 +7,9 @@ providers: allowUiUpdates: true options: path: /usr/share/temboard/perf.json +- name: "Postgres from temBoard agent" + type: file + folder: '' + allowUiUpdates: true + options: + path: /usr/share/temboard/postgres.json diff --git a/perfui/grafana/rootfs/usr/share/temboard/postgres.json b/perfui/grafana/rootfs/usr/share/temboard/postgres.json new file mode 100644 index 000000000..3bbabc85e --- /dev/null +++ b/perfui/grafana/rootfs/usr/share/temboard/postgres.json @@ -0,0 +1,201 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "iteration": 1643970369705, + "links": [], + "liveNow": false, + "panels": [ + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": 
"never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "01lWBBhnk" + }, + "exemplar": true, + "expr": "sum(rate(xacts_n_commit{logfile=\"$logfile\"}[5m]))", + "interval": "", + "legendFormat": "commit", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "01lWBBhnk" + }, + "exemplar": true, + "expr": "sum(rate(xacts_n_rollback{logfile=\"$logfile\"}[5m]))", + "hide": false, + "interval": "", + "legendFormat": "rollback", + "refId": "B" + } + ], + "title": "TPS", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "j1tzJKb7k" + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 2, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Ascending", + "wrapLogMessage": false + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "j1tzJKb7k" + }, + "expr": "{logfile=~\"$logfile\"}", + "refId": "A" + } + ], + "title": "Messages", + "type": "logs" + } + ], + "refresh": false, + "schemaVersion": 33, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": ".+", + "value": ".+" + }, + "definition": "label_values(logfile)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "logfile", + "options": [], + "query": { + "query": "label_values(logfile)", + 
"refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Postgres", + "uid": "gQ7FEi-7z", + "version": 3, + "weekStart": "" +} From 2f1e3831edb3bbc069eae4601ed6a2c61235cdb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 11:50:22 +0100 Subject: [PATCH 05/10] perfui: stamp import Allows to import logfile several times. --- perfui/importlog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perfui/importlog.py b/perfui/importlog.py index 1e57540f8..1fc7fc4e1 100755 --- a/perfui/importlog.py +++ b/perfui/importlog.py @@ -90,7 +90,7 @@ def main(logfile): labels = dict( job="logsender.py", - logfile=logfile, + logfile="%s_%s" % (datetime.now().strftime('%Y%m%dT%H%M%S'), logfile), ) logger.info("Analyzing %s.", logfile) From 88141abbf73f0ed3fd4cb7004e5072d6fc9b0163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 11:45:53 +0100 Subject: [PATCH 06/10] Export xacts metrics as logfmt in collector --- temboardui/plugins/monitoring/__init__.py | 10 +++++- .../plugins/monitoring/handlers/monitoring.py | 7 ++++- temboardui/plugins/monitoring/tools.py | 31 ++++++++++++++++++- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/temboardui/plugins/monitoring/__init__.py b/temboardui/plugins/monitoring/__init__.py index b6b1585ed..07e9b72ce 100644 --- a/temboardui/plugins/monitoring/__init__.py +++ b/temboardui/plugins/monitoring/__init__.py @@ -532,7 +532,15 @@ def collector(app, address, port, key): row['instances'][0]['available'] ) logger.info("Insert collected metrics for %s.", agent_id) - insert_metrics(worker_session, host.host_id, instance_id, data) + insert_metrics( + worker_session, host.host_id, instance_id, data, dict( + agent=agent_id, + timestamp=( + # transform to ISOFORMAT (same as 
journalctl) + row['datetime'].replace(' +', '+').replace(' ', 'T') + ), + ), + ) worker_session.commit() except DataError as e: diff --git a/temboardui/plugins/monitoring/handlers/monitoring.py b/temboardui/plugins/monitoring/handlers/monitoring.py index f95f9db41..a031d1bfb 100644 --- a/temboardui/plugins/monitoring/handlers/monitoring.py +++ b/temboardui/plugins/monitoring/handlers/monitoring.py @@ -87,7 +87,12 @@ def collector(request): ) request.db_session.commit() - insert_metrics(request.db_session, host.host_id, instance_id, metrics_data) + labels = dict( + agent='%s:%s' % (hostname, port), + ) + insert_metrics( + request.db_session, host.host_id, instance_id, metrics_data, labels, + ) request.db_session.commit() # Update collector status diff --git a/temboardui/plugins/monitoring/tools.py b/temboardui/plugins/monitoring/tools.py index 1ff0588b6..28d51b759 100644 --- a/temboardui/plugins/monitoring/tools.py +++ b/temboardui/plugins/monitoring/tools.py @@ -138,15 +138,27 @@ def check_host_key(session, hostname, agent_key): raise Exception("Agent key does not match.") -def insert_metrics(session, host_id, instance_id, data): +def insert_metrics(session, host_id, instance_id, data, labels=None): + labels = labels or {} for metric_name in list(data.keys()): + # Do not try to insert empty lines if data[metric_name] is None: continue if len(data[metric_name]) == 0: continue + for record in generate_logfmt_records(metric_name, data[metric_name]): + try: + logger.debug( + "up=1 %s %s", + " ".join(['%s=%s' % i for i in labels.items()]), + " ".join(['%s=%s' % i for i in record.items()]), + ) + except Exception: + logger.exception("Failed to format logfmt.") + # Insert data for metric_data in data[metric_name]: if metric_name == 'sessions': @@ -199,6 +211,23 @@ def insert_metrics(session, host_id, instance_id, data): session.commit() +def generate_logfmt_records(metric, points): + # Generate flat key-value record from temBoard metrics data for logfmt. 
+ if metric not in ('xacts',): + return + for point in points: + record = dict() + for k, v in point.items(): + if k in ('datetime', 'port', 'measure_interval'): + continue + if k in ('dbname',): + logkey = k + else: + logkey = '%s_%s' % (metric, k) + record[logkey] = v + yield record + + def get_instance_checks(session, instance_id): # Returns enabled alerting checks as list of tuples: # (name, warning threshold, critical threshold) From d4aebe1149081311c5477e997544d3419634e804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 11:46:16 +0100 Subject: [PATCH 07/10] Gracefully handle alert errors --- temboardui/plugins/monitoring/__init__.py | 25 +++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/temboardui/plugins/monitoring/__init__.py b/temboardui/plugins/monitoring/__init__.py index 07e9b72ce..b60cfe0f7 100644 --- a/temboardui/plugins/monitoring/__init__.py +++ b/temboardui/plugins/monitoring/__init__.py @@ -598,17 +598,20 @@ def collector(app, address, port, key): logger.info( "Apply alerting checks against preprocessed data for agent %s.", agent_id) - check_preprocessed_data( - worker_session, - host.host_id, - instance_id, - preprocess_data( - row['data'], - get_instance_checks(worker_session, instance_id), - row['datetime'] - ), - app.config.temboard.home, - ) + try: + check_preprocessed_data( + worker_session, + host.host_id, + instance_id, + preprocess_data( + row['data'], + get_instance_checks(worker_session, instance_id), + row['datetime'] + ), + app.config.temboard.home, + ) + except Exception: + logger.exception("Failed to check monitoring data for alerting.") logger.debug("Row with datetime=%s inserted", row['datetime']) worker_session.commit() From 8531b971e33457e8085d6a150fd8e0f342b19040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 11:46:38 +0100 Subject: [PATCH 08/10] Fix statements redo interval My bad. 
--- temboardui/plugins/statements/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/temboardui/plugins/statements/__init__.py b/temboardui/plugins/statements/__init__.py index cd036b75c..76e022f97 100644 --- a/temboardui/plugins/statements/__init__.py +++ b/temboardui/plugins/statements/__init__.py @@ -943,8 +943,7 @@ def statements_bootstrap(context): yield taskmanager.Task( worker_name='pull_data_worker', id='statementsdata', - # redo_interval=1 * 60, # Repeat each 1m, - redo_interval=10, # Repeat each 1m, + redo_interval=1 * 60, # Repeat each 1m, options={}, ) yield taskmanager.Task( From cd69d26b98266a93708f74e0e49ae9a240d483ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 12:26:42 +0100 Subject: [PATCH 09/10] Lint get_metric_data_csv query for log_statement --- temboardui/plugins/monitoring/chartdata.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/temboardui/plugins/monitoring/chartdata.py b/temboardui/plugins/monitoring/chartdata.py index 0e27fa8a6..336f09d80 100644 --- a/temboardui/plugins/monitoring/chartdata.py +++ b/temboardui/plugins/monitoring/chartdata.py @@ -5,6 +5,8 @@ # python3 from io import StringIO import datetime +from textwrap import dedent + from psycopg2.extensions import AsIs from .pivot import pivot_timeserie @@ -872,10 +874,15 @@ def get_metric_data_csv(session, metric_name, start, end, host_id=None, query = cur.mogrify(q_tpl, dict(host_id=host_id, instance_id=instance_id, start=start, end=end, key=key, tablename=AsIs(tablename))) - query = query.decode("utf-8") + query = query.strip().decode("utf-8") # Retreive data using copy_expert() - cur.copy_expert("COPY(" + query + ") TO STDOUT WITH CSV HEADER", - data_buffer) + cur.copy_expert(dedent("""\ + -- get_metric_data_csv + -- SET search_path TO monitoring; + COPY( + %s + ) TO STDOUT WITH CSV HEADER + """) % (query,), data_buffer) cur.close() if metric.get('pivot'): From 
9592bce8d4aba98b55ee333d15e554c3f88cc766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20BERSAC?= Date: Fri, 4 Feb 2022 14:45:27 +0100 Subject: [PATCH 10/10] Agent changelog for connection pooling --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2687d85f5..20053f60b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ **Agent:** - Fix database probes always executed on same database. +- Reduce reconnections in monitoring probes. - Explicitily requires psycopg2 2.7+ on debian. - Ignore loopback and tmpfs file systems. - Drop Python2 support