diff --git a/postgres/changelog.d/19083.added b/postgres/changelog.d/19083.added new file mode 100644 index 0000000000000..9a4d0e63f76db --- /dev/null +++ b/postgres/changelog.d/19083.added @@ -0,0 +1 @@ +Track logical replication slot catalog_xmin age diff --git a/postgres/datadog_checks/postgres/util.py b/postgres/datadog_checks/postgres/util.py index 2a216d3fbe189..8bf859963c6b1 100644 --- a/postgres/datadog_checks/postgres/util.py +++ b/postgres/datadog_checks/postgres/util.py @@ -383,6 +383,7 @@ def get_list_chunks(lst, n): CASE WHEN temporary THEN 'temporary' ELSE 'permanent' END, CASE WHEN active THEN 'active' ELSE 'inactive' END, CASE WHEN xmin IS NULL THEN NULL ELSE age(xmin) END, + CASE WHEN catalog_xmin IS NULL THEN NULL ELSE age(catalog_xmin) END, pg_wal_lsn_diff( CASE WHEN pg_is_in_recovery() THEN pg_last_wal_receive_lsn() ELSE pg_current_wal_lsn() END, restart_lsn), pg_wal_lsn_diff( @@ -395,6 +396,7 @@ def get_list_chunks(lst, n): {'name': 'slot_persistence', 'type': 'tag'}, {'name': 'slot_state', 'type': 'tag'}, {'name': 'replication_slot.xmin_age', 'type': 'gauge'}, + {'name': 'replication_slot.catalog_xmin_age', 'type': 'gauge'}, {'name': 'replication_slot.restart_delay_bytes', 'type': 'gauge'}, {'name': 'replication_slot.confirmed_flush_delay_bytes', 'type': 'gauge'}, ], diff --git a/postgres/metadata.csv b/postgres/metadata.csv index da8bf54049e23..1cf6813327fe3 100644 --- a/postgres/metadata.csv +++ b/postgres/metadata.csv @@ -123,6 +123,7 @@ postgresql.replication.wal_replay_lag,gauge,,second,,"Time elapsed between flush postgresql.replication.wal_write_lag,gauge,,second,,Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (but not yet flushed it or applied it). This can be used to gauge the delay that synchronous_commit level remote_write incurred while committing if this server was configured as a synchronous standby. Only available with postgresql 10 and newer.,-1,postgres,repl write lag, postgresql.replication_delay,gauge,,second,,The current replication delay in seconds. Only available with postgresql 9.1 and newer,-1,postgres,repl delay, postgresql.replication_delay_bytes,gauge,,byte,,The current replication delay in bytes. Only available with postgresql 9.2 and newer,-1,postgres,repl delay bytes, +postgresql.replication_slot.catalog_xmin_age,gauge,,transaction,,"The age of the oldest transaction affecting the system catalogs that this slot needs the database to retain. VACUUM cannot remove catalog tuples deleted by any later transaction. This metric is tagged with slot_name, slot_type, slot_persistence, slot_state.",-1,postgres,repslot catalog_xmin, postgresql.replication_slot.confirmed_flush_delay_bytes,gauge,,byte,,"The delay in bytes between the current WAL position and last position this slot's consumer confirmed. This is only available for logical replication slots. This metric is tagged with slot_name, slot_type, slot_persistence, slot_state.",-1,postgres,repslot flush, postgresql.replication_slot.restart_delay_bytes,gauge,,byte,,"The amount of WAL bytes that the consumer of this slot may require and won't be automatically removed during checkpoints unless it exceeds max_slot_wal_keep_size parameter. Nothing is reported if there's no WAL reservation for this slot. This metric is tagged with slot_name, slot_type, slot_persistence, slot_state.",-1,postgres,repslot restart, postgresql.replication_slot.spill_bytes,count,,byte,,"Amount of decoded transaction data spilled to disk while performing decoding of changes from WAL for this slot. This and other spill counters can be used to gauge the I/O occurred during logical decoding and allow tuning logical_decoding_work_mem. Extracted from pg_stat_replication_slots. Only available with PostgreSQL 14 and newer. This metric is tagged with slot_name, slot_type, slot_state.",-1,postgres,repslot spill_byte, diff --git a/postgres/tests/common.py b/postgres/tests/common.py index fa640a889ac03..f051d673bacc3 100644 --- a/postgres/tests/common.py +++ b/postgres/tests/common.py @@ -314,6 +314,7 @@ def check_replication_slots(aggregator, expected_tags, count=1): for metric_name in _iterate_metric_name(QUERY_PG_REPLICATION_SLOTS): if 'slot_type:physical' in expected_tags and metric_name in [ 'postgresql.replication_slot.confirmed_flush_delay_bytes', + 'postgresql.replication_slot.catalog_xmin_age', ]: continue if 'slot_type:logical' in expected_tags and metric_name in [