Close replica on partitioning to prevent its returning online but lagging

- Remember time of each successful replica report to ZK.
- If ZK is unavailable, check if walreceiver is streaming.
- If both ZK and walreceiver checks fail, close replica.
- Log each decision together with the values that triggered it.
- Add tests for this logic.
vicpopov committed Nov 5, 2024
1 parent a0c9dd7 commit 2452dfa
Showing 5 changed files with 107 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/__init__.py
@@ -99,6 +99,7 @@ def read_config(filename=None, options=None):
'recovery_timeout': 60,
'can_delayed': 'no',
'primary_switch_restart': 'yes',
'close_detached_after': 300,
},
'commands': {
'promote': '/usr/lib/postgresql/10/bin/pg_ctl promote -D %p',
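For reference, a minimal sketch of how the new setting would be read, assuming the ConfigParser-style INI config implied by the config.getfloat('replica', ...) call in main.py; the fragment itself is hypothetical, and per the guard in handle_detached_replica a value of 0 disables the check entirely:

import configparser

# Hypothetical pgconsul.conf fragment; the [replica] section name is inferred
# from config.getfloat('replica', 'close_detached_after') in main.py.
config = configparser.ConfigParser()
config.read_string("""
[replica]
close_detached_after = 300
""")

# Seconds a detached replica may keep its pooler open; 0 would disable the check.
assert config.getfloat('replica', 'close_detached_after') == 300.0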
42 changes: 40 additions & 2 deletions src/main.py
@@ -54,6 +54,7 @@ def __init__(self, **kwargs):
self._is_single_node = False
self.notifier = sdnotify.Notifier()
self._slot_drop_countdown = {}
self.last_zk_host_stat_write = 0

if self.config.getboolean('global', 'quorum_commit'):
self._replication_manager = QuorumReplicationManager(
@@ -284,8 +285,7 @@ def run_iteration(self, my_prio):
logging.warning('Cluster in maintenance mode')
self.zk.reconnect()
self.zk.write(self.zk.get_host_maintenance_path(), 'enable')
- logging.debug('Finished iteration.')
- timer.sleep(self.config.getfloat('global', 'iteration_timeout'))
+ self.finish_iteration()
return
except ZookeeperException:
logging.error("Zookeeper exception while getting ZK state")
@@ -295,6 +295,11 @@
logging.error("Upper exception was for primary")
my_hostname = helpers.get_hostname()
self.resolve_zk_primary_lock(my_hostname)
elif role == 'replica' and not self.is_in_maintenance:
logging.error("Upper exception was for replica")
self.handle_detached_replica(db_state)
self.re_init_zk()
self.finish_iteration()
else:
self.re_init_zk()
return
@@ -325,6 +330,9 @@ def run_iteration(self, my_prio):
if not self.zk.noexcept_write(self.zk.get_host_prio_path(), my_prio, need_lock=False):
logging.warning('Could not write priority to ZK')

self.finish_iteration()

def finish_iteration(self):
logging.debug('Finished iteration.')
timer.sleep(self.config.getfloat('global', 'iteration_timeout'))

@@ -535,6 +543,35 @@ def resolve_zk_primary_lock(self, my_hostname):
logging.warning('Lock in ZK is being held by %s. We should return to cluster here.', holder)
self._return_to_cluster(holder, 'primary')

def handle_detached_replica(self, db_state):
close_detached_replica_after = self.config.getfloat('replica', 'close_detached_after')
if not close_detached_replica_after:
return
now = time.time()
zk_write_delay = now - self.last_zk_host_stat_write
if zk_write_delay < close_detached_replica_after:
logging.debug(
f'Replica ZK write delay {zk_write_delay} within '
f'{close_detached_replica_after} seconds; keeping replica open'
)
return
if not db_state['wal_receiver']:
logging.debug('Stopping pooler for replica with lost ZK connection and without walreceiver running')
self.db.pgpooler('stop')
return
walreceiver_delay = now - db_state['wal_receiver']['last_msg_receipt_time_msec'] // 1000
if walreceiver_delay > close_detached_replica_after:
logging.debug(
f'Stopping pooler for replica with lost ZK connection '
f'and walreceiver delay {walreceiver_delay} > {close_detached_replica_after}'
)
self.db.pgpooler('stop')
else:
logging.debug(
f'Replica write delay {zk_write_delay}, but walreceiver delay {walreceiver_delay} within '
f'{close_detached_replica_after}; keeping replica open'
)

def write_host_stat(self, hostname, db_state):
stream_from = self.config.get('global', 'stream_from')
replics_info = db_state.get('replics_info')
@@ -559,6 +596,7 @@ def write_host_stat(self, hostname, db_state):
if not self.zk.write(replics_info_path, replics_info, preproc=json.dumps, need_lock=False):
logging.warning('Could not write host replics_info to ZK.')
return False
self.last_zk_host_stat_write = time.time()

def remove_stale_operation(self, hostname):
op_path = '%s/%s/op' % (self.zk.MEMBERS_PATH, hostname)
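The new handle_detached_replica logic has three outcomes: keep the pooler open while the last successful ZK report is fresh, stop it immediately when ZK is stale and no walreceiver is running, and otherwise decide by walreceiver delay. Below is a minimal, self-contained sketch of that decision table; it is not the project's actual test suite, and the free function and fixture values are assumptions for illustration:

import time

CLOSE_DETACHED_AFTER = 300  # seconds; mirrors the new default in src/__init__.py

def should_close_replica(last_zk_write, wal_receiver, now):
    # Stand-alone restatement of the handle_detached_replica decision.
    if now - last_zk_write < CLOSE_DETACHED_AFTER:
        return False  # recent successful ZK report: keep the pooler open
    if not wal_receiver:
        return True   # ZK stale and no walreceiver running: close
    receipt_sec = wal_receiver['last_msg_receipt_time_msec'] // 1000
    return now - receipt_sec > CLOSE_DETACHED_AFTER

now = time.time()
# Recent ZK write keeps the replica open regardless of walreceiver state.
assert not should_close_replica(now - 10, None, now)
# Stale ZK write with no walreceiver closes the replica.
assert should_close_replica(now - 600, None, now)
# Stale ZK write but a recently streaming walreceiver keeps it open.
fresh = {'last_msg_receipt_time_msec': int((now - 5) * 1000)}
assert not should_close_replica(now - 600, fresh, now)
# Both ZK report and walreceiver stale: close.
stale = {'last_msg_receipt_time_msec': int((now - 900) * 1000)}
assert should_close_replica(now - 600, stale, now)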
1 change: 1 addition & 0 deletions src/pg.py
@@ -359,6 +359,7 @@ def _get_wal_receiver_info(self):
Get wal_receiver info from pg_stat_wal_receiver
"""
query = """SELECT pid, status, slot_name,
COALESCE(1000*EXTRACT(epoch FROM last_msg_receipt_time), 0)::bigint AS last_msg_receipt_time_msec,
conninfo FROM pg_stat_wal_receiver"""
return self._get(query)

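The COALESCE(..., 0) is load-bearing downstream: a walreceiver row whose last_msg_receipt_time is NULL comes back as 0 milliseconds, which the delay arithmetic in handle_detached_replica turns into an enormous delay, so a replica that has never received a message counts as stale rather than healthy. A quick sketch of the conversion (the row dict is hypothetical):

import time

# Hypothetical row shape from the pg_stat_wal_receiver query above.
row = {'pid': 4242, 'status': 'streaming', 'slot_name': 'slot1',
       'last_msg_receipt_time_msec': 0}  # COALESCE fallback: nothing received yet

# Milliseconds are integer-divided back to seconds, as in handle_detached_replica;
# a zero timestamp makes the delay roughly "now", far above any sane threshold.
delay_sec = time.time() - row['last_msg_receipt_time_msec'] // 1000
print(delay_sec)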
3 changes: 2 additions & 1 deletion tests/features/coordinator_fail.feature
@@ -18,6 +18,7 @@ Feature: Check availability on coordinator failure
primary_switch_checks: 1
min_failover_timeout: 1
primary_unavailability_timeout: 2
close_detached_after: 30
commands:
generate_recovery_conf: /usr/local/bin/gen_rec_conf_<with_slots>_slot.sh %m %p
"""
@@ -50,7 +51,7 @@ Feature: Check availability on coordinator failure
When we disconnect from network container "zookeeper1"
And we disconnect from network container "zookeeper2"
And we disconnect from network container "zookeeper3"
And we wait "10.0" seconds
And we wait "35.0" seconds
Then pgbouncer is running in container "postgresql1"
And pgbouncer is running in container "postgresql2"
And pgbouncer is running in container "postgresql3"
63 changes: 63 additions & 0 deletions tests/features/kill_replica.feature
@@ -285,3 +285,66 @@ Feature: Destroy synchronous replica in various scenarios
Examples: <lock_type>
| lock_type | lock_host |
| zookeeper | zookeeper1 |

@detached_replica
Scenario Outline: Pgbouncer behaviour on replica disconnect
Given a "pgconsul" container common config
"""
pgconsul.conf:
global:
priority: 0
use_replication_slots: '<use_slots>'
quorum_commit: '<quorum_commit>'
primary:
change_replication_type: 'yes'
primary_switch_checks: 1
replica:
allow_potential_data_loss: 'no'
primary_switch_checks: 1
min_failover_timeout: 1
primary_unavailability_timeout: 2
close_detached_after: 30
commands:
generate_recovery_conf: /usr/local/bin/gen_rec_conf_<with_slots>_slot.sh %m %p
"""
Given a following cluster with "<lock_type>" <with_slots> replication slots
"""
postgresql1:
role: primary
postgresql2:
role: replica
"""
Then <lock_type> "<lock_host>" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader"
Then container "postgresql2" is in <replication_type> group
Then <lock_type> "<lock_host>" has following values for key "/pgconsul/postgresql/replics_info"
"""
- client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net
state: streaming
"""
When we disconnect from network container "postgresql2"
And we wait "5.0" seconds
Then pgbouncer is running in container "postgresql2"
When we wait "30.0" seconds
Then pgbouncer is not running in container "postgresql2"
When we connect to network container "postgresql2"
Then container "postgresql1" is primary
Then container "postgresql2" is a replica of container "postgresql1"
Then container "postgresql2" is in <replication_type> group
Then <lock_type> "<lock_host>" has following values for key "/pgconsul/postgresql/replics_info"
"""
- client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net
state: streaming
"""

Examples: <lock_type>, <with_slots> replication slots
| lock_type | lock_host | with_slots | use_slots | quorum_commit | replication_type |
| zookeeper | zookeeper1 | without | no | yes | quorum |
| zookeeper | zookeeper1 | with | yes | yes | quorum |
| zookeeper | zookeeper1 | without | no | yes | quorum |
| zookeeper | zookeeper1 | with | yes | yes | quorum |
| zookeeper | zookeeper1 | without | no | no | sync |
| zookeeper | zookeeper1 | with | yes | no | sync |
| zookeeper | zookeeper1 | without | no | no | sync |
| zookeeper | zookeeper1 | with | yes | no | sync |
