Close replica on partitioning to prevent its returning online but lagging

- Remember time of each successful replica report to ZK.
- If ZK is unavailable, check if walreceiver is streaming.
- If both ZK and walreceiver checks fail, close replica.
- Log each decision together with the values that triggered it.
- Add tests for this logic.
vicpopov committed Nov 5, 2024
1 parent a0c9dd7 commit 2452dfa
Showing 5 changed files with 107 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/__init__.py
@@ -99,6 +99,7 @@ def read_config(filename=None, options=None):
'recovery_timeout': 60,
'can_delayed': 'no',
'primary_switch_restart': 'yes',
'close_detached_after': 300,
},
'commands': {
'promote': '/usr/lib/postgresql/10/bin/pg_ctl promote -D %p',
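For reference, a minimal sketch of how the new setting would be read, assuming the ConfigParser-style INI config implied by the config.getfloat('replica', ...) call in main.py; the fragment itself is hypothetical, and per the guard in handle_detached_replica a value of 0 disables the check entirely:

import configparser

# Hypothetical pgconsul.conf fragment; the [replica] section name is inferred
# from config.getfloat('replica', 'close_detached_after') in main.py.
config = configparser.ConfigParser()
config.read_string("""
[replica]
close_detached_after = 300
""")

# Seconds a detached replica may keep its pooler open; 0 would disable the check.
assert config.getfloat('replica', 'close_detached_after') == 300.0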
42 changes: 40 additions & 2 deletions src/main.py
@@ -54,6 +54,7 @@ def __init__(self, **kwargs):
self._is_single_node = False
self.notifier = sdnotify.Notifier()
self._slot_drop_countdown = {}
self.last_zk_host_stat_write = 0

if self.config.getboolean('global', 'quorum_commit'):
self._replication_manager = QuorumReplicationManager(
@@ -284,8 +285,7 @@ def run_iteration(self, my_prio):
logging.warning('Cluster in maintenance mode')
self.zk.reconnect()
self.zk.write(self.zk.get_host_maintenance_path(), 'enable')
- logging.debug('Finished iteration.')
- timer.sleep(self.config.getfloat('global', 'iteration_timeout'))
+ self.finish_iteration()
return
except ZookeeperException:
logging.error("Zookeeper exception while getting ZK state")
@@ -295,6 +295,11 @@
logging.error("Upper exception was for primary")
my_hostname = helpers.get_hostname()
self.resolve_zk_primary_lock(my_hostname)
elif role == 'replica' and not self.is_in_maintenance:
logging.error("Upper exception was for replica")
self.handle_detached_replica(db_state)
self.re_init_zk()
self.finish_iteration()
else:
self.re_init_zk()
return
@@ -325,6 +330,9 @@ def run_iteration(self, my_prio):
if not self.zk.noexcept_write(self.zk.get_host_prio_path(), my_prio, need_lock=False):
logging.warning('Could not write priority to ZK')

self.finish_iteration()

def finish_iteration(self):
logging.debug('Finished iteration.')
timer.sleep(self.config.getfloat('global', 'iteration_timeout'))

@@ -535,6 +543,35 @@ def resolve_zk_primary_lock(self, my_hostname):
logging.warning('Lock in ZK is being held by %s. We should return to cluster here.', holder)
self._return_to_cluster(holder, 'primary')

def handle_detached_replica(self, db_state):
close_detached_replica_after = self.config.getfloat('replica', 'close_detached_after')
if not close_detached_replica_after:
return
now = time.time()
zk_write_delay = now - self.last_zk_host_stat_write
if zk_write_delay < close_detached_replica_after:
logging.debug(
f'Replica ZK write delay {zk_write_delay} within '
f'{close_detached_replica_after} seconds; keeping replica open'
)
return
if not db_state['wal_receiver']:
logging.debug('Stopping pooler for replica with lost ZK connection and without walreceiver running')
self.db.pgpooler('stop')
return
walreceiver_delay = now - db_state['wal_receiver']['last_msg_receipt_time_msec'] // 1000
if walreceiver_delay > close_detached_replica_after:
logging.debug(
f'Stopping pooler for replica with lost ZK connection '
f'and walreceiver delay {walreceiver_delay} > {close_detached_replica_after}'
)
self.db.pgpooler('stop')
else:
logging.debug(
f'Replica write delay {zk_write_delay}, but walreceiver delay {walreceiver_delay} within '
f'{close_detached_replica_after}; keeping replica open'
)

def write_host_stat(self, hostname, db_state):
stream_from = self.config.get('global', 'stream_from')
replics_info = db_state.get('replics_info')
@@ -559,6 +596,7 @@ def write_host_stat(self, hostname, db_state):
if not self.zk.write(replics_info_path, replics_info, preproc=json.dumps, need_lock=False):
logging.warning('Could not write host replics_info to ZK.')
return False
self.last_zk_host_stat_write = time.time()

def remove_stale_operation(self, hostname):
op_path = '%s/%s/op' % (self.zk.MEMBERS_PATH, hostname)
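The new handle_detached_replica logic has three outcomes: keep the pooler open while the last successful ZK report is fresh, stop it immediately when ZK is stale and no walreceiver is running, and otherwise decide by walreceiver delay. Below is a minimal, self-contained sketch of that decision table; it is not the project's actual test suite, and the free function and fixture values are assumptions for illustration:

import time

CLOSE_DETACHED_AFTER = 300  # seconds; mirrors the new default in src/__init__.py

def should_close_replica(last_zk_write, wal_receiver, now):
    # Stand-alone restatement of the handle_detached_replica decision.
    if now - last_zk_write < CLOSE_DETACHED_AFTER:
        return False  # recent successful ZK report: keep the pooler open
    if not wal_receiver:
        return True   # ZK stale and no walreceiver running: close
    receipt_sec = wal_receiver['last_msg_receipt_time_msec'] // 1000
    return now - receipt_sec > CLOSE_DETACHED_AFTER

now = time.time()
# Recent ZK write keeps the replica open regardless of walreceiver state.
assert not should_close_replica(now - 10, None, now)
# Stale ZK write with no walreceiver closes the replica.
assert should_close_replica(now - 600, None, now)
# Stale ZK write but a recently streaming walreceiver keeps it open.
fresh = {'last_msg_receipt_time_msec': int((now - 5) * 1000)}
assert not should_close_replica(now - 600, fresh, now)
# Both ZK report and walreceiver stale: close.
stale = {'last_msg_receipt_time_msec': int((now - 900) * 1000)}
assert should_close_replica(now - 600, stale, now)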
1 change: 1 addition & 0 deletions src/pg.py
@@ -359,6 +359,7 @@ def _get_wal_receiver_info(self):
Get wal_receiver info from pg_stat_wal_receiver
"""
query = """SELECT pid, status, slot_name,
COALESCE(1000*EXTRACT(epoch FROM last_msg_receipt_time), 0)::bigint AS last_msg_receipt_time_msec,
conninfo FROM pg_stat_wal_receiver"""
return self._get(query)

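The COALESCE(..., 0) is load-bearing downstream: a walreceiver row whose last_msg_receipt_time is NULL comes back as 0 milliseconds, which the delay arithmetic in handle_detached_replica turns into an enormous delay, so a replica that has never received a message counts as stale rather than healthy. A quick sketch of the conversion (the row dict is hypothetical):

import time

# Hypothetical row shape from the pg_stat_wal_receiver query above.
row = {'pid': 4242, 'status': 'streaming', 'slot_name': 'slot1',
       'last_msg_receipt_time_msec': 0}  # COALESCE fallback: nothing received yet

# Milliseconds are integer-divided back to seconds, as in handle_detached_replica;
# a zero timestamp makes the delay roughly "now", far above any sane threshold.
delay_sec = time.time() - row['last_msg_receipt_time_msec'] // 1000
print(delay_sec)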
3 changes: 2 additions & 1 deletion tests/features/coordinator_fail.feature
@@ -18,6 +18,7 @@ Feature: Check availability on coordinator failure
primary_switch_checks: 1
min_failover_timeout: 1
primary_unavailability_timeout: 2
close_detached_after: 30
commands:
generate_recovery_conf: /usr/local/bin/gen_rec_conf_<with_slots>_slot.sh %m %p
"""
@@ -50,7 +51,7 @@ Feature: Check availability on coordinator failure
When we disconnect from network container "zookeeper1"
And we disconnect from network container "zookeeper2"
And we disconnect from network container "zookeeper3"
And we wait "10.0" seconds
And we wait "35.0" seconds
Then pgbouncer is running in container "postgresql1"
And pgbouncer is running in container "postgresql2"
And pgbouncer is running in container "postgresql3"
63 changes: 63 additions & 0 deletions tests/features/kill_replica.feature
@@ -285,3 +285,66 @@ Feature: Destroy synchronous replica in various scenarios
Examples: <lock_type>
| lock_type | lock_host |
| zookeeper | zookeeper1 |

@detached_replica
Scenario Outline: Pgbouncer behaviour on replica disconnect
Given a "pgconsul" container common config
"""
pgconsul.conf:
global:
priority: 0
use_replication_slots: '<use_slots>'
quorum_commit: '<quorum_commit>'
primary:
change_replication_type: 'yes'
primary_switch_checks: 1
replica:
allow_potential_data_loss: 'no'
primary_switch_checks: 1
min_failover_timeout: 1
primary_unavailability_timeout: 2
close_detached_after: 30
commands:
generate_recovery_conf: /usr/local/bin/gen_rec_conf_<with_slots>_slot.sh %m %p
"""
Given a following cluster with "<lock_type>" <with_slots> replication slots
"""
postgresql1:
role: primary
postgresql2:
role: replica
"""
Then <lock_type> "<lock_host>" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader"
Then container "postgresql2" is in <replication_type> group
Then <lock_type> "<lock_host>" has following values for key "/pgconsul/postgresql/replics_info"
"""
- client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net
state: streaming
"""
When we disconnect from network container "postgresql2"
And we wait "5.0" seconds
Then pgbouncer is running in container "postgresql2"
When we wait "30.0" seconds
Then pgbouncer is not running in container "postgresql2"
When we connect to network container "postgresql2"
Then container "postgresql1" is primary
Then container "postgresql2" is a replica of container "postgresql1"
Then container "postgresql2" is in <replication_type> group
Then <lock_type> "<lock_host>" has following values for key "/pgconsul/postgresql/replics_info"
"""
- client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net
state: streaming
"""

Examples: <lock_type>, <with_slots> replication slots
| lock_type | lock_host | with_slots | use_slots | quorum_commit | replication_type |
| zookeeper | zookeeper1 | without | no | yes | quorum |
| zookeeper | zookeeper1 | with | yes | yes | quorum |
| zookeeper | zookeeper1 | without | no | yes | quorum |
| zookeeper | zookeeper1 | with | yes | yes | quorum |
| zookeeper | zookeeper1 | without | no | no | sync |
| zookeeper | zookeeper1 | with | yes | no | sync |
| zookeeper | zookeeper1 | without | no | no | sync |
| zookeeper | zookeeper1 | with | yes | no | sync |
