Skip to content

Commit

Permalink
khepri_cluster: Stop spamming cluster with join/leave if it is not ready
Browse files Browse the repository at this point in the history
[Why]
If a cluster change (a member joining or leaving) is already in progress, it
is not permitted to make another change: Ra will reply with the following error:

    {error, cluster_change_not_permitted}

Before this patch, we would retry immediately after such an error. We
were effectively spamming not only the Ra server with many join/leave
commands, but also the user with hundreds of "not ready; waiting..." log messages.

[How]
For now, we simply add a hard-coded 200 ms sleep before retrying.
  • Loading branch information
dumbbell committed Sep 11, 2024
1 parent 80ef2a3 commit 99b5217
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions src/khepri_cluster.erl
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,7 @@ reset_remotely_and_join_locked(
?LOG_DEBUG(
"Remote cluster (reached through node ~0p) is not ready "
"for a membership change yet; waiting...", [RemoteNode]),
Ret2 = wait_for_leader(StoreId, Timeout1),
Ret2 = wait_for_cluster_change_permitted(StoreId, Timeout1),
Timeout2 = khepri_utils:end_timeout_window(Timeout1, T2),
case Ret2 of
ok ->
Expand Down Expand Up @@ -847,7 +847,7 @@ do_join_locked(StoreId, ThisMember, RemoteNode, Timeout) ->
?LOG_DEBUG(
"Remote cluster (reached through node ~0p) is not ready "
"for a membership change yet; waiting...", [RemoteNode]),
Ret2 = wait_for_leader(RemoteMember, Timeout1),
Ret2 = wait_for_cluster_change_permitted(RemoteMember, Timeout1),
Timeout2 = khepri_utils:end_timeout_window(Timeout1, T2),
case Ret2 of
ok ->
Expand All @@ -871,6 +871,17 @@ do_join_locked(StoreId, ThisMember, RemoteNode, Timeout) ->
end
end.

%% Waits for a leader through `wait_for_leader/2', then pauses briefly before
%% the caller retries a membership change.
%%
%% `RaMemberOrStoreId' and `Timeout' are passed unchanged to
%% `wait_for_leader/2', and its return value is returned as is.
wait_for_cluster_change_permitted(RaMemberOrStoreId, Timeout) ->
    case wait_for_leader(RaMemberOrStoreId, Timeout) of
        ok ->
            %% We wait for an additional fixed amount of time because the
            %% cluster could have a leader and still not be ready to accept
            %% a cluster change. This avoids too many retries that will
            %% just eat resources.
            timer:sleep(200),
            ok;
        Error ->
            %% Anything other than `ok' is handed back right away: the
            %% pause only exists to space out retries, so sleeping here
            %% would merely delay the error.
            Error
    end.

-spec reset() -> Ret when
Ret :: ok | khepri:error().
%% @doc Resets the store on this Erlang node.
Expand Down Expand Up @@ -946,7 +957,7 @@ do_reset(RaSystem, StoreId, ThisMember, Timeout) ->
"Cluster is not ready for a membership change yet; waiting",
[]),
try
Ret2 = wait_for_leader(StoreId, Timeout1),
Ret2 = wait_for_cluster_change_permitted(StoreId, Timeout1),
Timeout2 = khepri_utils:end_timeout_window(Timeout1, T2),
case Ret2 of
ok -> do_reset(RaSystem, StoreId, ThisMember, Timeout2);
Expand Down

0 comments on commit 99b5217

Please sign in to comment.