storage: fix master discovery stall

The same issue as described in the previous commit, but here it was affecting the storages. The difference is that the storages don't need to hold on to their connections so tightly and can simply discard the disconnected ones.

Closes tarantool/vshard-ee#4

NO_DOC=bugfix
tarantool · Apr 5, 2024 · 2171db2 · 2171db2
1 parent fccbfc9
commit 2171db2
Show file tree

Hide file tree

Showing 2 changed files with 93 additions and 1 deletion.
diff --git a/test/storage-luatest/auto_master_2_2_2_test.lua b/test/storage-luatest/auto_master_2_2_2_test.lua
@@ -321,3 +321,84 @@ test_group.test_conn_manager_connect_self = function(g)
     -- Cleanup
     vtest.cluster_rebalancer_disable(g)
 end
+
+test_group.test_master_discovery_on_disconnect = function(g)
+    local function bucket_send(bid, uuid)
+        local ok, err = ivshard.storage.bucket_send(
+            bid, uuid, {timeout = iwait_timeout})
+        ilt.assert_equals(err, nil)
+        ilt.assert(ok)
+    end
+    local function bucket_gc_wait()
+        _G.bucket_gc_wait()
+    end
+    local function send_bucket_to_new_master(storage_src, storage_dst)
+        -- Make sure the bucket is not delivered even if the tiny send
+        -- timeout somehow turns out to be enough to send it. This is done
+        -- just to keep the test simple.
+        storage_dst:exec(function()
+            ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = true
+        end)
+        local bid = storage_src:exec(function(rs_uuid, master_uuid)
+            local bid = _G.get_first_bucket()
+            -- Try to send a bucket. The send is expected to fail: the sender
+            -- notices that the destination's master is disconnected, which
+            -- triggers a master search.
+            local ok, err = ivshard.storage.bucket_send(
+                bid, rs_uuid, {timeout = 0.01})
+            ilt.assert(not ok)
+            ilt.assert_not_equals(err, nil)
+            -- Wait until the master search finds the new master node.
+            local rs = ivshard.storage.internal.replicasets[rs_uuid]
+            ilt.helpers.retrying({timeout = iwait_timeout}, function()
+                if not rs.master or rs.master.uuid ~= master_uuid then
+                    error('Master still is not found')
+                end
+            end)
+            ivtest.service_wait_for_new_ok(
+                ivshard.storage.internal.recovery_service,
+                {on_yield = ivshard.storage.recovery_wakeup,
+                 timeout = iwait_timeout})
+            ilt.assert_equals(box.space._bucket:get{bid}.status,
+                              ivconst.BUCKET.ACTIVE)
+            return bid
+        end, {storage_dst:replicaset_uuid(), storage_dst:instance_uuid()})
+        -- Disable the error injection so the bucket can now be sent fine.
+        storage_dst:exec(function()
+            ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = false
+        end)
+        storage_src:exec(bucket_send, {bid, storage_dst:replicaset_uuid()})
+        storage_src:exec(bucket_gc_wait)
+    end
+    vtest.cluster_exec_each(g, function()
+        rawset(_G, 'test_old_idle_interval',
+               ivconst.MASTER_SEARCH_IDLE_INTERVAL)
+        ivconst.MASTER_SEARCH_IDLE_INTERVAL = iwait_timeout + 10
+    end)
+    -- Discover the master for the first time: send a bucket to the other
+    -- replicaset and back, calling bucket_gc_wait on the sender after each
+    -- transfer.
+    local bid = vtest.storage_first_bucket(g.replica_1_a)
+    g.replica_1_a:exec(bucket_send, {bid, g.replica_2_a:replicaset_uuid()})
+    g.replica_1_a:exec(bucket_gc_wait)
+    g.replica_2_a:exec(bucket_send, {bid, g.replica_1_a:replicaset_uuid()})
+    g.replica_2_a:exec(bucket_gc_wait)
+    -- Blocking call while the known master is disconnected: replica_2_a is
+    -- stopped and replica_2_b is made writable, so the sender has to locate
+    -- the new master before the bucket can be sent.
+    g.replica_2_a:stop()
+    g.replica_2_b:update_box_cfg{read_only = false}
+    send_bucket_to_new_master(g.replica_1_a, g.replica_2_b)
+    -- Can't GC the bucket until the old master is back, but can send it.
+    g.replica_2_b:exec(bucket_send, {bid, g.replica_1_a:replicaset_uuid()})
+
+    -- Restore the original cluster state.
+    g.replica_2_a:start()
+    vtest.cluster_cfg(g, global_cfg)
+    g.replica_2_b:exec(bucket_gc_wait)
+    g.replica_2_b:update_box_cfg{read_only = true}
+    vtest.cluster_exec_each(g, function()
+        ivconst.MASTER_SEARCH_IDLE_INTERVAL = _G.test_old_idle_interval
+        _G.test_old_idle_interval = nil
+    end)
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
@@ -3351,14 +3351,25 @@ local function conn_manager_locate_masters(service)
     local is_all_done = true
     local is_done, _, err
     for rs_id, rs in pairs(M.replicasets) do
-        if rs.is_master_auto and not rs.master and rs.master_wait_count > 0 then
+        if not rs.is_master_auto then
+            goto continue
+        end
+        if not rs.master and rs.master_wait_count == 0 then
+            goto continue
+        end
+        if not rs.master then
             is_done, _, err = rs:locate_master()
             if err then
                 log.error(service:set_status_error(
                     'Error during master discovery for %s: %s', rs_id, err))
             end
             is_all_done = is_all_done and is_done
+        elseif not rs.master:is_connected() then
+            log.warn('Discarded a not connected master %s in rs %s',
+                     rs.master, rs_id)
+            rs.master = nil
         end
+    ::continue::
     end
     return is_all_done
 end