[Tarantool-patches] [PATCH vshard 6/6] router: update master using a hint from storage
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Fri Jul 2 01:09:36 MSK 2021
Storage sends a NON_MASTER error when a read-write request is
attempted on it while it is not a master. The error contains the
UUID of the instance.
The patch adds to this error a new field - UUID of the master as
it is seen on this storage. Router now uses that information to
quickly switch its read-write requests to the new master. In fact,
this should happen in almost all cases of master auto-discovery on
the router if it occurs under load.
Closes #75
---
test/router/master_discovery.result | 427 ++++++++++++++++++++++++++
test/router/master_discovery.test.lua | 191 ++++++++++++
test/router/router.result | 8 +-
vshard/error.lua | 2 +-
vshard/replicaset.lua | 65 ++++
vshard/router/init.lua | 17 +-
vshard/storage/init.lua | 6 +-
7 files changed, 706 insertions(+), 10 deletions(-)
diff --git a/test/router/master_discovery.result b/test/router/master_discovery.result
index 2fca1e4..8afeccb 100644
--- a/test/router/master_discovery.result
+++ b/test/router/master_discovery.result
@@ -109,6 +109,18 @@ end
| ---
| ...
+function check_no_master_for_replicaset(rs_id) \
+ local rs_uuid = util.replicasets[rs_id] \
+ local master = vshard.router.static.replicasets[rs_uuid].master \
+ if not master then \
+ return true \
+ end \
+ vshard.router.static.master_search_fiber:wakeup() \
+ return false \
+end
+ | ---
+ | ...
+
function check_all_buckets_found() \
if vshard.router.info().bucket.unknown == 0 then \
return true \
@@ -148,6 +160,28 @@ end
| ---
| ...
+function master_discovery_block() \
+ vshard.router.internal.errinj.ERRINJ_MASTER_SEARCH_DELAY = true \
+end
+ | ---
+ | ...
+
+function check_master_discovery_block() \
+ if vshard.router.internal.errinj.ERRINJ_MASTER_SEARCH_DELAY == 'in' then \
+ return true \
+ end \
+ vshard.router.static.master_search_fiber:wakeup() \
+ return false \
+end
+ | ---
+ | ...
+
+function master_discovery_unblock() \
+ vshard.router.internal.errinj.ERRINJ_MASTER_SEARCH_DELAY = false \
+end
+ | ---
+ | ...
+
--
-- Simulate the first cfg when no masters are known.
--
@@ -494,6 +528,399 @@ end
| message: Master is not configured for replicaset <replicaset_1>
| ...
+--
+-- Restore the old master back.
+--
+test_run:switch('storage_1_b')
+ | ---
+ | - true
+ | ...
+replicas = cfg.sharding[util.replicasets[1]].replicas
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_b].master = false
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_a].master = true
+ | ---
+ | ...
+vshard.storage.cfg(cfg, instance_uuid)
+ | ---
+ | ...
+
+--
+-- RW call uses a hint from the old master about who is the new master.
+--
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+forget_masters()
+ | ---
+ | ...
+test_run:wait_cond(check_all_masters_found)
+ | ---
+ | - true
+ | ...
+master_discovery_block()
+ | ---
+ | ...
+test_run:wait_cond(check_master_discovery_block)
+ | ---
+ | - true
+ | ...
+
+-- Change master while discovery is asleep.
+test_run:switch('storage_1_a')
+ | ---
+ | - true
+ | ...
+replicas = cfg.sharding[util.replicasets[1]].replicas
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_a].master = false
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_b].master = true
+ | ---
+ | ...
+vshard.storage.cfg(cfg, instance_uuid)
+ | ---
+ | ...
+
+test_run:switch('storage_1_b')
+ | ---
+ | - true
+ | ...
+replicas = cfg.sharding[util.replicasets[1]].replicas
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_a].master = false
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_b].master = true
+ | ---
+ | ...
+vshard.storage.cfg(cfg, instance_uuid)
+ | ---
+ | ...
+
+-- First request fails and tells where is the master. The second attempt works.
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+vshard.router.callrw(1501, 'echo', {1}, opts_big_timeout)
+ | ---
+ | - 1
+ | ...
+
+test_run:switch('storage_1_b')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 1)
+ | ---
+ | - true
+ | ...
+echo_count = 0
+ | ---
+ | ...
+
+--
+-- A non master error might contain no information about a new master.
+--
+-- Make the replicaset read-only.
+test_run:switch('storage_1_a')
+ | ---
+ | - true
+ | ...
+replicas = cfg.sharding[util.replicasets[1]].replicas
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_b].master = false
+ | ---
+ | ...
+vshard.storage.cfg(cfg, instance_uuid)
+ | ---
+ | ...
+
+test_run:switch('storage_1_b')
+ | ---
+ | - true
+ | ...
+replicas = cfg.sharding[util.replicasets[1]].replicas
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_b].master = false
+ | ---
+ | ...
+vshard.storage.cfg(cfg, instance_uuid)
+ | ---
+ | ...
+
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+-- A request should return no info about a new master. The router will wait for
+-- a new master discovery.
+f = fiber.create(function() \
+ fiber.self():set_joinable(true) \
+ return vshard.router.callrw(1501, 'echo', {1}, opts_big_timeout) \
+end)
+ | ---
+ | ...
+test_run:wait_cond(function() \
+ return check_no_master_for_replicaset(1) \
+end)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('storage_1_a')
+ | ---
+ | - true
+ | ...
+replicas = cfg.sharding[util.replicasets[1]].replicas
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_a].master = true
+ | ---
+ | ...
+vshard.storage.cfg(cfg, instance_uuid)
+ | ---
+ | ...
+
+test_run:switch('storage_1_b')
+ | ---
+ | - true
+ | ...
+replicas = cfg.sharding[util.replicasets[1]].replicas
+ | ---
+ | ...
+replicas[util.name_to_uuid.storage_1_a].master = true
+ | ---
+ | ...
+vshard.storage.cfg(cfg, instance_uuid)
+ | ---
+ | ...
+
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+master_discovery_unblock()
+ | ---
+ | ...
+test_run:wait_cond(check_all_masters_found)
+ | ---
+ | - true
+ | ...
+f:join()
+ | ---
+ | - true
+ | - 1
+ | ...
+
+test_run:switch('storage_1_a')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 1)
+ | ---
+ | - true
+ | ...
+echo_count = 0
+ | ---
+ | ...
+
+--
+-- Unit tests for master change in a replicaset object. Normally it can only
+-- happen in quite complicated cases. Hence the tests prefer to use the internal
+-- replicaset object instead.
+-- Disable the master search fiber so as it wouldn't interfere.
+--
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+master_discovery_block()
+ | ---
+ | ...
+test_run:wait_cond(check_master_discovery_block)
+ | ---
+ | - true
+ | ...
+rs = vshard.router.static.replicasets[util.replicasets[1]]
+ | ---
+ | ...
+storage_a_uuid = util.name_to_uuid.storage_1_a
+ | ---
+ | ...
+storage_b_uuid = util.name_to_uuid.storage_1_b
+ | ---
+ | ...
+
+assert(rs.master.uuid == storage_a_uuid)
+ | ---
+ | - true
+ | ...
+rs.master = nil
+ | ---
+ | ...
+rs.is_auto_master = false
+ | ---
+ | ...
+
+-- When auto-search is disabled and master is not known, nothing will make it
+-- known. It is up to the config.
+assert(not rs:update_master(storage_a_uuid, storage_b_uuid))
+ | ---
+ | - true
+ | ...
+assert(not rs.master)
+ | ---
+ | - true
+ | ...
+-- New master might be not reported.
+assert(not rs:update_master(storage_a_uuid))
+ | ---
+ | - true
+ | ...
+assert(not rs.master)
+ | ---
+ | - true
+ | ...
+
+-- With auto-search and not known master it is not assigned if a new master is
+-- not reported.
+rs.is_auto_master = true
+ | ---
+ | ...
+-- But update returns true, because it makes sense to try a next request later
+-- when the master is found.
+assert(rs:update_master(storage_a_uuid))
+ | ---
+ | - true
+ | ...
+assert(not rs.master)
+ | ---
+ | - true
+ | ...
+
+-- Report of a not known UUID won't assign the master.
+assert(rs:update_master(storage_a_uuid, util.name_to_uuid.storage_2_a))
+ | ---
+ | - true
+ | ...
+assert(not rs.master)
+ | ---
+ | - true
+ | ...
+
+-- Report of a known UUID assigns the master.
+assert(rs:update_master(storage_a_uuid, storage_b_uuid))
+ | ---
+ | - true
+ | ...
+assert(rs.master.uuid == storage_b_uuid)
+ | ---
+ | - true
+ | ...
+
+-- Master could change while the request's error was being received. Then the
+-- error should not change anything because it is outdated.
+assert(rs:update_master(storage_a_uuid))
+ | ---
+ | - true
+ | ...
+assert(rs.master.uuid == storage_b_uuid)
+ | ---
+ | - true
+ | ...
+-- It does not depend on auto-search. Still returns true, because if the master
+-- was changed since the request was sent, it means it could be retried and
+-- might succeed.
+rs.is_auto_master = false
+ | ---
+ | ...
+assert(rs:update_master(storage_a_uuid))
+ | ---
+ | - true
+ | ...
+assert(rs.master.uuid == storage_b_uuid)
+ | ---
+ | - true
+ | ...
+
+-- If the current master is reported as not a master and auto-search is
+-- disabled, update should fail. Because makes no sense to retry until a new
+-- config is applied externally.
+assert(not rs:update_master(storage_b_uuid, storage_a_uuid))
+ | ---
+ | - true
+ | ...
+assert(rs.master.uuid == storage_b_uuid)
+ | ---
+ | - true
+ | ...
+
+-- With auto-search, if the node is not a master and no new master is reported,
+-- the current master should be reset. Because makes no sense to send more RW
+-- requests to him. But update returns true, because the current request could
+-- be retried after waiting for a new master discovery.
+rs.is_auto_master = true
+ | ---
+ | ...
+assert(rs:update_master(storage_b_uuid))
+ | ---
+ | - true
+ | ...
+assert(rs.master == nil)
+ | ---
+ | - true
+ | ...
+
+-- When candidate is reported, and is known, it is used. But restore the master
+-- first to test its change.
+assert(rs:update_master(storage_b_uuid, storage_a_uuid))
+ | ---
+ | - true
+ | ...
+assert(rs.master.uuid == storage_a_uuid)
+ | ---
+ | - true
+ | ...
+-- Now update.
+assert(rs:update_master(storage_a_uuid, storage_b_uuid))
+ | ---
+ | - true
+ | ...
+assert(rs.master.uuid == storage_b_uuid)
+ | ---
+ | - true
+ | ...
+
+-- Candidate UUID might be not known in case the topology config is different on
+-- the router and on the storage. Then the master is simply reset.
+assert(rs:update_master(storage_b_uuid, util.name_to_uuid.storage_2_a))
+ | ---
+ | - true
+ | ...
+assert(rs.master == nil)
+ | ---
+ | - true
+ | ...
+
+master_discovery_unblock()
+ | ---
+ | ...
+test_run:wait_cond(check_all_masters_found)
+ | ---
+ | - true
+ | ...
+
_ = test_run:switch("default")
| ---
| ...
diff --git a/test/router/master_discovery.test.lua b/test/router/master_discovery.test.lua
index e087f58..70e5a72 100644
--- a/test/router/master_discovery.test.lua
+++ b/test/router/master_discovery.test.lua
@@ -67,6 +67,16 @@ function check_master_for_replicaset(rs_id, master_name)
return true \
end
+function check_no_master_for_replicaset(rs_id) \
+ local rs_uuid = util.replicasets[rs_id] \
+ local master = vshard.router.static.replicasets[rs_uuid].master \
+ if not master then \
+ return true \
+ end \
+ vshard.router.static.master_search_fiber:wakeup() \
+ return false \
+end
+
function check_all_buckets_found() \
if vshard.router.info().bucket.unknown == 0 then \
return true \
@@ -96,6 +106,22 @@ function stop_aggressive_master_search()
master_search_helper_f = nil \
end
+function master_discovery_block() \
+ vshard.router.internal.errinj.ERRINJ_MASTER_SEARCH_DELAY = true \
+end
+
+function check_master_discovery_block() \
+ if vshard.router.internal.errinj.ERRINJ_MASTER_SEARCH_DELAY == 'in' then \
+ return true \
+ end \
+ vshard.router.static.master_search_fiber:wakeup() \
+ return false \
+end
+
+function master_discovery_unblock() \
+ vshard.router.internal.errinj.ERRINJ_MASTER_SEARCH_DELAY = false \
+end
+
--
-- Simulate the first cfg when no masters are known.
--
@@ -240,6 +266,171 @@ do
}) \
end
+--
+-- Restore the old master back.
+--
+test_run:switch('storage_1_b')
+replicas = cfg.sharding[util.replicasets[1]].replicas
+replicas[util.name_to_uuid.storage_1_b].master = false
+replicas[util.name_to_uuid.storage_1_a].master = true
+vshard.storage.cfg(cfg, instance_uuid)
+
+--
+-- RW call uses a hint from the old master about who is the new master.
+--
+test_run:switch('router_1')
+forget_masters()
+test_run:wait_cond(check_all_masters_found)
+master_discovery_block()
+test_run:wait_cond(check_master_discovery_block)
+
+-- Change master while discovery is asleep.
+test_run:switch('storage_1_a')
+replicas = cfg.sharding[util.replicasets[1]].replicas
+replicas[util.name_to_uuid.storage_1_a].master = false
+replicas[util.name_to_uuid.storage_1_b].master = true
+vshard.storage.cfg(cfg, instance_uuid)
+
+test_run:switch('storage_1_b')
+replicas = cfg.sharding[util.replicasets[1]].replicas
+replicas[util.name_to_uuid.storage_1_a].master = false
+replicas[util.name_to_uuid.storage_1_b].master = true
+vshard.storage.cfg(cfg, instance_uuid)
+
+-- First request fails and tells where is the master. The second attempt works.
+test_run:switch('router_1')
+vshard.router.callrw(1501, 'echo', {1}, opts_big_timeout)
+
+test_run:switch('storage_1_b')
+assert(echo_count == 1)
+echo_count = 0
+
+--
+-- A non master error might contain no information about a new master.
+--
+-- Make the replicaset read-only.
+test_run:switch('storage_1_a')
+replicas = cfg.sharding[util.replicasets[1]].replicas
+replicas[util.name_to_uuid.storage_1_b].master = false
+vshard.storage.cfg(cfg, instance_uuid)
+
+test_run:switch('storage_1_b')
+replicas = cfg.sharding[util.replicasets[1]].replicas
+replicas[util.name_to_uuid.storage_1_b].master = false
+vshard.storage.cfg(cfg, instance_uuid)
+
+test_run:switch('router_1')
+-- A request should return no info about a new master. The router will wait for
+-- a new master discovery.
+f = fiber.create(function() \
+ fiber.self():set_joinable(true) \
+ return vshard.router.callrw(1501, 'echo', {1}, opts_big_timeout) \
+end)
+test_run:wait_cond(function() \
+ return check_no_master_for_replicaset(1) \
+end)
+
+test_run:switch('storage_1_a')
+replicas = cfg.sharding[util.replicasets[1]].replicas
+replicas[util.name_to_uuid.storage_1_a].master = true
+vshard.storage.cfg(cfg, instance_uuid)
+
+test_run:switch('storage_1_b')
+replicas = cfg.sharding[util.replicasets[1]].replicas
+replicas[util.name_to_uuid.storage_1_a].master = true
+vshard.storage.cfg(cfg, instance_uuid)
+
+test_run:switch('router_1')
+master_discovery_unblock()
+test_run:wait_cond(check_all_masters_found)
+f:join()
+
+test_run:switch('storage_1_a')
+assert(echo_count == 1)
+echo_count = 0
+
+--
+-- Unit tests for master change in a replicaset object. Normally it can only
+-- happen in quite complicated cases. Hence the tests prefer to use the internal
+-- replicaset object instead.
+-- Disable the master search fiber so as it wouldn't interfere.
+--
+test_run:switch('router_1')
+master_discovery_block()
+test_run:wait_cond(check_master_discovery_block)
+rs = vshard.router.static.replicasets[util.replicasets[1]]
+storage_a_uuid = util.name_to_uuid.storage_1_a
+storage_b_uuid = util.name_to_uuid.storage_1_b
+
+assert(rs.master.uuid == storage_a_uuid)
+rs.master = nil
+rs.is_auto_master = false
+
+-- When auto-search is disabled and master is not known, nothing will make it
+-- known. It is up to the config.
+assert(not rs:update_master(storage_a_uuid, storage_b_uuid))
+assert(not rs.master)
+-- New master might be not reported.
+assert(not rs:update_master(storage_a_uuid))
+assert(not rs.master)
+
+-- With auto-search and not known master it is not assigned if a new master is
+-- not reported.
+rs.is_auto_master = true
+-- But update returns true, because it makes sense to try a next request later
+-- when the master is found.
+assert(rs:update_master(storage_a_uuid))
+assert(not rs.master)
+
+-- Report of a not known UUID won't assign the master.
+assert(rs:update_master(storage_a_uuid, util.name_to_uuid.storage_2_a))
+assert(not rs.master)
+
+-- Report of a known UUID assigns the master.
+assert(rs:update_master(storage_a_uuid, storage_b_uuid))
+assert(rs.master.uuid == storage_b_uuid)
+
+-- Master could change while the request's error was being received. Then the
+-- error should not change anything because it is outdated.
+assert(rs:update_master(storage_a_uuid))
+assert(rs.master.uuid == storage_b_uuid)
+-- It does not depend on auto-search. Still returns true, because if the master
+-- was changed since the request was sent, it means it could be retried and
+-- might succeed.
+rs.is_auto_master = false
+assert(rs:update_master(storage_a_uuid))
+assert(rs.master.uuid == storage_b_uuid)
+
+-- If the current master is reported as not a master and auto-search is
+-- disabled, update should fail. Because makes no sense to retry until a new
+-- config is applied externally.
+assert(not rs:update_master(storage_b_uuid, storage_a_uuid))
+assert(rs.master.uuid == storage_b_uuid)
+
+-- With auto-search, if the node is not a master and no new master is reported,
+-- the current master should be reset. Because makes no sense to send more RW
+-- requests to him. But update returns true, because the current request could
+-- be retried after waiting for a new master discovery.
+rs.is_auto_master = true
+assert(rs:update_master(storage_b_uuid))
+assert(rs.master == nil)
+
+-- When candidate is reported, and is known, it is used. But restore the master
+-- first to test its change.
+assert(rs:update_master(storage_b_uuid, storage_a_uuid))
+assert(rs.master.uuid == storage_a_uuid)
+-- Now update.
+assert(rs:update_master(storage_a_uuid, storage_b_uuid))
+assert(rs.master.uuid == storage_b_uuid)
+
+-- Candidate UUID might be not known in case the topology config is different on
+-- the router and on the storage. Then the master is simply reset.
+assert(rs:update_master(storage_b_uuid, util.name_to_uuid.storage_2_a))
+assert(rs.master == nil)
+
+master_discovery_unblock()
+test_run:wait_cond(check_all_masters_found)
+
_ = test_run:switch("default")
_ = test_run:cmd("stop server router_1")
_ = test_run:cmd("cleanup server router_1")
diff --git a/test/router/router.result b/test/router/router.result
index 98dd5b5..8a0e30d 100644
--- a/test/router/router.result
+++ b/test/router/router.result
@@ -1105,11 +1105,12 @@ _ = test_run:switch("router_1")
util.check_error(vshard.router.call, 1, 'write', 'echo', { 'hello world' })
---
- null
-- replica_uuid: <storage_2_a>
- replicaset_uuid: <replicaset_2>
- type: ShardingError
+- master_uuid: <storage_2_b>
+ replica_uuid: <storage_2_a>
message: Replica <storage_2_a> is not a master for replicaset
<replicaset_2> anymore
+ type: ShardingError
+ replicaset_uuid: <replicaset_2>
name: NON_MASTER
code: 2
...
@@ -1175,6 +1176,7 @@ error_messages
- Use replicaset:connect_replica(...) instead of replicaset.connect_replica(...)
- Use replicaset:down_replica_priority(...) instead of replicaset.down_replica_priority(...)
- Use replicaset:up_replica_priority(...) instead of replicaset.up_replica_priority(...)
+ - Use replicaset:update_master(...) instead of replicaset.update_master(...)
- Use replicaset:wait_connected(...) instead of replicaset.wait_connected(...)
...
_, replica = next(replicaset.replicas)
diff --git a/vshard/error.lua b/vshard/error.lua
index e2d1a31..fa7bdaa 100644
--- a/vshard/error.lua
+++ b/vshard/error.lua
@@ -20,7 +20,7 @@ local error_message_template = {
[2] = {
name = 'NON_MASTER',
msg = 'Replica %s is not a master for replicaset %s anymore',
- args = {'replica_uuid', 'replicaset_uuid'}
+ args = {'replica_uuid', 'replicaset_uuid', 'master_uuid'}
},
[3] = {
name = 'BUCKET_ALREADY_EXISTS',
diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
index 7747258..660c786 100644
--- a/vshard/replicaset.lua
+++ b/vshard/replicaset.lua
@@ -570,6 +570,70 @@ local function rebind_replicasets(replicasets, old_replicasets)
end
end
+--
+-- Let the replicaset know @a old_master_uuid is not a master anymore, should
+-- use @a candidate_uuid instead.
+-- Returns whether the request, which brought this information, can be retried.
+--
+local function replicaset_update_master(replicaset, old_master_uuid,
+ candidate_uuid)
+ local is_auto = replicaset.is_auto_master
+ local replicaset_uuid = replicaset.uuid
+ if old_master_uuid == candidate_uuid then
+ -- It should not happen ever, but be ready to everything.
+        log.warn('Replica %s in replicaset %s reports self as both master '..
+                 'and not master', old_master_uuid, replicaset_uuid)
+ return is_auto
+ end
+ local master = replicaset.master
+ if not master then
+ if not is_auto or not candidate_uuid then
+ return is_auto
+ end
+ local candidate = replicaset.replicas[candidate_uuid]
+ if not candidate then
+ return true
+ end
+ replicaset.master = candidate
+ log.info('Replica %s becomes a master as reported by %s for '..
+ 'replicaset %s', candidate_uuid, old_master_uuid,
+ replicaset_uuid)
+ return true
+ end
+ local master_uuid = master.uuid
+ if master_uuid ~= old_master_uuid then
+ -- Master was changed while the master update information was coming.
+ -- It means it is outdated and should be ignored.
+ -- Return true regardless of the auto-master config. Because the master
+ -- change means the caller's request has a chance to succeed on the new
+ -- master on retry.
+ return true
+ end
+ if not is_auto then
+ log.warn('Replica %s is not master for replicaset %s anymore, please '..
+ 'update configuration', master_uuid, replicaset_uuid)
+ return false
+ end
+ if not candidate_uuid then
+ replicaset.master = nil
+ log.warn('Replica %s is not a master anymore for replicaset %s, no '..
+ 'candidate was reported', master_uuid, replicaset_uuid)
+ return true
+ end
+ local candidate = replicaset.replicas[candidate_uuid]
+ if candidate then
+ replicaset.master = candidate
+ log.info('Replica %s becomes master instead of %s for replicaset %s',
+ candidate_uuid, master_uuid, replicaset_uuid)
+ else
+ replicaset.master = nil
+ log.warn('Replica %s is not a master anymore for replicaset %s, new '..
+ 'master %s could not be found in the config',
+ master_uuid, replicaset_uuid, candidate_uuid)
+ end
+ return true
+end
+
--
-- Check if the master is still master, and find a new master if there is no a
-- known one.
@@ -693,6 +757,7 @@ local replicaset_mt = {
callbro = replicaset_template_multicallro(false, true);
callre = replicaset_template_multicallro(true, false);
callbre = replicaset_template_multicallro(true, true);
+ update_master = replicaset_update_master,
};
__tostring = replicaset_tostring;
}
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index 9407ccd..b36ee8e 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -33,6 +33,7 @@ if not M then
ERRINJ_FAILOVER_CHANGE_CFG = false,
ERRINJ_RELOAD = false,
ERRINJ_LONG_DISCOVERY = false,
+ ERRINJ_MASTER_SEARCH_DELAY = false,
},
-- Dictionary, key is router name, value is a router.
routers = {},
@@ -620,12 +621,11 @@ local function router_call_impl(router, bucket_id, mode, prefer_replica,
bucket_reset(router, bucket_id)
return nil, err
elseif err.code == lerror.code.NON_MASTER then
- -- Same, as above - do not wait and repeat.
assert(mode == 'write')
- log.warn("Replica %s is not master for replicaset %s anymore,"..
- "please update configuration!",
- replicaset.master.uuid, replicaset.uuid)
- return nil, err
+ if not replicaset:update_master(err.replica_uuid,
+ err.master_uuid) then
+ return nil, err
+ end
else
return nil, err
end
@@ -1059,7 +1059,14 @@ end
local function master_search_f(router)
local module_version = M.module_version
local is_in_progress = false
+ local errinj = M.errinj
while module_version == M.module_version do
+ if errinj.ERRINJ_MASTER_SEARCH_DELAY then
+ errinj.ERRINJ_MASTER_SEARCH_DELAY = 'in'
+ repeat
+ lfiber.sleep(0.001)
+ until not errinj.ERRINJ_MASTER_SEARCH_DELAY
+ end
local timeout
local start_time = fiber_clock()
local is_done, is_nop, err = master_search_step(router)
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 465abc5..3158f3c 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -971,9 +971,13 @@ local function bucket_check_state(bucket_id, mode)
end
reason = 'write is prohibited'
elseif M.this_replicaset.master ~= M.this_replica then
+ local master_uuid = M.this_replicaset.master
+ if master_uuid then
+ master_uuid = master_uuid.uuid
+ end
return bucket, lerror.vshard(lerror.code.NON_MASTER,
M.this_replica.uuid,
- M.this_replicaset.uuid)
+ M.this_replicaset.uuid, master_uuid)
else
return bucket
end
--
2.24.3 (Apple Git-128)
More information about the Tarantool-patches
mailing list