From: Vladislav Shpilevoy
Subject: [tarantool-patches] [PATCH vshard 1/1] storage: do read_only + sync on master demotion before box.cfg
Date: Fri, 23 Mar 2018 01:11:45 +0300
To: tarantool-patches@freelists.org
Cc: georgy@tarantool.org, Vladislav Shpilevoy

Before a master is demoted and its box.cfg is updated, it must turn
read_only mode on and try to sync with slaves. Without sync and
read_only before box.cfg, new data can appear during the box
reconfiguration and never get synced.

Closes #84
---
Branch: https://github.com/tarantool/vshard/tree/gh-84-sync-before-cfg
Issue: https://github.com/tarantool/vshard/issues/84

 test/misc/reconfigure.result | 14 ++++
 test/misc/reconfigure.test.lua | 4 ++
 test/storage/demote_sync_errinj.result | 110 +++++++++++++++++++++++++++++++
 test/storage/demote_sync_errinj.test.lua | 39 +++++++++++
 test/storage/read_only_slave.result | 27 +++++---
 test/storage/read_only_slave.test.lua | 16 +++--
 test/unit/config.result | 23 +++++++
 test/unit/config.test.lua | 11 ++++
 vshard/cfg.lua | 7 +-
 vshard/consts.lua | 3 +-
 vshard/storage/init.lua | 76 +++++++++++++++++++--
 11 files changed, 303 insertions(+), 27 deletions(-)
 create mode 100644 test/storage/demote_sync_errinj.result
 create mode 100644 test/storage/demote_sync_errinj.test.lua

diff --git a/test/misc/reconfigure.result b/test/misc/reconfigure.result index 0a014a3..c7960b3 100644 --- a/test/misc/reconfigure.result +++ b/test/misc/reconfigure.result @@ -100,6 +100,13 @@ not vshard.storage.internal.collect_lua_garbage --- - true ... +vshard.storage.internal.sync_timeout +--- +- 1 +... +cfg.sync_timeout = 100 +--- +... cfg.collect_lua_garbage = true --- ... @@ -120,6 +127,10 @@ not vshard.storage.internal.collect_lua_garbage --- - true ... +vshard.storage.internal.sync_timeout +--- +- 1 +... vshard.storage.internal.rebalancer_max_receiving ~= 1000 --- - true ... @@ -128,6 +139,9 @@ vshard.storage.internal.collect_bucket_garbage_interval ~= 100 --- - true ... +cfg.sync_timeout = nil +--- +... cfg.collect_lua_garbage = nil --- ...
diff --git a/test/misc/reconfigure.test.lua b/test/misc/reconfigure.test.lua index 033214a..25dc2ca 100644 --- a/test/misc/reconfigure.test.lua +++ b/test/misc/reconfigure.test.lua @@ -41,14 +41,18 @@ util.check_error(vshard.storage.cfg, cfg, 'unknow uuid') -- changed.
-- not vshard.storage.internal.collect_lua_garbage +vshard.storage.internal.sync_timeout +cfg.sync_timeout = 100 cfg.collect_lua_garbage = true cfg.rebalancer_max_receiving = 1000 cfg.collect_bucket_garbage_interval = 100 cfg.invalid_option = 'kek' vshard.storage.cfg(cfg, names.storage_1_a) not vshard.storage.internal.collect_lua_garbage +vshard.storage.internal.sync_timeout vshard.storage.internal.rebalancer_max_receiving ~= 1000 vshard.storage.internal.collect_bucket_garbage_interval ~= 100 +cfg.sync_timeout = nil cfg.collect_lua_garbage = nil cfg.rebalancer_max_receiving = nil cfg.collect_bucket_garbage_interval = nil diff --git a/test/storage/demote_sync_errinj.result b/test/storage/demote_sync_errinj.result new file mode 100644 index 0000000..2d14504 --- /dev/null +++ b/test/storage/demote_sync_errinj.result @@ -0,0 +1,110 @@ +test_run = require('test_run').new() +--- +... +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } +--- +... +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } +--- +... +test_run:create_cluster(REPLICASET_1, 'storage') +--- +... +test_run:create_cluster(REPLICASET_2, 'storage') +--- +... +util = require('util') +--- +... +util.wait_master(test_run, REPLICASET_1, 'storage_1_a') +--- +... +util.wait_master(test_run, REPLICASET_2, 'storage_2_a') +--- +... +test_run:switch('storage_1_a') +--- +- true +... +fiber = require('fiber') +--- +... +s = box.schema.create_space('test') +--- +... +pk = s:create_index('pk') +--- +... +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = true +--- +... +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true +--- +... +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false +--- +... +f = fiber.create(function() vshard.storage.cfg(cfg, names.storage_1_a) end) +--- +... +f:status() +--- +- suspended +... +-- Can not write - read only mode is already on. +s:replace{1} +--- +- error: Can't modify data because this instance is in read-only mode. +... +test_run:switch('storage_1_b') +--- +- true +... +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true +--- +... +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false +--- +... +vshard.storage.cfg(cfg, names.storage_1_b) +--- +... +box.space.test:select{} +--- +- [] +... +test_run:switch('storage_1_a') +--- +- true +... +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = false +--- +... +while f:status() ~= 'dead' do fiber.sleep(0.1) end +--- +... +s:select{} +--- +- [] +... +test_run:switch('storage_1_b') +--- +- true +... +box.space.test:select{} +--- +- [] +... +box.space.test:drop() +--- +... +test_run:cmd("switch default") +--- +- true +... +test_run:drop_cluster(REPLICASET_2) +--- +... +test_run:drop_cluster(REPLICASET_1) +--- +... 
diff --git a/test/storage/demote_sync_errinj.test.lua b/test/storage/demote_sync_errinj.test.lua new file mode 100644 index 0000000..f12ab60 --- /dev/null +++ b/test/storage/demote_sync_errinj.test.lua @@ -0,0 +1,39 @@ +test_run = require('test_run').new() +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } +test_run:create_cluster(REPLICASET_1, 'storage') +test_run:create_cluster(REPLICASET_2, 'storage') +util = require('util') +util.wait_master(test_run, REPLICASET_1, 'storage_1_a') +util.wait_master(test_run, REPLICASET_2, 'storage_2_a') + +test_run:switch('storage_1_a') +fiber = require('fiber') +s = box.schema.create_space('test') +pk = s:create_index('pk') +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = true +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false +f = fiber.create(function() vshard.storage.cfg(cfg, names.storage_1_a) end) +f:status() +-- Can not write - read only mode is already on. +s:replace{1} + +test_run:switch('storage_1_b') +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false +vshard.storage.cfg(cfg, names.storage_1_b) +box.space.test:select{} + +test_run:switch('storage_1_a') +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = false +while f:status() ~= 'dead' do fiber.sleep(0.1) end +s:select{} + +test_run:switch('storage_1_b') +box.space.test:select{} +box.space.test:drop() + +test_run:cmd("switch default") +test_run:drop_cluster(REPLICASET_2) +test_run:drop_cluster(REPLICASET_1) diff --git a/test/storage/read_only_slave.result b/test/storage/read_only_slave.result index 9d5c85e..07d3653 100644 --- a/test/storage/read_only_slave.result +++ b/test/storage/read_only_slave.result @@ -36,10 +36,17 @@ s = box.schema.create_space('test') pk = s:create_index('pk') --- ... +ok = nil +--- +... +err = nil +--- +... function on_master_enable() s:replace{1} end --- ... -function on_master_disable() s:replace{2} end +-- Test, that in disable trigger already can not write. +function on_master_disable() ok, err = pcall(s.replace, s, {2}) end --- ... vshard.storage.on_master_enable(on_master_enable) @@ -85,8 +92,8 @@ vshard.storage.on_master_enable(on_master_enable) vshard.storage.on_master_disable(on_master_disable) --- ... --- Yes, there is no 3 or 4, because trigger is set after a replica --- becames slave. +-- Yes, there is no 3 or 4, because a trigger on disable always +-- works in readonly. s:select{} --- - - [1] @@ -128,28 +135,28 @@ box.cfg.read_only --- - true ... +ok, err +--- +- false +- Can't modify data because this instance is in read-only mode. +... fiber = require('fiber') --- ... -while s:count() ~= 3 do fiber.sleep(0.1) end +while s:count() ~= 2 do fiber.sleep(0.1) end --- ... s:select{} --- - - [1] - - [2] - [3] ... test_run:switch('storage_1_b') --- - true ... --- Yes, there is no {2}, because replication source is unset --- already. -s:select{} +s:drop() --- -- - [1] - - [3] ... 
test_run:cmd("switch default") --- diff --git a/test/storage/read_only_slave.test.lua b/test/storage/read_only_slave.test.lua index 3253ad7..d49e6f2 100644 --- a/test/storage/read_only_slave.test.lua +++ b/test/storage/read_only_slave.test.lua @@ -11,8 +11,11 @@ test_run:switch('storage_1_a') box.cfg.read_only s = box.schema.create_space('test') pk = s:create_index('pk') +ok = nil +err = nil function on_master_enable() s:replace{1} end -function on_master_disable() s:replace{2} end +-- Test, that in disable trigger already can not write. +function on_master_disable() ok, err = pcall(s.replace, s, {2}) end vshard.storage.on_master_enable(on_master_enable) vshard.storage.on_master_disable(on_master_disable) s:select{} @@ -27,8 +30,8 @@ function on_master_enable() s:replace{3} end function on_master_disable() if not box.cfg.read_only then s:replace{4} end end vshard.storage.on_master_enable(on_master_enable) vshard.storage.on_master_disable(on_master_disable) --- Yes, there is no 3 or 4, because trigger is set after a replica --- becames slave. +-- Yes, there is no 3 or 4, because a trigger on disable always +-- works in readonly. s:select{} -- Check that after master change the read_only is updated, and @@ -44,14 +47,13 @@ cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false vshard.storage.cfg(cfg, names.storage_1_a) box.cfg.read_only +ok, err fiber = require('fiber') -while s:count() ~= 3 do fiber.sleep(0.1) end +while s:count() ~= 2 do fiber.sleep(0.1) end s:select{} test_run:switch('storage_1_b') --- Yes, there is no {2}, because replication source is unset --- already. -s:select{} +s:drop() test_run:cmd("switch default") test_run:drop_cluster(REPLICASET_2) diff --git a/test/unit/config.result b/test/unit/config.result index 2a5473c..8b098fb 100644 --- a/test/unit/config.result +++ b/test/unit/config.result @@ -419,3 +419,26 @@ cfg.collect_lua_garbage = false _ = lcfg.check(cfg) --- ... +-- +-- gh-84: sync before master demotion, and allow to configure +-- sync timeout. +-- +cfg.sync_timeout = -100 +--- +... +check(cfg) +--- +- Sync timeout must be non-negative number +... +cfg.sync_timeout = 0 +--- +... +_ = lcfg.check(cfg) +--- +... +cfg.sync_timeout = 10.5 +--- +... +_ = lcfg.check(cfg) +--- +... diff --git a/test/unit/config.test.lua b/test/unit/config.test.lua index 140c38c..0990d0e 100644 --- a/test/unit/config.test.lua +++ b/test/unit/config.test.lua @@ -166,3 +166,14 @@ cfg.collect_lua_garbage = true _ = lcfg.check(cfg) cfg.collect_lua_garbage = false _ = lcfg.check(cfg) + +-- +-- gh-84: sync before master demotion, and allow to configure +-- sync timeout. 
+-- +cfg.sync_timeout = -100 +check(cfg) +cfg.sync_timeout = 0 +_ = lcfg.check(cfg) +cfg.sync_timeout = 10.5 +_ = lcfg.check(cfg)
diff --git a/vshard/cfg.lua b/vshard/cfg.lua index c578af2..9860676 100644 --- a/vshard/cfg.lua +++ b/vshard/cfg.lua @@ -198,7 +198,11 @@ local cfg_template = { {'collect_lua_garbage', { type = 'boolean', name = 'Garbage Lua collect necessity', is_optional = true, default = false - }} + }}, + {'sync_timeout', { + type = 'non-negative number', name = 'Sync timeout', is_optional = true, + default = consts.DEFAULT_SYNC_TIMEOUT + }}, } -- @@ -227,6 +231,7 @@ local function remove_non_box_options(cfg) cfg.shard_index = nil cfg.collect_bucket_garbage_interval = nil cfg.collect_lua_garbage = nil + cfg.sync_timeout = nil end return {
diff --git a/vshard/consts.lua b/vshard/consts.lua index 47651d7..1a0e4ab 100644 --- a/vshard/consts.lua +++ b/vshard/consts.lua @@ -20,7 +20,6 @@ return { REPLICATION_THRESHOLD_FAIL = 10, DEFAULT_BUCKET_COUNT = 3000; - BUCKET_SYNC_TIMEOUT = 0.1; BUCKET_SENT_GARBAGE_DELAY = 0.5; DEFAULT_REBALANCER_DISBALANCE_THRESHOLD = 1; REBALANCER_IDLE_INTERVAL = 60 * 60; @@ -30,7 +29,7 @@ return { CALL_TIMEOUT_MAX = 64; FAILOVER_UP_TIMEOUT = 5; FAILOVER_DOWN_TIMEOUT = 1; - SYNC_TIMEOUT = 1; + DEFAULT_SYNC_TIMEOUT = 1; RECONNECT_TIMEOUT = 0.5; DEFAULT_COLLECT_BUCKET_GARBAGE_INTERVAL = 0.5; RECOVERY_INTERVAL = 5;
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index a0f216e..6d66b05 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -35,10 +35,12 @@ if not M then errinj = { ERRINJ_BUCKET_FIND_GARBAGE_DELAY = false, ERRINJ_RELOAD = false, + ERRINJ_CFG_DELAY = false, }, -- This counter is used to restart background fibers with -- new reloaded code. module_version = 0, + sync_timeout = consts.DEFAULT_SYNC_TIMEOUT, ------------------- Garbage collection ------------------- -- Fiber to remove garbage buckets data. @@ -305,7 +307,7 @@ local function sync(timeout) end log.debug("Synchronizing replicaset...") - timeout = timeout or consts.SYNC_TIMEOUT + timeout = timeout or M.sync_timeout local vclock = box.info.vclock local tstart = lfiber.time() repeat @@ -539,14 +541,31 @@ local function buckets_discovery() return ret end +-- +-- The only thing that must be done to abort a master demotion +-- is to reset read_only. +-- +local function local_on_master_disable_abort() + box.cfg{read_only = false} +end + +-- +-- Prepare for a master demotion. Before it, the master must +-- stop accepting writes and try to wait until all of its data +-- is replicated to each slave. +-- +local function local_on_master_disable_prepare() + log.info("Resigning from the replicaset master role...") + box.cfg({read_only = true}) + sync(M.sync_timeout) +end + -- -- This function executes when a master role is removed from local -- instance during configuration -- local function local_on_master_disable() M._on_master_disable:run() - box.cfg({read_only = true}) - log.verbose("Resigning from the replicaset master role...") -- Stop garbage collecting if M.collect_bucket_garbage_fiber ~= nil then M.collect_bucket_garbage_fiber:cancel() @@ -558,13 +577,29 @@ local function local_on_master_disable() M.recovery_fiber = nil log.info('Recovery stopped') end - -- Wait until replicas are synchronized before one another become a new master - sync(consts.SYNC_TIMEOUT) log.info("Resigned from the replicaset master role") end local collect_garbage_f +-- +-- The only thing that must be done to abort a master promotion +-- is to set read_only back to true.
+-- +local function local_on_master_enable_abort() + box.cfg({read_only = true}) +end + +-- +-- Promotion does not require a sync, because a replica can not +-- have data that is absent on the current master - the replica +-- is read only. But read_only can not be set to false here, +-- because until box.cfg is called, it can not be guaranteed +-- that the promotion will be successful. +-- +local function local_on_master_enable_prepare() + log.info("Taking on replicaset master role...") +end -- -- This function executes whan a master role is added to local -- instance during configuration @@ -572,7 +607,6 @@ local collect_garbage_f local function local_on_master_enable() box.cfg({read_only = false}) M._on_master_enable:run() - log.verbose("Taking on replicaset master role...") recovery_garbage_receiving_buckets() -- Start background process to collect garbage. M.collect_bucket_garbage_fiber = @@ -1348,9 +1382,37 @@ local function storage_cfg(cfg, this_replica_uuid) local shard_index = cfg.shard_index local collect_bucket_garbage_interval = cfg.collect_bucket_garbage_interval local collect_lua_garbage = cfg.collect_lua_garbage + -- + -- Sync timeout is a special case - it must be updated before + -- all other options to allow a user to demote a master with + -- a new sync timeout. + -- + local old_sync_timeout = M.sync_timeout + M.sync_timeout = cfg.sync_timeout lcfg.remove_non_box_options(cfg) - box.cfg(cfg) + if was_master and not is_master then + local_on_master_disable_prepare() + end + if not was_master and is_master then + local_on_master_enable_prepare() + end + + local ok, err = pcall(box.cfg, cfg) + while M.errinj.ERRINJ_CFG_DELAY do + lfiber.sleep(0.01) + end + if not ok then + M.sync_timeout = old_sync_timeout + if was_master and not is_master then + local_on_master_disable_abort() + end + if not was_master and is_master then + local_on_master_enable_abort() + end + error(err) + end + log.info("Box has been configured") local uri = luri.parse(this_replica.uri) box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password) -- 2.14.3 (Apple Git-98)
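
Reviewer note (not part of the patch): the reconfiguration ordering introduced above can be condensed to the sketch below. It only illustrates the flow of storage_cfg() together with the local_on_master_disable_prepare()/local_on_master_enable_prepare() helpers and their *_abort() counterparts from the diff; the function name apply_box_cfg is hypothetical, and error classes, fiber management and the ERRINJ_CFG_DELAY loop are omitted. M, sync() and log are the module-local names from vshard/storage/init.lua.

    -- Simplified sketch of the new ordering; assumes the
    -- vshard.storage.init context (M, sync(), log), not literal patch code.
    local function apply_box_cfg(box_cfg, was_master, is_master)
        if was_master and not is_master then
            -- Demotion: refuse new writes and push local data to the
            -- replicas BEFORE box.cfg{} rewires replication.
            box.cfg({read_only = true})
            sync(M.sync_timeout)
        elseif not was_master and is_master then
            -- Promotion: nothing to sync, the instance was read-only.
            log.info("Taking on replicaset master role...")
        end
        local ok, err = pcall(box.cfg, box_cfg)
        if not ok then
            -- box.cfg{} failed: roll the prepared state back.
            if was_master and not is_master then
                box.cfg({read_only = false})
            elseif not was_master and is_master then
                box.cfg({read_only = true})
            end
            error(err)
        end
    end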
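
Usage note (illustration, not from the patch): the new sync_timeout option bounds how long a demoted master waits in sync() before box.cfg is applied. The cfg table and the UUID variables below are placeholders; vshard.storage.cfg(cfg, instance_uuid) is the same call used throughout the tests above.

    -- Hypothetical demotion of the current master in favour of another
    -- replica; cfg, replicaset_uuid and *_master_uuid are placeholders.
    local vshard = require('vshard')

    cfg.sync_timeout = 30 -- wait up to 30 seconds for replicas to catch up
    cfg.sharding[replicaset_uuid].replicas[old_master_uuid].master = false
    cfg.sharding[replicaset_uuid].replicas[new_master_uuid].master = true

    -- On the old master: it goes read_only and syncs before box.cfg is applied.
    vshard.storage.cfg(cfg, old_master_uuid)
    -- On the new master: it leaves read_only mode and takes the master role.
    vshard.storage.cfg(cfg, new_master_uuid)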