From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 0FEFF283CD for ; Mon, 26 Mar 2018 06:38:27 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id SXVEof97z8Iw for ; Mon, 26 Mar 2018 06:38:26 -0400 (EDT) Received: from smtp43.i.mail.ru (smtp43.i.mail.ru [94.100.177.103]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 1A3D72834D for ; Mon, 26 Mar 2018 06:38:25 -0400 (EDT) From: Georgy Kirichenko Subject: [tarantool-patches] Re: [PATCH vshard 1/1] storage: do read_only + sync on master demotion before box.cfg Date: Mon, 26 Mar 2018 13:38:19 +0300 Message-ID: <23427395.8AgaDehIWV@localhost> In-Reply-To: References: MIME-Version: 1.0 Content-Type: multipart/signed; boundary="nextPart3923829.ZClXRgqJih"; micalg="pgp-sha256"; protocol="application/pgp-signature" Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org Cc: Vladislav Shpilevoy --nextPart3923829.ZClXRgqJih Content-Transfer-Encoding: 7Bit Content-Type: text/plain; charset="us-ascii" Likes Ok for me On Friday, March 23, 2018 1:11:45 AM MSK Vladislav Shpilevoy wrote: > Before a master is demoted and updated its box.cfg it must turn > read_only mode on and try to sync with slaves. With no sync and > read_only before box.cfg it is possible, that some new data will > appear during box reconfiguration, and it will not be synced. > > Closes #84 > --- > > Branch: https://github.com/tarantool/vshard/tree/gh-84-sync-before-cfg > Issue: https://github.com/tarantool/vshard/issues/84 > > test/misc/reconfigure.result | 14 ++++ > test/misc/reconfigure.test.lua | 4 ++ > test/storage/demote_sync_errinj.result | 110 > +++++++++++++++++++++++++++++++ test/storage/demote_sync_errinj.test.lua | > 39 +++++++++++ > test/storage/read_only_slave.result | 27 +++++--- > test/storage/read_only_slave.test.lua | 16 +++-- > test/unit/config.result | 23 +++++++ > test/unit/config.test.lua | 11 ++++ > vshard/cfg.lua | 7 +- > vshard/consts.lua | 3 +- > vshard/storage/init.lua | 76 +++++++++++++++++++-- > 11 files changed, 303 insertions(+), 27 deletions(-) > create mode 100644 test/storage/demote_sync_errinj.result > create mode 100644 test/storage/demote_sync_errinj.test.lua > > diff --git a/test/misc/reconfigure.result b/test/misc/reconfigure.result > index 0a014a3..c7960b3 100644 > --- a/test/misc/reconfigure.result > +++ b/test/misc/reconfigure.result > @@ -100,6 +100,13 @@ not vshard.storage.internal.collect_lua_garbage > --- > - true > ... > +vshard.storage.internal.sync_timeout > +--- > +- 1 > +... > +cfg.sync_timeout = 100 > +--- > +... > cfg.collect_lua_garbage = true > --- > ... > @@ -120,6 +127,10 @@ not vshard.storage.internal.collect_lua_garbage > --- > - true > ... > +vshard.storage.internal.sync_timeout > +--- > +- 1 > +... > vshard.storage.internal.rebalancer_max_receiving ~= 1000 > --- > - true > @@ -128,6 +139,9 @@ vshard.storage.internal.collect_bucket_garbage_interval > ~= 100 --- > - true > ... > +cfg.sync_timeout = nil > +--- > +... > cfg.collect_lua_garbage = nil > --- > ... > diff --git a/test/misc/reconfigure.test.lua b/test/misc/reconfigure.test.lua > index 033214a..25dc2ca 100644 > --- a/test/misc/reconfigure.test.lua > +++ b/test/misc/reconfigure.test.lua > @@ -41,14 +41,18 @@ util.check_error(vshard.storage.cfg, cfg, 'unknow uuid') > -- changed. > -- > not vshard.storage.internal.collect_lua_garbage > +vshard.storage.internal.sync_timeout > +cfg.sync_timeout = 100 > cfg.collect_lua_garbage = true > cfg.rebalancer_max_receiving = 1000 > cfg.collect_bucket_garbage_interval = 100 > cfg.invalid_option = 'kek' > vshard.storage.cfg(cfg, names.storage_1_a) > not vshard.storage.internal.collect_lua_garbage > +vshard.storage.internal.sync_timeout > vshard.storage.internal.rebalancer_max_receiving ~= 1000 > vshard.storage.internal.collect_bucket_garbage_interval ~= 100 > +cfg.sync_timeout = nil > cfg.collect_lua_garbage = nil > cfg.rebalancer_max_receiving = nil > cfg.collect_bucket_garbage_interval = nil > diff --git a/test/storage/demote_sync_errinj.result > b/test/storage/demote_sync_errinj.result new file mode 100644 > index 0000000..2d14504 > --- /dev/null > +++ b/test/storage/demote_sync_errinj.result > @@ -0,0 +1,110 @@ > +test_run = require('test_run').new() > +--- > +... > +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } > +--- > +... > +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } > +--- > +... > +test_run:create_cluster(REPLICASET_1, 'storage') > +--- > +... > +test_run:create_cluster(REPLICASET_2, 'storage') > +--- > +... > +util = require('util') > +--- > +... > +util.wait_master(test_run, REPLICASET_1, 'storage_1_a') > +--- > +... > +util.wait_master(test_run, REPLICASET_2, 'storage_2_a') > +--- > +... > +test_run:switch('storage_1_a') > +--- > +- true > +... > +fiber = require('fiber') > +--- > +... > +s = box.schema.create_space('test') > +--- > +... > +pk = s:create_index('pk') > +--- > +... > +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = true > +--- > +... > +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true > +--- > +... > +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false > +--- > +... > +f = fiber.create(function() vshard.storage.cfg(cfg, names.storage_1_a) end) > +--- > +... > +f:status() > +--- > +- suspended > +... > +-- Can not write - read only mode is already on. > +s:replace{1} > +--- > +- error: Can't modify data because this instance is in read-only mode. > +... > +test_run:switch('storage_1_b') > +--- > +- true > +... > +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true > +--- > +... > +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false > +--- > +... > +vshard.storage.cfg(cfg, names.storage_1_b) > +--- > +... > +box.space.test:select{} > +--- > +- [] > +... > +test_run:switch('storage_1_a') > +--- > +- true > +... > +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = false > +--- > +... > +while f:status() ~= 'dead' do fiber.sleep(0.1) end > +--- > +... > +s:select{} > +--- > +- [] > +... > +test_run:switch('storage_1_b') > +--- > +- true > +... > +box.space.test:select{} > +--- > +- [] > +... > +box.space.test:drop() > +--- > +... > +test_run:cmd("switch default") > +--- > +- true > +... > +test_run:drop_cluster(REPLICASET_2) > +--- > +... > +test_run:drop_cluster(REPLICASET_1) > +--- > +... > diff --git a/test/storage/demote_sync_errinj.test.lua > b/test/storage/demote_sync_errinj.test.lua new file mode 100644 > index 0000000..f12ab60 > --- /dev/null > +++ b/test/storage/demote_sync_errinj.test.lua > @@ -0,0 +1,39 @@ > +test_run = require('test_run').new() > +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } > +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } > +test_run:create_cluster(REPLICASET_1, 'storage') > +test_run:create_cluster(REPLICASET_2, 'storage') > +util = require('util') > +util.wait_master(test_run, REPLICASET_1, 'storage_1_a') > +util.wait_master(test_run, REPLICASET_2, 'storage_2_a') > + > +test_run:switch('storage_1_a') > +fiber = require('fiber') > +s = box.schema.create_space('test') > +pk = s:create_index('pk') > +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = true > +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true > +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false > +f = fiber.create(function() vshard.storage.cfg(cfg, names.storage_1_a) end) > +f:status() > +-- Can not write - read only mode is already on. > +s:replace{1} > + > +test_run:switch('storage_1_b') > +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true > +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false > +vshard.storage.cfg(cfg, names.storage_1_b) > +box.space.test:select{} > + > +test_run:switch('storage_1_a') > +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = false > +while f:status() ~= 'dead' do fiber.sleep(0.1) end > +s:select{} > + > +test_run:switch('storage_1_b') > +box.space.test:select{} > +box.space.test:drop() > + > +test_run:cmd("switch default") > +test_run:drop_cluster(REPLICASET_2) > +test_run:drop_cluster(REPLICASET_1) > diff --git a/test/storage/read_only_slave.result > b/test/storage/read_only_slave.result index 9d5c85e..07d3653 100644 > --- a/test/storage/read_only_slave.result > +++ b/test/storage/read_only_slave.result > @@ -36,10 +36,17 @@ s = box.schema.create_space('test') > pk = s:create_index('pk') > --- > ... > +ok = nil > +--- > +... > +err = nil > +--- > +... > function on_master_enable() s:replace{1} end > --- > ... > -function on_master_disable() s:replace{2} end > +-- Test, that in disable trigger already can not write. > +function on_master_disable() ok, err = pcall(s.replace, s, {2}) end > --- > ... > vshard.storage.on_master_enable(on_master_enable) > @@ -85,8 +92,8 @@ vshard.storage.on_master_enable(on_master_enable) > vshard.storage.on_master_disable(on_master_disable) > --- > ... > --- Yes, there is no 3 or 4, because trigger is set after a replica > --- becames slave. > +-- Yes, there is no 3 or 4, because a trigger on disable always > +-- works in readonly. > s:select{} > --- > - - [1] > @@ -128,28 +135,28 @@ box.cfg.read_only > --- > - true > ... > +ok, err > +--- > +- false > +- Can't modify data because this instance is in read-only mode. > +... > fiber = require('fiber') > --- > ... > -while s:count() ~= 3 do fiber.sleep(0.1) end > +while s:count() ~= 2 do fiber.sleep(0.1) end > --- > ... > s:select{} > --- > - - [1] > - - [2] > - [3] > ... > test_run:switch('storage_1_b') > --- > - true > ... > --- Yes, there is no {2}, because replication source is unset > --- already. > -s:select{} > +s:drop() > --- > -- - [1] > - - [3] > ... > test_run:cmd("switch default") > --- > diff --git a/test/storage/read_only_slave.test.lua > b/test/storage/read_only_slave.test.lua index 3253ad7..d49e6f2 100644 > --- a/test/storage/read_only_slave.test.lua > +++ b/test/storage/read_only_slave.test.lua > @@ -11,8 +11,11 @@ test_run:switch('storage_1_a') > box.cfg.read_only > s = box.schema.create_space('test') > pk = s:create_index('pk') > +ok = nil > +err = nil > function on_master_enable() s:replace{1} end > -function on_master_disable() s:replace{2} end > +-- Test, that in disable trigger already can not write. > +function on_master_disable() ok, err = pcall(s.replace, s, {2}) end > vshard.storage.on_master_enable(on_master_enable) > vshard.storage.on_master_disable(on_master_disable) > s:select{} > @@ -27,8 +30,8 @@ function on_master_enable() s:replace{3} end > function on_master_disable() if not box.cfg.read_only then s:replace{4} end > end vshard.storage.on_master_enable(on_master_enable) > vshard.storage.on_master_disable(on_master_disable) > --- Yes, there is no 3 or 4, because trigger is set after a replica > --- becames slave. > +-- Yes, there is no 3 or 4, because a trigger on disable always > +-- works in readonly. > s:select{} > > -- Check that after master change the read_only is updated, and > @@ -44,14 +47,13 @@ > cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true > cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false > vshard.storage.cfg(cfg, names.storage_1_a) > box.cfg.read_only > +ok, err > fiber = require('fiber') > -while s:count() ~= 3 do fiber.sleep(0.1) end > +while s:count() ~= 2 do fiber.sleep(0.1) end > s:select{} > > test_run:switch('storage_1_b') > --- Yes, there is no {2}, because replication source is unset > --- already. > -s:select{} > +s:drop() > > test_run:cmd("switch default") > test_run:drop_cluster(REPLICASET_2) > diff --git a/test/unit/config.result b/test/unit/config.result > index 2a5473c..8b098fb 100644 > --- a/test/unit/config.result > +++ b/test/unit/config.result > @@ -419,3 +419,26 @@ cfg.collect_lua_garbage = false > _ = lcfg.check(cfg) > --- > ... > +-- > +-- gh-84: sync before master demotion, and allow to configure > +-- sync timeout. > +-- > +cfg.sync_timeout = -100 > +--- > +... > +check(cfg) > +--- > +- Sync timeout must be non-negative number > +... > +cfg.sync_timeout = 0 > +--- > +... > +_ = lcfg.check(cfg) > +--- > +... > +cfg.sync_timeout = 10.5 > +--- > +... > +_ = lcfg.check(cfg) > +--- > +... > diff --git a/test/unit/config.test.lua b/test/unit/config.test.lua > index 140c38c..0990d0e 100644 > --- a/test/unit/config.test.lua > +++ b/test/unit/config.test.lua > @@ -166,3 +166,14 @@ cfg.collect_lua_garbage = true > _ = lcfg.check(cfg) > cfg.collect_lua_garbage = false > _ = lcfg.check(cfg) > + > +-- > +-- gh-84: sync before master demotion, and allow to configure > +-- sync timeout. > +-- > +cfg.sync_timeout = -100 > +check(cfg) > +cfg.sync_timeout = 0 > +_ = lcfg.check(cfg) > +cfg.sync_timeout = 10.5 > +_ = lcfg.check(cfg) > diff --git a/vshard/cfg.lua b/vshard/cfg.lua > index c578af2..9860676 100644 > --- a/vshard/cfg.lua > +++ b/vshard/cfg.lua > @@ -198,7 +198,11 @@ local cfg_template = { > {'collect_lua_garbage', { > type = 'boolean', name = 'Garbage Lua collect necessity', > is_optional = true, default = false > - }} > + }}, > + {'sync_timeout', { > + type = 'non-negative number', name = 'Sync timeout', is_optional = > true, + default = consts.DEFAULT_SYNC_TIMEOUT > + }}, > } > > -- > @@ -227,6 +231,7 @@ local function remove_non_box_options(cfg) > cfg.shard_index = nil > cfg.collect_bucket_garbage_interval = nil > cfg.collect_lua_garbage = nil > + cfg.sync_timeout = nil > end > > return { > diff --git a/vshard/consts.lua b/vshard/consts.lua > index 47651d7..1a0e4ab 100644 > --- a/vshard/consts.lua > +++ b/vshard/consts.lua > @@ -20,7 +20,6 @@ return { > REPLICATION_THRESHOLD_FAIL = 10, > > DEFAULT_BUCKET_COUNT = 3000; > - BUCKET_SYNC_TIMEOUT = 0.1; > BUCKET_SENT_GARBAGE_DELAY = 0.5; > DEFAULT_REBALANCER_DISBALANCE_THRESHOLD = 1; > REBALANCER_IDLE_INTERVAL = 60 * 60; > @@ -30,7 +29,7 @@ return { > CALL_TIMEOUT_MAX = 64; > FAILOVER_UP_TIMEOUT = 5; > FAILOVER_DOWN_TIMEOUT = 1; > - SYNC_TIMEOUT = 1; > + DEFAULT_SYNC_TIMEOUT = 1; > RECONNECT_TIMEOUT = 0.5; > DEFAULT_COLLECT_BUCKET_GARBAGE_INTERVAL = 0.5; > RECOVERY_INTERVAL = 5; > diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua > index a0f216e..6d66b05 100644 > --- a/vshard/storage/init.lua > +++ b/vshard/storage/init.lua > @@ -35,10 +35,12 @@ if not M then > errinj = { > ERRINJ_BUCKET_FIND_GARBAGE_DELAY = false, > ERRINJ_RELOAD = false, > + ERRINJ_CFG_DELAY = false, > }, > -- This counter is used to restart background fibers with > -- new reloaded code. > module_version = 0, > + sync_timeout = consts.DEFAULT_SYNC_TIMEOUT, > > ------------------- Garbage collection ------------------- > -- Fiber to remove garbage buckets data. > @@ -305,7 +307,7 @@ local function sync(timeout) > end > > log.debug("Synchronizing replicaset...") > - timeout = timeout or consts.SYNC_TIMEOUT > + timeout = timeout or M.sync_timeout > local vclock = box.info.vclock > local tstart = lfiber.time() > repeat > @@ -539,14 +541,31 @@ local function buckets_discovery() > return ret > end > > +-- > +-- The only thing, that must be done to abort a master demote is > +-- a reset of read_only. > +-- > +local function local_on_master_disable_abort() > + box.cfg{read_only = false} > +end > + > +-- > +-- Prepare to a master demotion. Before it, a master must stop > +-- accept writes, and try to wait until all of its data is > +-- replicated to each slave. > +-- > +local function local_on_master_disable_prepare() > + log.info("Resigning from the replicaset master role...") > + box.cfg({read_only = true}) > + sync(M.sync_timeout) > +end > + > -- > -- This function executes when a master role is removed from local > -- instance during configuration > -- > local function local_on_master_disable() > M._on_master_disable:run() > - box.cfg({read_only = true}) > - log.verbose("Resigning from the replicaset master role...") > -- Stop garbage collecting > if M.collect_bucket_garbage_fiber ~= nil then > M.collect_bucket_garbage_fiber:cancel() > @@ -558,13 +577,29 @@ local function local_on_master_disable() > M.recovery_fiber = nil > log.info('Recovery stopped') > end > - -- Wait until replicas are synchronized before one another become a new > master - sync(consts.SYNC_TIMEOUT) > log.info("Resigned from the replicaset master role") > end > > local collect_garbage_f > > +-- > +-- The only thing, that must be done to abort a master promotion > +-- is a set read_only back to true. > +-- > +local function local_on_master_enable_abort() > + box.cfg({read_only = true}) > +end > + > +-- > +-- Promote does not require sync, because a replica can not have a > +-- data, that is not on a current master - the replica is read > +-- only. But read_only can not be set to false here, because > +-- until box.cfg is called, it can not be guaranteed, that the > +-- promotion will be successful. > +-- > +local function local_on_master_enable_prepare() > + log.info("Taking on replicaset master role...") > +end > -- > -- This function executes whan a master role is added to local > -- instance during configuration > @@ -572,7 +607,6 @@ local collect_garbage_f > local function local_on_master_enable() > box.cfg({read_only = false}) > M._on_master_enable:run() > - log.verbose("Taking on replicaset master role...") > recovery_garbage_receiving_buckets() > -- Start background process to collect garbage. > M.collect_bucket_garbage_fiber = > @@ -1348,9 +1382,37 @@ local function storage_cfg(cfg, this_replica_uuid) > local shard_index = cfg.shard_index > local collect_bucket_garbage_interval = > cfg.collect_bucket_garbage_interval local collect_lua_garbage = > cfg.collect_lua_garbage > + -- > + -- Sync timeout is a special case - it must be updated before > + -- all other options to allow a user to demote a master with > + -- a new sync timeout. > + -- > + local old_sync_timeout = M.sync_timeout > + M.sync_timeout = cfg.sync_timeout > lcfg.remove_non_box_options(cfg) > > - box.cfg(cfg) > + if was_master and not is_master then > + local_on_master_disable_prepare() > + end > + if not was_master and is_master then > + local_on_master_enable_prepare() > + end > + > + local ok, err = pcall(box.cfg, cfg) > + while M.errinj.ERRINJ_CFG_DELAY do > + lfiber.sleep(0.01) > + end > + if not ok then > + M.sync_timeout = old_sync_timeout > + if was_master and not is_master then > + local_on_master_disable_abort() > + end > + if not was_master and is_master then > + local_on_master_enable_abort() > + end > + error(err) > + end > + > log.info("Box has been configured") > local uri = luri.parse(this_replica.uri) > box.once("vshard:storage:1", storage_schema_v1, uri.login, > uri.password) --nextPart3923829.ZClXRgqJih Content-Type: application/pgp-signature; name="signature.asc" Content-Description: This is a digitally signed message part. Content-Transfer-Encoding: 7Bit -----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEFZT35EtIMRTDS5hJnoTdFFzh6LUFAlq4zZsACgkQnoTdFFzh 6LXLUwgAwpegEaDSajqTo++p73WB1xPZLFIV8q+E25zY8ZzbBRHtbNljyvXk5UlF p6zw3B0nNmYGOsIYTrvMQSNAqGz4ExVSBloAHIObtHb+keR+ZpUJS2CRVomLUnlo Rx382GGA8Rt6t2gIyn9ZTfUc2vxDYUCbBFsj15T9C64Lmb/+HBi8/Nz2WNN9z/BN h/UaZSZSppRRJO/yonGgSdRz1zMBiE7cm7sgdsnsdA0jzz0BhLNXzcW/Cd3sns00 JUcrpVPHIqX0CEXlSuHgVrg88APZfzb/q/o8nJdN/0zxx5X7gHfj0TwJnlRQxc3c LlFzb8Cz9qPGaNwLt/YqysaBsVkLjg== =hY2k -----END PGP SIGNATURE----- --nextPart3923829.ZClXRgqJih--