[tarantool-patches] Re: [PATCH vshard 1/1] storage: do read_only + sync on master demotion before box.cfg

Georgy Kirichenko georgy at tarantool.org
Mon Mar 26 13:38:19 MSK 2018


Likes Ok for me
On Friday, March 23, 2018 1:11:45 AM MSK Vladislav Shpilevoy wrote:
> Before a master is demoted and updated its box.cfg it must turn
> read_only mode on and try to sync with slaves. With no sync and
> read_only before box.cfg it is possible, that some new data will
> appear during box reconfiguration, and it will not be synced.
> 
> Closes #84
> ---
> 
> Branch: https://github.com/tarantool/vshard/tree/gh-84-sync-before-cfg
> Issue: https://github.com/tarantool/vshard/issues/84
> 
>  test/misc/reconfigure.result             |  14 ++++
>  test/misc/reconfigure.test.lua           |   4 ++
>  test/storage/demote_sync_errinj.result   | 110
> +++++++++++++++++++++++++++++++ test/storage/demote_sync_errinj.test.lua | 
> 39 +++++++++++
>  test/storage/read_only_slave.result      |  27 +++++---
>  test/storage/read_only_slave.test.lua    |  16 +++--
>  test/unit/config.result                  |  23 +++++++
>  test/unit/config.test.lua                |  11 ++++
>  vshard/cfg.lua                           |   7 +-
>  vshard/consts.lua                        |   3 +-
>  vshard/storage/init.lua                  |  76 +++++++++++++++++++--
>  11 files changed, 303 insertions(+), 27 deletions(-)
>  create mode 100644 test/storage/demote_sync_errinj.result
>  create mode 100644 test/storage/demote_sync_errinj.test.lua
> 
> diff --git a/test/misc/reconfigure.result b/test/misc/reconfigure.result
> index 0a014a3..c7960b3 100644
> --- a/test/misc/reconfigure.result
> +++ b/test/misc/reconfigure.result
> @@ -100,6 +100,13 @@ not vshard.storage.internal.collect_lua_garbage
>  ---
>  - true
>  ...
> +vshard.storage.internal.sync_timeout
> +---
> +- 1
> +...
> +cfg.sync_timeout = 100
> +---
> +...
>  cfg.collect_lua_garbage = true
>  ---
>  ...
> @@ -120,6 +127,10 @@ not vshard.storage.internal.collect_lua_garbage
>  ---
>  - true
>  ...
> +vshard.storage.internal.sync_timeout
> +---
> +- 1
> +...
>  vshard.storage.internal.rebalancer_max_receiving ~= 1000
>  ---
>  - true
> @@ -128,6 +139,9 @@ vshard.storage.internal.collect_bucket_garbage_interval
> ~= 100 ---
>  - true
>  ...
> +cfg.sync_timeout = nil
> +---
> +...
>  cfg.collect_lua_garbage = nil
>  ---
>  ...
> diff --git a/test/misc/reconfigure.test.lua b/test/misc/reconfigure.test.lua
> index 033214a..25dc2ca 100644
> --- a/test/misc/reconfigure.test.lua
> +++ b/test/misc/reconfigure.test.lua
> @@ -41,14 +41,18 @@ util.check_error(vshard.storage.cfg, cfg, 'unknow uuid')
> -- changed.
>  --
>  not vshard.storage.internal.collect_lua_garbage
> +vshard.storage.internal.sync_timeout
> +cfg.sync_timeout = 100
>  cfg.collect_lua_garbage = true
>  cfg.rebalancer_max_receiving = 1000
>  cfg.collect_bucket_garbage_interval = 100
>  cfg.invalid_option = 'kek'
>  vshard.storage.cfg(cfg, names.storage_1_a)
>  not vshard.storage.internal.collect_lua_garbage
> +vshard.storage.internal.sync_timeout
>  vshard.storage.internal.rebalancer_max_receiving ~= 1000
>  vshard.storage.internal.collect_bucket_garbage_interval ~= 100
> +cfg.sync_timeout = nil
>  cfg.collect_lua_garbage = nil
>  cfg.rebalancer_max_receiving = nil
>  cfg.collect_bucket_garbage_interval = nil
> diff --git a/test/storage/demote_sync_errinj.result
> b/test/storage/demote_sync_errinj.result new file mode 100644
> index 0000000..2d14504
> --- /dev/null
> +++ b/test/storage/demote_sync_errinj.result
> @@ -0,0 +1,110 @@
> +test_run = require('test_run').new()
> +---
> +...
> +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' }
> +---
> +...
> +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' }
> +---
> +...
> +test_run:create_cluster(REPLICASET_1, 'storage')
> +---
> +...
> +test_run:create_cluster(REPLICASET_2, 'storage')
> +---
> +...
> +util = require('util')
> +---
> +...
> +util.wait_master(test_run, REPLICASET_1, 'storage_1_a')
> +---
> +...
> +util.wait_master(test_run, REPLICASET_2, 'storage_2_a')
> +---
> +...
> +test_run:switch('storage_1_a')
> +---
> +- true
> +...
> +fiber = require('fiber')
> +---
> +...
> +s = box.schema.create_space('test')
> +---
> +...
> +pk = s:create_index('pk')
> +---
> +...
> +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = true
> +---
> +...
> +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true
> +---
> +...
> +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false
> +---
> +...
> +f = fiber.create(function() vshard.storage.cfg(cfg, names.storage_1_a) end)
> +---
> +...
> +f:status()
> +---
> +- suspended
> +...
> +-- Can not write - read only mode is already on.
> +s:replace{1}
> +---
> +- error: Can't modify data because this instance is in read-only mode.
> +...
> +test_run:switch('storage_1_b')
> +---
> +- true
> +...
> +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true
> +---
> +...
> +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false
> +---
> +...
> +vshard.storage.cfg(cfg, names.storage_1_b)
> +---
> +...
> +box.space.test:select{}
> +---
> +- []
> +...
> +test_run:switch('storage_1_a')
> +---
> +- true
> +...
> +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = false
> +---
> +...
> +while f:status() ~= 'dead' do fiber.sleep(0.1) end
> +---
> +...
> +s:select{}
> +---
> +- []
> +...
> +test_run:switch('storage_1_b')
> +---
> +- true
> +...
> +box.space.test:select{}
> +---
> +- []
> +...
> +box.space.test:drop()
> +---
> +...
> +test_run:cmd("switch default")
> +---
> +- true
> +...
> +test_run:drop_cluster(REPLICASET_2)
> +---
> +...
> +test_run:drop_cluster(REPLICASET_1)
> +---
> +...
> diff --git a/test/storage/demote_sync_errinj.test.lua
> b/test/storage/demote_sync_errinj.test.lua new file mode 100644
> index 0000000..f12ab60
> --- /dev/null
> +++ b/test/storage/demote_sync_errinj.test.lua
> @@ -0,0 +1,39 @@
> +test_run = require('test_run').new()
> +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' }
> +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' }
> +test_run:create_cluster(REPLICASET_1, 'storage')
> +test_run:create_cluster(REPLICASET_2, 'storage')
> +util = require('util')
> +util.wait_master(test_run, REPLICASET_1, 'storage_1_a')
> +util.wait_master(test_run, REPLICASET_2, 'storage_2_a')
> +
> +test_run:switch('storage_1_a')
> +fiber = require('fiber')
> +s = box.schema.create_space('test')
> +pk = s:create_index('pk')
> +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = true
> +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true
> +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false
> +f = fiber.create(function() vshard.storage.cfg(cfg, names.storage_1_a) end)
> +f:status()
> +-- Can not write - read only mode is already on.
> +s:replace{1}
> +
> +test_run:switch('storage_1_b')
> +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true
> +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false
> +vshard.storage.cfg(cfg, names.storage_1_b)
> +box.space.test:select{}
> +
> +test_run:switch('storage_1_a')
> +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = false
> +while f:status() ~= 'dead' do fiber.sleep(0.1) end
> +s:select{}
> +
> +test_run:switch('storage_1_b')
> +box.space.test:select{}
> +box.space.test:drop()
> +
> +test_run:cmd("switch default")
> +test_run:drop_cluster(REPLICASET_2)
> +test_run:drop_cluster(REPLICASET_1)
> diff --git a/test/storage/read_only_slave.result
> b/test/storage/read_only_slave.result index 9d5c85e..07d3653 100644
> --- a/test/storage/read_only_slave.result
> +++ b/test/storage/read_only_slave.result
> @@ -36,10 +36,17 @@ s = box.schema.create_space('test')
>  pk = s:create_index('pk')
>  ---
>  ...
> +ok = nil
> +---
> +...
> +err = nil
> +---
> +...
>  function on_master_enable() s:replace{1} end
>  ---
>  ...
> -function on_master_disable() s:replace{2} end
> +-- Test, that in disable trigger already can not write.
> +function on_master_disable() ok, err = pcall(s.replace, s, {2}) end
>  ---
>  ...
>  vshard.storage.on_master_enable(on_master_enable)
> @@ -85,8 +92,8 @@ vshard.storage.on_master_enable(on_master_enable)
>  vshard.storage.on_master_disable(on_master_disable)
>  ---
>  ...
> --- Yes, there is no 3 or 4, because trigger is set after a replica
> --- becames slave.
> +-- Yes, there is no 3 or 4, because a trigger on disable always
> +-- works in readonly.
>  s:select{}
>  ---
>  - - [1]
> @@ -128,28 +135,28 @@ box.cfg.read_only
>  ---
>  - true
>  ...
> +ok, err
> +---
> +- false
> +- Can't modify data because this instance is in read-only mode.
> +...
>  fiber = require('fiber')
>  ---
>  ...
> -while s:count() ~= 3 do fiber.sleep(0.1) end
> +while s:count() ~= 2 do fiber.sleep(0.1) end
>  ---
>  ...
>  s:select{}
>  ---
>  - - [1]
> -  - [2]
>    - [3]
>  ...
>  test_run:switch('storage_1_b')
>  ---
>  - true
>  ...
> --- Yes, there is no {2}, because replication source is unset
> --- already.
> -s:select{}
> +s:drop()
>  ---
> -- - [1]
> -  - [3]
>  ...
>  test_run:cmd("switch default")
>  ---
> diff --git a/test/storage/read_only_slave.test.lua
> b/test/storage/read_only_slave.test.lua index 3253ad7..d49e6f2 100644
> --- a/test/storage/read_only_slave.test.lua
> +++ b/test/storage/read_only_slave.test.lua
> @@ -11,8 +11,11 @@ test_run:switch('storage_1_a')
>  box.cfg.read_only
>  s = box.schema.create_space('test')
>  pk = s:create_index('pk')
> +ok = nil
> +err = nil
>  function on_master_enable() s:replace{1} end
> -function on_master_disable() s:replace{2} end
> +-- Test, that in disable trigger already can not write.
> +function on_master_disable() ok, err = pcall(s.replace, s, {2}) end
>  vshard.storage.on_master_enable(on_master_enable)
>  vshard.storage.on_master_disable(on_master_disable)
>  s:select{}
> @@ -27,8 +30,8 @@ function on_master_enable() s:replace{3} end
>  function on_master_disable() if not box.cfg.read_only then s:replace{4} end
> end vshard.storage.on_master_enable(on_master_enable)
>  vshard.storage.on_master_disable(on_master_disable)
> --- Yes, there is no 3 or 4, because trigger is set after a replica
> --- becames slave.
> +-- Yes, there is no 3 or 4, because a trigger on disable always
> +-- works in readonly.
>  s:select{}
> 
>  -- Check that after master change the read_only is updated, and
> @@ -44,14 +47,13 @@
> cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true
> cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false
> vshard.storage.cfg(cfg, names.storage_1_a)
>  box.cfg.read_only
> +ok, err
>  fiber = require('fiber')
> -while s:count() ~= 3 do fiber.sleep(0.1) end
> +while s:count() ~= 2 do fiber.sleep(0.1) end
>  s:select{}
> 
>  test_run:switch('storage_1_b')
> --- Yes, there is no {2}, because replication source is unset
> --- already.
> -s:select{}
> +s:drop()
> 
>  test_run:cmd("switch default")
>  test_run:drop_cluster(REPLICASET_2)
> diff --git a/test/unit/config.result b/test/unit/config.result
> index 2a5473c..8b098fb 100644
> --- a/test/unit/config.result
> +++ b/test/unit/config.result
> @@ -419,3 +419,26 @@ cfg.collect_lua_garbage = false
>  _ = lcfg.check(cfg)
>  ---
>  ...
> +--
> +-- gh-84: sync before master demotion, and allow to configure
> +-- sync timeout.
> +--
> +cfg.sync_timeout = -100
> +---
> +...
> +check(cfg)
> +---
> +- Sync timeout must be non-negative number
> +...
> +cfg.sync_timeout = 0
> +---
> +...
> +_ = lcfg.check(cfg)
> +---
> +...
> +cfg.sync_timeout = 10.5
> +---
> +...
> +_ = lcfg.check(cfg)
> +---
> +...
> diff --git a/test/unit/config.test.lua b/test/unit/config.test.lua
> index 140c38c..0990d0e 100644
> --- a/test/unit/config.test.lua
> +++ b/test/unit/config.test.lua
> @@ -166,3 +166,14 @@ cfg.collect_lua_garbage = true
>  _ = lcfg.check(cfg)
>  cfg.collect_lua_garbage = false
>  _ = lcfg.check(cfg)
> +
> +--
> +-- gh-84: sync before master demotion, and allow to configure
> +-- sync timeout.
> +--
> +cfg.sync_timeout = -100
> +check(cfg)
> +cfg.sync_timeout = 0
> +_ = lcfg.check(cfg)
> +cfg.sync_timeout = 10.5
> +_ = lcfg.check(cfg)
> diff --git a/vshard/cfg.lua b/vshard/cfg.lua
> index c578af2..9860676 100644
> --- a/vshard/cfg.lua
> +++ b/vshard/cfg.lua
> @@ -198,7 +198,11 @@ local cfg_template = {
>      {'collect_lua_garbage', {
>          type = 'boolean', name = 'Garbage Lua collect necessity',
>          is_optional = true, default = false
> -    }}
> +    }},
> +    {'sync_timeout', {
> +        type = 'non-negative number', name = 'Sync timeout', is_optional =
> true, +        default = consts.DEFAULT_SYNC_TIMEOUT
> +    }},
>  }
> 
>  --
> @@ -227,6 +231,7 @@ local function remove_non_box_options(cfg)
>      cfg.shard_index = nil
>      cfg.collect_bucket_garbage_interval = nil
>      cfg.collect_lua_garbage = nil
> +    cfg.sync_timeout = nil
>  end
> 
>  return {
> diff --git a/vshard/consts.lua b/vshard/consts.lua
> index 47651d7..1a0e4ab 100644
> --- a/vshard/consts.lua
> +++ b/vshard/consts.lua
> @@ -20,7 +20,6 @@ return {
>      REPLICATION_THRESHOLD_FAIL = 10,
> 
>      DEFAULT_BUCKET_COUNT = 3000;
> -    BUCKET_SYNC_TIMEOUT = 0.1;
>      BUCKET_SENT_GARBAGE_DELAY = 0.5;
>      DEFAULT_REBALANCER_DISBALANCE_THRESHOLD = 1;
>      REBALANCER_IDLE_INTERVAL = 60 * 60;
> @@ -30,7 +29,7 @@ return {
>      CALL_TIMEOUT_MAX = 64;
>      FAILOVER_UP_TIMEOUT = 5;
>      FAILOVER_DOWN_TIMEOUT = 1;
> -    SYNC_TIMEOUT = 1;
> +    DEFAULT_SYNC_TIMEOUT = 1;
>      RECONNECT_TIMEOUT = 0.5;
>      DEFAULT_COLLECT_BUCKET_GARBAGE_INTERVAL = 0.5;
>      RECOVERY_INTERVAL = 5;
> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
> index a0f216e..6d66b05 100644
> --- a/vshard/storage/init.lua
> +++ b/vshard/storage/init.lua
> @@ -35,10 +35,12 @@ if not M then
>          errinj = {
>              ERRINJ_BUCKET_FIND_GARBAGE_DELAY = false,
>              ERRINJ_RELOAD = false,
> +            ERRINJ_CFG_DELAY = false,
>          },
>          -- This counter is used to restart background fibers with
>          -- new reloaded code.
>          module_version = 0,
> +        sync_timeout = consts.DEFAULT_SYNC_TIMEOUT,
> 
>          ------------------- Garbage collection -------------------
>          -- Fiber to remove garbage buckets data.
> @@ -305,7 +307,7 @@ local function sync(timeout)
>      end
> 
>      log.debug("Synchronizing replicaset...")
> -    timeout = timeout or consts.SYNC_TIMEOUT
> +    timeout = timeout or M.sync_timeout
>      local vclock = box.info.vclock
>      local tstart = lfiber.time()
>      repeat
> @@ -539,14 +541,31 @@ local function buckets_discovery()
>      return ret
>  end
> 
> +--
> +-- The only thing, that must be done to abort a master demote is
> +-- a reset of read_only.
> +--
> +local function local_on_master_disable_abort()
> +    box.cfg{read_only = false}
> +end
> +
> +--
> +-- Prepare to a master demotion. Before it, a master must stop
> +-- accept writes, and try to wait until all of its data is
> +-- replicated to each slave.
> +--
> +local function local_on_master_disable_prepare()
> +    log.info("Resigning from the replicaset master role...")
> +    box.cfg({read_only = true})
> +    sync(M.sync_timeout)
> +end
> +
>  --
>  -- This function executes when a master role is removed from local
>  -- instance during configuration
>  --
>  local function local_on_master_disable()
>      M._on_master_disable:run()
> -    box.cfg({read_only = true})
> -    log.verbose("Resigning from the replicaset master role...")
>      -- Stop garbage collecting
>      if M.collect_bucket_garbage_fiber ~= nil then
>          M.collect_bucket_garbage_fiber:cancel()
> @@ -558,13 +577,29 @@ local function local_on_master_disable()
>          M.recovery_fiber = nil
>          log.info('Recovery stopped')
>      end
> -    -- Wait until replicas are synchronized before one another become a new
> master -    sync(consts.SYNC_TIMEOUT)
>      log.info("Resigned from the replicaset master role")
>  end
> 
>  local collect_garbage_f
> 
> +--
> +-- The only thing, that must be done to abort a master promotion
> +-- is a set read_only back to true.
> +--
> +local function local_on_master_enable_abort()
> +    box.cfg({read_only = true})
> +end
> +
> +--
> +-- Promote does not require sync, because a replica can not have a
> +-- data, that is not on a current master - the replica is read
> +-- only. But read_only can not be set to false here, because
> +-- until box.cfg is called, it can not be guaranteed, that the
> +-- promotion will be successful.
> +--
> +local function local_on_master_enable_prepare()
> +    log.info("Taking on replicaset master role...")
> +end
>  --
>  -- This function executes whan a master role is added to local
>  -- instance during configuration
> @@ -572,7 +607,6 @@ local collect_garbage_f
>  local function local_on_master_enable()
>      box.cfg({read_only = false})
>      M._on_master_enable:run()
> -    log.verbose("Taking on replicaset master role...")
>      recovery_garbage_receiving_buckets()
>      -- Start background process to collect garbage.
>      M.collect_bucket_garbage_fiber =
> @@ -1348,9 +1382,37 @@ local function storage_cfg(cfg, this_replica_uuid)
>      local shard_index = cfg.shard_index
>      local collect_bucket_garbage_interval =
> cfg.collect_bucket_garbage_interval local collect_lua_garbage =
> cfg.collect_lua_garbage
> +    --
> +    -- Sync timeout is a special case - it must be updated before
> +    -- all other options to allow a user to demote a master with
> +    -- a new sync timeout.
> +    --
> +    local old_sync_timeout = M.sync_timeout
> +    M.sync_timeout = cfg.sync_timeout
>      lcfg.remove_non_box_options(cfg)
> 
> -    box.cfg(cfg)
> +    if was_master and not is_master then
> +        local_on_master_disable_prepare()
> +    end
> +    if not was_master and is_master then
> +        local_on_master_enable_prepare()
> +    end
> +
> +    local ok, err = pcall(box.cfg, cfg)
> +    while M.errinj.ERRINJ_CFG_DELAY do
> +        lfiber.sleep(0.01)
> +    end
> +    if not ok then
> +        M.sync_timeout = old_sync_timeout
> +        if was_master and not is_master then
> +            local_on_master_disable_abort()
> +        end
> +        if not was_master and is_master then
> +            local_on_master_enable_abort()
> +        end
> +        error(err)
> +    end
> +
>      log.info("Box has been configured")
>      local uri = luri.parse(this_replica.uri)
>      box.once("vshard:storage:1", storage_schema_v1, uri.login,
> uri.password)

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 488 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.tarantool.org/pipermail/tarantool-patches/attachments/20180326/ada40efd/attachment.sig>


More information about the Tarantool-patches mailing list