[tarantool-patches] Re: [PATCH 1/3] Update only vshard part of a cfg on reload

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Wed Aug 8 14:17:02 MSK 2018


Thanks for the patch! Pushed into the master.

On 07/08/2018 16:19, Alex Khatskevich wrote:
> 
> On 06.08.2018 20:03, Vladislav Shpilevoy wrote:
>> Thanks for the patch! See 3 comments below.
>>
>>> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
>>> index 102b942..40216ea 100644
>>> --- a/vshard/storage/init.lua
>>> +++ b/vshard/storage/init.lua
>>> @@ -1553,18 +1553,19 @@ local function storage_cfg(cfg, this_replica_uuid)
>>>       --
>>>       -- If a master role of the replica is not changed, then
>>>       -- 'read_only' can be set right here.
>>> -    cfg.listen = cfg.listen or this_replica.uri
>>> -    if cfg.replication == nil and this_replicaset.master and not is_master then
>>> -        cfg.replication = {this_replicaset.master.uri}
>>> +    box_cfg.listen = box_cfg.listen or this_replica.uri
>>> +    if box_cfg.replication == nil and this_replicaset.master
>>> +       and not is_master then
>>> +        box_cfg.replication = {this_replicaset.master.uri}
>>>       else
>>> -        cfg.replication = {}
>>> +        box_cfg.replication = {}
>>>       end
>>>       if was_master == is_master then
>>> -        cfg.read_only = not is_master
>>> +        box_cfg.read_only = not is_master
>>>       end
>>>       if type(box.cfg) == 'function' then
>>> -        cfg.instance_uuid = this_replica.uuid
>>> -        cfg.replicaset_uuid = this_replicaset.uuid
>>> +        box_cfg.instance_uuid = this_replica.uuid
>>> +        box_cfg.replicaset_uuid = this_replicaset.uuid
>>
>> 1. All these box_cfg manipulations should be done under 'if not is_reload'
>> I think.
> Fixed.
>>
>>>       else
>>>           local info = box.info
>>>           if this_replica_uuid ~= info.uuid then
>>> @@ -1607,27 +1610,27 @@ local function storage_cfg(cfg, this_replica_uuid)
>>>           local_on_master_enable_prepare()
>>>       end
>>>
>>> -    local box_cfg = table.copy(cfg)
>>> -    lcfg.remove_non_box_options(box_cfg)
>>> -    local ok, err = pcall(box.cfg, box_cfg)
>>> -    while M.errinj.ERRINJ_CFG_DELAY do
>>> -        lfiber.sleep(0.01)
>>> -    end
>>> -    if not ok then
>>> -        M.sync_timeout = old_sync_timeout
>>> -        if was_master and not is_master then
>>> -            local_on_master_disable_abort()
>>> +    if not is_reload then
>>> +        local ok, err = true, nil
>>> +        ok, err = pcall(box.cfg, box_cfg)
>>
>> 2. Why do you need to announce 'local ok, err' before
>> their usage on the next line?
> fixed.
>>
>>
>>> +        while M.errinj.ERRINJ_CFG_DELAY do
>>> +            lfiber.sleep(0.01)
>>>           end
>>> -        if not was_master and is_master then
>>> -            local_on_master_enable_abort()
>>> +        if not ok then
>>> +            M.sync_timeout = old_sync_timeout
>>> +            if was_master and not is_master then
>>> +                local_on_master_disable_abort()
>>> +            end
>>> +            if not was_master and is_master then
>>> +                local_on_master_enable_abort()
>>> +            end
>>> +            error(err)
>>>           end
>>> -        error(err)
>>> +        log.info("Box has been configured")
>>> +        local uri = luri.parse(this_replica.uri)
>>> +        box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password)
>>>       end
>>>
>>> -    log.info("Box has been configured")
>>> -    local uri = luri.parse(this_replica.uri)
>>> -    box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password)
>>> -
>>>       lreplicaset.rebind_replicasets(new_replicasets, M.replicasets)
>>>       lreplicaset.outdate_replicasets(M.replicasets)
>>>       M.replicasets = new_replicasets
>>> @@ -1874,7 +1877,7 @@ if not rawget(_G, MODULE_INTERNALS) then
>>>       rawset(_G, MODULE_INTERNALS, M)
>>>   else
>>>       reload_evolution.upgrade(M)
>>> -    storage_cfg(M.current_cfg, M.this_replica.uuid)
>>> +    storage_cfg(M.current_cfg, M.this_replica.uuid, true)
>>
>> 3. I see that you have stored vshard_cfg in M.current_cfg. Not a full
>> config. So it does not have any box options. And it causes a question
>> - why do you need to separate reload from non-reload, if reload anyway
>> in such implementation is like 'box.cfg{}' call with no parameters?
>> And if you do not store box_cfg options how are you going to compare
>> configs when we will implement atomic cfg over cluster?
> Fixed. And in a router too.
> 
> 
> 
> Full diff:
> 
> commit d3c35612130ff95b20245993ab5053981d3b985f
> Author: AKhatskevich <avkhatskevich at tarantool.org>
> Date:   Mon Jul 23 16:42:22 2018 +0300
> 
>      Update only vshard part of a cfg on reload
> 
>      Box cfg could have been changed by a user and then overridden by
>      an old vshard config on reload.
> 
>      Since that commit, box part of a config is applied only when
>      it is explicitly passed to a `cfg` method.
> 
>      This change is important for the multiple routers feature.
> 
> diff --git a/vshard/cfg.lua b/vshard/cfg.lua
> index 7c9ab77..af1c3ee 100644
> --- a/vshard/cfg.lua
> +++ b/vshard/cfg.lua
> @@ -221,6 +221,22 @@ local cfg_template = {
>       },
>   }
> 
> +--
> +-- Split it into vshard_cfg and box_cfg parts.
> +--
> +local function cfg_split(cfg)
> +    local vshard_cfg = {}
> +    local box_cfg = {}
> +    for k, v in pairs(cfg) do
> +        if cfg_template[k] then
> +            vshard_cfg[k] = v
> +        else
> +            box_cfg[k] = v
> +        end
> +    end
> +    return vshard_cfg, box_cfg
> +end
> +
>   --
>   -- Names of options which cannot be changed during reconfigure.
>   --
> @@ -252,24 +268,7 @@ local function cfg_check(shard_cfg, old_cfg)
>       return shard_cfg
>   end
> 
> ---
> --- Nullify non-box options.
> ---
> -local function remove_non_box_options(cfg)
> -    cfg.sharding = nil
> -    cfg.weights = nil
> -    cfg.zone = nil
> -    cfg.bucket_count = nil
> -    cfg.rebalancer_disbalance_threshold = nil
> -    cfg.rebalancer_max_receiving = nil
> -    cfg.shard_index = nil
> -    cfg.collect_bucket_garbage_interval = nil
> -    cfg.collect_lua_garbage = nil
> -    cfg.sync_timeout = nil
> -    cfg.connection_outdate_delay = nil
> -end
> -
>   return {
>       check = cfg_check,
> -    remove_non_box_options = remove_non_box_options,
> +    split = cfg_split,
>   }
> diff --git a/vshard/router/init.lua b/vshard/router/init.lua
> index d8c026b..1e8d898 100644
> --- a/vshard/router/init.lua
> +++ b/vshard/router/init.lua
> @@ -491,19 +491,17 @@ end
>   -- Configuration
>   --------------------------------------------------------------------------------
> 
> -local function router_cfg(cfg)
> +local function router_cfg(cfg, is_reload)
>       cfg = lcfg.check(cfg, M.current_cfg)
> -    local new_cfg = table.copy(cfg)
> +    local vshard_cfg, box_cfg = lcfg.split(cfg)
>       if not M.replicasets then
>           log.info('Starting router configuration')
>       else
>           log.info('Starting router reconfiguration')
>       end
> -    local new_replicasets = lreplicaset.buildall(cfg)
> -    local total_bucket_count = cfg.bucket_count
> -    local collect_lua_garbage = cfg.collect_lua_garbage
> -    local box_cfg = table.copy(cfg)
> -    lcfg.remove_non_box_options(box_cfg)
> +    local new_replicasets = lreplicaset.buildall(vshard_cfg)
> +    local total_bucket_count = vshard_cfg.bucket_count
> +    local collect_lua_garbage = vshard_cfg.collect_lua_garbage
>       log.info("Calling box.cfg()...")
>       for k, v in pairs(box_cfg) do
>           log.info({[k] = v})
> @@ -514,8 +512,10 @@ local function router_cfg(cfg)
>       if M.errinj.ERRINJ_CFG then
>           error('Error injection: cfg')
>       end
> -    box.cfg(box_cfg)
> -    log.info("Box has been configured")
> +    if not is_reload then
> +        box.cfg(box_cfg)
> +        log.info("Box has been configured")
> +    end
>       -- Move connections from an old configuration to a new one.
>       -- It must be done with no yields to prevent usage both of not
>       -- fully moved old replicasets, and not fully built new ones.
> @@ -526,8 +526,9 @@ local function router_cfg(cfg)
>           replicaset:connect_all()
>       end
>       lreplicaset.wait_masters_connect(new_replicasets)
> -    lreplicaset.outdate_replicasets(M.replicasets, cfg.connection_outdate_delay)
> -    M.connection_outdate_delay = cfg.connection_outdate_delay
> +    lreplicaset.outdate_replicasets(M.replicasets,
> + vshard_cfg.connection_outdate_delay)
> +    M.connection_outdate_delay = vshard_cfg.connection_outdate_delay
>       M.total_bucket_count = total_bucket_count
>       M.collect_lua_garbage = collect_lua_garbage
>       M.current_cfg = cfg
> @@ -817,7 +818,7 @@ end
>   if not rawget(_G, MODULE_INTERNALS) then
>       rawset(_G, MODULE_INTERNALS, M)
>   else
> -    router_cfg(M.current_cfg)
> +    router_cfg(M.current_cfg, true)
>       M.module_version = M.module_version + 1
>   end
> 
> @@ -825,7 +826,7 @@ M.discovery_f = discovery_f
>   M.failover_f = failover_f
> 
>   return {
> -    cfg = router_cfg;
> +    cfg = function(cfg) return router_cfg(cfg, false) end;
>       info = router_info;
>       buckets_info = router_buckets_info;
>       call = router_call;
> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
> index 1f29323..2080769 100644
> --- a/vshard/storage/init.lua
> +++ b/vshard/storage/init.lua
> @@ -1500,13 +1500,14 @@ end
>   --------------------------------------------------------------------------------
>   -- Configuration
>   --------------------------------------------------------------------------------
> -local function storage_cfg(cfg, this_replica_uuid)
> +
> +local function storage_cfg(cfg, this_replica_uuid, is_reload)
>       if this_replica_uuid == nil then
>           error('Usage: cfg(configuration, this_replica_uuid)')
>       end
>       cfg = lcfg.check(cfg, M.current_cfg)
> -    local new_cfg = table.copy(cfg)
> -    if cfg.weights or cfg.zone then
> +    local vshard_cfg, box_cfg = lcfg.split(cfg)
> +    if vshard_cfg.weights or vshard_cfg.zone then
>           error('Weights and zone are not allowed for storage configuration')
>       end
>       if M.replicasets then
> @@ -1520,7 +1521,7 @@ local function storage_cfg(cfg, this_replica_uuid)
> 
>       local this_replicaset
>       local this_replica
> -    local new_replicasets = lreplicaset.buildall(cfg)
> +    local new_replicasets = lreplicaset.buildall(vshard_cfg)
>       local min_master
>       for rs_uuid, rs in pairs(new_replicasets) do
>           for replica_uuid, replica in pairs(rs.replicas) do
> @@ -1544,46 +1545,14 @@ local function storage_cfg(cfg, this_replica_uuid)
>           log.info('I am master')
>       end
> 
> -    -- Do not change 'read_only' option here - if a master is
> -    -- disabled and there are triggers on master disable, then
> -    -- they would not be able to modify anything, if 'read_only'
> -    -- had been set here. 'Read_only' is set in
> -    -- local_on_master_disable after triggers and is unset in
> -    -- local_on_master_enable before triggers.
> -    --
> -    -- If a master role of the replica is not changed, then
> -    -- 'read_only' can be set right here.
> -    cfg.listen = cfg.listen or this_replica.uri
> -    if cfg.replication == nil and this_replicaset.master and not is_master then
> -        cfg.replication = {this_replicaset.master.uri}
> -    else
> -        cfg.replication = {}
> -    end
> -    if was_master == is_master then
> -        cfg.read_only = not is_master
> -    end
> -    if type(box.cfg) == 'function' then
> -        cfg.instance_uuid = this_replica.uuid
> -        cfg.replicaset_uuid = this_replicaset.uuid
> -    else
> -        local info = box.info
> -        if this_replica_uuid ~= info.uuid then
> -            error(string.format('Instance UUID mismatch: already set "%s" '..
> -                                'but "%s" in arguments', info.uuid,
> -                                this_replica_uuid))
> -        end
> -        if this_replicaset.uuid ~= info.cluster.uuid then
> -            error(string.format('Replicaset UUID mismatch: already set "%s" '..
> -                                'but "%s" in vshard config', info.cluster.uuid,
> -                                this_replicaset.uuid))
> -        end
> -    end
> -    local total_bucket_count = cfg.bucket_count
> -    local rebalancer_disbalance_threshold = cfg.rebalancer_disbalance_threshold
> -    local rebalancer_max_receiving = cfg.rebalancer_max_receiving
> -    local shard_index = cfg.shard_index
> -    local collect_bucket_garbage_interval = cfg.collect_bucket_garbage_interval
> -    local collect_lua_garbage = cfg.collect_lua_garbage
> +    local total_bucket_count = vshard_cfg.bucket_count
> +    local rebalancer_disbalance_threshold =
> +        vshard_cfg.rebalancer_disbalance_threshold
> +    local rebalancer_max_receiving = vshard_cfg.rebalancer_max_receiving
> +    local shard_index = vshard_cfg.shard_index
> +    local collect_bucket_garbage_interval =
> +        vshard_cfg.collect_bucket_garbage_interval
> +    local collect_lua_garbage = vshard_cfg.collect_lua_garbage
> 
>       -- It is considered that all possible errors during cfg
>       -- process occur only before this place.
> @@ -1598,7 +1567,7 @@ local function storage_cfg(cfg, this_replica_uuid)
>       -- a new sync timeout.
>       --
>       local old_sync_timeout = M.sync_timeout
> -    M.sync_timeout = cfg.sync_timeout
> +    M.sync_timeout = vshard_cfg.sync_timeout
> 
>       if was_master and not is_master then
>           local_on_master_disable_prepare()
> @@ -1607,27 +1576,61 @@ local function storage_cfg(cfg, this_replica_uuid)
>           local_on_master_enable_prepare()
>       end
> 
> -    local box_cfg = table.copy(cfg)
> -    lcfg.remove_non_box_options(box_cfg)
> -    local ok, err = pcall(box.cfg, box_cfg)
> -    while M.errinj.ERRINJ_CFG_DELAY do
> -        lfiber.sleep(0.01)
> -    end
> -    if not ok then
> -        M.sync_timeout = old_sync_timeout
> -        if was_master and not is_master then
> -            local_on_master_disable_abort()
> +    if not is_reload then
> +        -- Do not change 'read_only' option here - if a master is
> +        -- disabled and there are triggers on master disable, then
> +        -- they would not be able to modify anything, if 'read_only'
> +        -- had been set here. 'Read_only' is set in
> +        -- local_on_master_disable after triggers and is unset in
> +        -- local_on_master_enable before triggers.
> +        --
> +        -- If a master role of the replica is not changed, then
> +        -- 'read_only' can be set right here.
> +        box_cfg.listen = box_cfg.listen or this_replica.uri
> +        if box_cfg.replication == nil and this_replicaset.master
> +           and not is_master then
> +            box_cfg.replication = {this_replicaset.master.uri}
> +        else
> +            box_cfg.replication = {}
>           end
> -        if not was_master and is_master then
> -            local_on_master_enable_abort()
> +        if was_master == is_master then
> +            box_cfg.read_only = not is_master
>           end
> -        error(err)
> +        if type(box.cfg) == 'function' then
> +            box_cfg.instance_uuid = this_replica.uuid
> +            box_cfg.replicaset_uuid = this_replicaset.uuid
> +        else
> +            local info = box.info
> +            if this_replica_uuid ~= info.uuid then
> +                error(string.format('Instance UUID mismatch: already set ' ..
> +                                    '"%s" but "%s" in arguments', info.uuid,
> +                                    this_replica_uuid))
> +            end
> +            if this_replicaset.uuid ~= info.cluster.uuid then
> +                error(string.format('Replicaset UUID mismatch: already set ' ..
> +                                    '"%s" but "%s" in vshard config',
> +                                    info.cluster.uuid, this_replicaset.uuid))
> +            end
> +        end
> +        local ok, err = pcall(box.cfg, box_cfg)
> +        while M.errinj.ERRINJ_CFG_DELAY do
> +            lfiber.sleep(0.01)
> +        end
> +        if not ok then
> +            M.sync_timeout = old_sync_timeout
> +            if was_master and not is_master then
> +                local_on_master_disable_abort()
> +            end
> +            if not was_master and is_master then
> +                local_on_master_enable_abort()
> +            end
> +            error(err)
> +        end
> +        log.info("Box has been configured")
> +        local uri = luri.parse(this_replica.uri)
> +        box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password)
>       end
> 
> -    log.info("Box has been configured")
> -    local uri = luri.parse(this_replica.uri)
> -    box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password)
> -
>       lreplicaset.rebind_replicasets(new_replicasets, M.replicasets)
>       lreplicaset.outdate_replicasets(M.replicasets)
>       M.replicasets = new_replicasets
> @@ -1639,7 +1642,7 @@ local function storage_cfg(cfg, this_replica_uuid)
>       M.shard_index = shard_index
>       M.collect_bucket_garbage_interval = collect_bucket_garbage_interval
>       M.collect_lua_garbage = collect_lua_garbage
> -    M.current_cfg = new_cfg
> +    M.current_cfg = cfg
> 
>       if was_master and not is_master then
>           local_on_master_disable()
> @@ -1875,7 +1878,7 @@ if not rawget(_G, MODULE_INTERNALS) then
>       rawset(_G, MODULE_INTERNALS, M)
>   else
>       reload_evolution.upgrade(M)
> -    storage_cfg(M.current_cfg, M.this_replica.uuid)
> +    storage_cfg(M.current_cfg, M.this_replica.uuid, true)
>       M.module_version = M.module_version + 1
>   end
> 
> @@ -1914,7 +1917,7 @@ return {
>       rebalancing_is_in_progress = rebalancing_is_in_progress,
>       recovery_wakeup = recovery_wakeup,
>       call = storage_call,
> -    cfg = storage_cfg,
> +    cfg = function(cfg, uuid) return storage_cfg(cfg, uuid, false) end,
>       info = storage_info,
>       buckets_info = storage_buckets_info,
>       buckets_count = storage_buckets_count,
> 
> 
> 




More information about the Tarantool-patches mailing list