From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 4C7C3247CD for ; Tue, 7 Aug 2018 09:20:00 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id XJRyq0A2TQON for ; Tue, 7 Aug 2018 09:20:00 -0400 (EDT) Received: from smtp39.i.mail.ru (smtp39.i.mail.ru [94.100.177.99]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id D5C9D2350F for ; Tue, 7 Aug 2018 09:19:59 -0400 (EDT) Subject: [tarantool-patches] Re: [PATCH 1/3] Update only vshard part of a cfg on reload References: <37ba10547bf64fd880734b0550e4865f58dda14a.1533054045.git.avkhatskevich@tarantool.org> <55486f60-a630-1d19-69e0-5cf99c01f319@tarantool.org> <1457a321-3ed1-ed05-74c2-e4f8508a0156@tarantool.org> From: Alex Khatskevich Message-ID: <1d95c864-6ec2-d9ee-a851-1ea1020a47ae@tarantool.org> Date: Tue, 7 Aug 2018 16:19:57 +0300 MIME-Version: 1.0 In-Reply-To: <1457a321-3ed1-ed05-74c2-e4f8508a0156@tarantool.org> Content-Type: text/plain; charset="utf-8"; format="flowed" Content-Transfer-Encoding: 8bit Content-Language: en-US Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: Vladislav Shpilevoy , tarantool-patches@freelists.org On 06.08.2018 20:03, Vladislav Shpilevoy wrote: > Thanks for the patch! See 3 comments below. > >> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua >> index 102b942..40216ea 100644 >> --- a/vshard/storage/init.lua >> +++ b/vshard/storage/init.lua >> @@ -1553,18 +1553,19 @@ local function storage_cfg(cfg, >> this_replica_uuid) >>       -- >>       -- If a master role of the replica is not changed, then >>       -- 'read_only' can be set right here. >> -    cfg.listen = cfg.listen or this_replica.uri >> -    if cfg.replication == nil and this_replicaset.master and not >> is_master then >> -        cfg.replication = {this_replicaset.master.uri} >> +    box_cfg.listen = box_cfg.listen or this_replica.uri >> +    if box_cfg.replication == nil and this_replicaset.master >> +       and not is_master then >> +        box_cfg.replication = {this_replicaset.master.uri} >>       else >> -        cfg.replication = {} >> +        box_cfg.replication = {} >>       end >>       if was_master == is_master then >> -        cfg.read_only = not is_master >> +        box_cfg.read_only = not is_master >>       end >>       if type(box.cfg) == 'function' then >> -        cfg.instance_uuid = this_replica.uuid >> -        cfg.replicaset_uuid = this_replicaset.uuid >> +        box_cfg.instance_uuid = this_replica.uuid >> +        box_cfg.replicaset_uuid = this_replicaset.uuid > > 1. All these box_cfg manipulations should be done under 'if not > is_reload' > I think. Fixed. > >>       else >>           local info = box.info >>           if this_replica_uuid ~= info.uuid then >> @@ -1607,27 +1610,27 @@ local function storage_cfg(cfg, >> this_replica_uuid) >>           local_on_master_enable_prepare() >>       end >> >> -    local box_cfg = table.copy(cfg) >> -    lcfg.remove_non_box_options(box_cfg) >> -    local ok, err = pcall(box.cfg, box_cfg) >> -    while M.errinj.ERRINJ_CFG_DELAY do >> -        lfiber.sleep(0.01) >> -    end >> -    if not ok then >> -        M.sync_timeout = old_sync_timeout >> -        if was_master and not is_master then >> -            local_on_master_disable_abort() >> +    if not is_reload then >> +        local ok, err = true, nil >> +        ok, err = pcall(box.cfg, box_cfg) > > 2. Why do you need to announce 'local ok, err' before > their usage on the next line? fixed. > > >> +        while M.errinj.ERRINJ_CFG_DELAY do >> +            lfiber.sleep(0.01) >>           end >> -        if not was_master and is_master then >> -            local_on_master_enable_abort() >> +        if not ok then >> +            M.sync_timeout = old_sync_timeout >> +            if was_master and not is_master then >> +                local_on_master_disable_abort() >> +            end >> +            if not was_master and is_master then >> +                local_on_master_enable_abort() >> +            end >> +            error(err) >>           end >> -        error(err) >> +        log.info("Box has been configured") >> +        local uri = luri.parse(this_replica.uri) >> +        box.once("vshard:storage:1", storage_schema_v1, uri.login, >> uri.password) >>       end >> >> -    log.info("Box has been configured") >> -    local uri = luri.parse(this_replica.uri) >> -    box.once("vshard:storage:1", storage_schema_v1, uri.login, >> uri.password) >> - >>       lreplicaset.rebind_replicasets(new_replicasets, M.replicasets) >>       lreplicaset.outdate_replicasets(M.replicasets) >>       M.replicasets = new_replicasets >> @@ -1874,7 +1877,7 @@ if not rawget(_G, MODULE_INTERNALS) then >>       rawset(_G, MODULE_INTERNALS, M) >>   else >>       reload_evolution.upgrade(M) >> -    storage_cfg(M.current_cfg, M.this_replica.uuid) >> +    storage_cfg(M.current_cfg, M.this_replica.uuid, true) > > 3. I see that you have stored vshard_cfg in M.current_cfg. Not a full > config. So it does not have any box options. And it causes a question > - why do you need to separate reload from non-reload, if reload anyway > in such implementation is like 'box.cfg{}' call with no parameters? > And if you do not store box_cfg options how are you going to compare > configs when we will implement atomic cfg over cluster? Fixed. And in a router too. Full diff: commit d3c35612130ff95b20245993ab5053981d3b985f Author: AKhatskevich Date:   Mon Jul 23 16:42:22 2018 +0300     Update only vshard part of a cfg on reload     Box cfg could have been changed by a user and then overridden by     an old vshard config on reload.     Since that commit, box part of a config is applied only when     it is explicitly passed to a `cfg` method.     This change is important for the multiple routers feature. diff --git a/vshard/cfg.lua b/vshard/cfg.lua index 7c9ab77..af1c3ee 100644 --- a/vshard/cfg.lua +++ b/vshard/cfg.lua @@ -221,6 +221,22 @@ local cfg_template = {      },  } +-- +-- Split it into vshard_cfg and box_cfg parts. +-- +local function cfg_split(cfg) +    local vshard_cfg = {} +    local box_cfg = {} +    for k, v in pairs(cfg) do +        if cfg_template[k] then +            vshard_cfg[k] = v +        else +            box_cfg[k] = v +        end +    end +    return vshard_cfg, box_cfg +end +  --  -- Names of options which cannot be changed during reconfigure.  -- @@ -252,24 +268,7 @@ local function cfg_check(shard_cfg, old_cfg)      return shard_cfg  end --- --- Nullify non-box options. --- -local function remove_non_box_options(cfg) -    cfg.sharding = nil -    cfg.weights = nil -    cfg.zone = nil -    cfg.bucket_count = nil -    cfg.rebalancer_disbalance_threshold = nil -    cfg.rebalancer_max_receiving = nil -    cfg.shard_index = nil -    cfg.collect_bucket_garbage_interval = nil -    cfg.collect_lua_garbage = nil -    cfg.sync_timeout = nil -    cfg.connection_outdate_delay = nil -end -  return {      check = cfg_check, -    remove_non_box_options = remove_non_box_options, +    split = cfg_split,  } diff --git a/vshard/router/init.lua b/vshard/router/init.lua index d8c026b..1e8d898 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -491,19 +491,17 @@ end  -- Configuration  -------------------------------------------------------------------------------- -local function router_cfg(cfg) +local function router_cfg(cfg, is_reload)      cfg = lcfg.check(cfg, M.current_cfg) -    local new_cfg = table.copy(cfg) +    local vshard_cfg, box_cfg = lcfg.split(cfg)      if not M.replicasets then          log.info('Starting router configuration')      else          log.info('Starting router reconfiguration')      end -    local new_replicasets = lreplicaset.buildall(cfg) -    local total_bucket_count = cfg.bucket_count -    local collect_lua_garbage = cfg.collect_lua_garbage -    local box_cfg = table.copy(cfg) -    lcfg.remove_non_box_options(box_cfg) +    local new_replicasets = lreplicaset.buildall(vshard_cfg) +    local total_bucket_count = vshard_cfg.bucket_count +    local collect_lua_garbage = vshard_cfg.collect_lua_garbage      log.info("Calling box.cfg()...")      for k, v in pairs(box_cfg) do          log.info({[k] = v}) @@ -514,8 +512,10 @@ local function router_cfg(cfg)      if M.errinj.ERRINJ_CFG then          error('Error injection: cfg')      end -    box.cfg(box_cfg) -    log.info("Box has been configured") +    if not is_reload then +        box.cfg(box_cfg) +        log.info("Box has been configured") +    end      -- Move connections from an old configuration to a new one.      -- It must be done with no yields to prevent usage both of not      -- fully moved old replicasets, and not fully built new ones. @@ -526,8 +526,9 @@ local function router_cfg(cfg)          replicaset:connect_all()      end      lreplicaset.wait_masters_connect(new_replicasets) -    lreplicaset.outdate_replicasets(M.replicasets, cfg.connection_outdate_delay) -    M.connection_outdate_delay = cfg.connection_outdate_delay +    lreplicaset.outdate_replicasets(M.replicasets, + vshard_cfg.connection_outdate_delay) +    M.connection_outdate_delay = vshard_cfg.connection_outdate_delay      M.total_bucket_count = total_bucket_count      M.collect_lua_garbage = collect_lua_garbage      M.current_cfg = cfg @@ -817,7 +818,7 @@ end  if not rawget(_G, MODULE_INTERNALS) then      rawset(_G, MODULE_INTERNALS, M)  else -    router_cfg(M.current_cfg) +    router_cfg(M.current_cfg, true)      M.module_version = M.module_version + 1  end @@ -825,7 +826,7 @@ M.discovery_f = discovery_f  M.failover_f = failover_f  return { -    cfg = router_cfg; +    cfg = function(cfg) return router_cfg(cfg, false) end;      info = router_info;      buckets_info = router_buckets_info;      call = router_call; diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index 1f29323..2080769 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -1500,13 +1500,14 @@ end  --------------------------------------------------------------------------------  -- Configuration  -------------------------------------------------------------------------------- -local function storage_cfg(cfg, this_replica_uuid) + +local function storage_cfg(cfg, this_replica_uuid, is_reload)      if this_replica_uuid == nil then          error('Usage: cfg(configuration, this_replica_uuid)')      end      cfg = lcfg.check(cfg, M.current_cfg) -    local new_cfg = table.copy(cfg) -    if cfg.weights or cfg.zone then +    local vshard_cfg, box_cfg = lcfg.split(cfg) +    if vshard_cfg.weights or vshard_cfg.zone then          error('Weights and zone are not allowed for storage configuration')      end      if M.replicasets then @@ -1520,7 +1521,7 @@ local function storage_cfg(cfg, this_replica_uuid)      local this_replicaset      local this_replica -    local new_replicasets = lreplicaset.buildall(cfg) +    local new_replicasets = lreplicaset.buildall(vshard_cfg)      local min_master      for rs_uuid, rs in pairs(new_replicasets) do          for replica_uuid, replica in pairs(rs.replicas) do @@ -1544,46 +1545,14 @@ local function storage_cfg(cfg, this_replica_uuid)          log.info('I am master')      end -    -- Do not change 'read_only' option here - if a master is -    -- disabled and there are triggers on master disable, then -    -- they would not be able to modify anything, if 'read_only' -    -- had been set here. 'Read_only' is set in -    -- local_on_master_disable after triggers and is unset in -    -- local_on_master_enable before triggers. -    -- -    -- If a master role of the replica is not changed, then -    -- 'read_only' can be set right here. -    cfg.listen = cfg.listen or this_replica.uri -    if cfg.replication == nil and this_replicaset.master and not is_master then -        cfg.replication = {this_replicaset.master.uri} -    else -        cfg.replication = {} -    end -    if was_master == is_master then -        cfg.read_only = not is_master -    end -    if type(box.cfg) == 'function' then -        cfg.instance_uuid = this_replica.uuid -        cfg.replicaset_uuid = this_replicaset.uuid -    else -        local info = box.info -        if this_replica_uuid ~= info.uuid then -            error(string.format('Instance UUID mismatch: already set "%s" '.. -                                'but "%s" in arguments', info.uuid, -                                this_replica_uuid)) -        end -        if this_replicaset.uuid ~= info.cluster.uuid then -            error(string.format('Replicaset UUID mismatch: already set "%s" '.. -                                'but "%s" in vshard config', info.cluster.uuid, -                                this_replicaset.uuid)) -        end -    end -    local total_bucket_count = cfg.bucket_count -    local rebalancer_disbalance_threshold = cfg.rebalancer_disbalance_threshold -    local rebalancer_max_receiving = cfg.rebalancer_max_receiving -    local shard_index = cfg.shard_index -    local collect_bucket_garbage_interval = cfg.collect_bucket_garbage_interval -    local collect_lua_garbage = cfg.collect_lua_garbage +    local total_bucket_count = vshard_cfg.bucket_count +    local rebalancer_disbalance_threshold = +        vshard_cfg.rebalancer_disbalance_threshold +    local rebalancer_max_receiving = vshard_cfg.rebalancer_max_receiving +    local shard_index = vshard_cfg.shard_index +    local collect_bucket_garbage_interval = +        vshard_cfg.collect_bucket_garbage_interval +    local collect_lua_garbage = vshard_cfg.collect_lua_garbage      -- It is considered that all possible errors during cfg      -- process occur only before this place. @@ -1598,7 +1567,7 @@ local function storage_cfg(cfg, this_replica_uuid)      -- a new sync timeout.      --      local old_sync_timeout = M.sync_timeout -    M.sync_timeout = cfg.sync_timeout +    M.sync_timeout = vshard_cfg.sync_timeout      if was_master and not is_master then          local_on_master_disable_prepare() @@ -1607,27 +1576,61 @@ local function storage_cfg(cfg, this_replica_uuid)          local_on_master_enable_prepare()      end -    local box_cfg = table.copy(cfg) -    lcfg.remove_non_box_options(box_cfg) -    local ok, err = pcall(box.cfg, box_cfg) -    while M.errinj.ERRINJ_CFG_DELAY do -        lfiber.sleep(0.01) -    end -    if not ok then -        M.sync_timeout = old_sync_timeout -        if was_master and not is_master then -            local_on_master_disable_abort() +    if not is_reload then +        -- Do not change 'read_only' option here - if a master is +        -- disabled and there are triggers on master disable, then +        -- they would not be able to modify anything, if 'read_only' +        -- had been set here. 'Read_only' is set in +        -- local_on_master_disable after triggers and is unset in +        -- local_on_master_enable before triggers. +        -- +        -- If a master role of the replica is not changed, then +        -- 'read_only' can be set right here. +        box_cfg.listen = box_cfg.listen or this_replica.uri +        if box_cfg.replication == nil and this_replicaset.master +           and not is_master then +            box_cfg.replication = {this_replicaset.master.uri} +        else +            box_cfg.replication = {}          end -        if not was_master and is_master then -            local_on_master_enable_abort() +        if was_master == is_master then +            box_cfg.read_only = not is_master          end -        error(err) +        if type(box.cfg) == 'function' then +            box_cfg.instance_uuid = this_replica.uuid +            box_cfg.replicaset_uuid = this_replicaset.uuid +        else +            local info = box.info +            if this_replica_uuid ~= info.uuid then +                error(string.format('Instance UUID mismatch: already set ' .. +                                    '"%s" but "%s" in arguments', info.uuid, +                                    this_replica_uuid)) +            end +            if this_replicaset.uuid ~= info.cluster.uuid then +                error(string.format('Replicaset UUID mismatch: already set ' .. +                                    '"%s" but "%s" in vshard config', +                                    info.cluster.uuid, this_replicaset.uuid)) +            end +        end +        local ok, err = pcall(box.cfg, box_cfg) +        while M.errinj.ERRINJ_CFG_DELAY do +            lfiber.sleep(0.01) +        end +        if not ok then +            M.sync_timeout = old_sync_timeout +            if was_master and not is_master then +                local_on_master_disable_abort() +            end +            if not was_master and is_master then +                local_on_master_enable_abort() +            end +            error(err) +        end +        log.info("Box has been configured") +        local uri = luri.parse(this_replica.uri) +        box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password)      end -    log.info("Box has been configured") -    local uri = luri.parse(this_replica.uri) -    box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password) -      lreplicaset.rebind_replicasets(new_replicasets, M.replicasets)      lreplicaset.outdate_replicasets(M.replicasets)      M.replicasets = new_replicasets @@ -1639,7 +1642,7 @@ local function storage_cfg(cfg, this_replica_uuid)      M.shard_index = shard_index      M.collect_bucket_garbage_interval = collect_bucket_garbage_interval      M.collect_lua_garbage = collect_lua_garbage -    M.current_cfg = new_cfg +    M.current_cfg = cfg      if was_master and not is_master then          local_on_master_disable() @@ -1875,7 +1878,7 @@ if not rawget(_G, MODULE_INTERNALS) then      rawset(_G, MODULE_INTERNALS, M)  else      reload_evolution.upgrade(M) -    storage_cfg(M.current_cfg, M.this_replica.uuid) +    storage_cfg(M.current_cfg, M.this_replica.uuid, true)      M.module_version = M.module_version + 1  end @@ -1914,7 +1917,7 @@ return {      rebalancing_is_in_progress = rebalancing_is_in_progress,      recovery_wakeup = recovery_wakeup,      call = storage_call, -    cfg = storage_cfg, +    cfg = function(cfg, uuid) return storage_cfg(cfg, uuid, false) end,      info = storage_info,      buckets_info = storage_buckets_info,      buckets_count = storage_buckets_count,