From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 591C321C0C for ; Wed, 8 Aug 2018 07:17:07 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id gkSxD0GIgq7k for ; Wed, 8 Aug 2018 07:17:07 -0400 (EDT) Received: from smtp32.i.mail.ru (smtp32.i.mail.ru [94.100.177.92]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 7639F28602 for ; Wed, 8 Aug 2018 07:17:06 -0400 (EDT) Subject: [tarantool-patches] Re: [PATCH 1/3] Update only vshard part of a cfg on reload References: <37ba10547bf64fd880734b0550e4865f58dda14a.1533054045.git.avkhatskevich@tarantool.org> <55486f60-a630-1d19-69e0-5cf99c01f319@tarantool.org> <1457a321-3ed1-ed05-74c2-e4f8508a0156@tarantool.org> <1d95c864-6ec2-d9ee-a851-1ea1020a47ae@tarantool.org> From: Vladislav Shpilevoy Message-ID: <4709e32f-a78d-7af0-64ec-eaed383e4227@tarantool.org> Date: Wed, 8 Aug 2018 14:17:02 +0300 MIME-Version: 1.0 In-Reply-To: <1d95c864-6ec2-d9ee-a851-1ea1020a47ae@tarantool.org> Content-Type: text/plain; charset="utf-8"; format="flowed" Content-Language: en-US Content-Transfer-Encoding: 8bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org, Alex Khatskevich Thanks for the patch! Pushed into the master. On 07/08/2018 16:19, Alex Khatskevich wrote: > > On 06.08.2018 20:03, Vladislav Shpilevoy wrote: >> Thanks for the patch! See 3 comments below. >> >>> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua >>> index 102b942..40216ea 100644 >>> --- a/vshard/storage/init.lua >>> +++ b/vshard/storage/init.lua >>> @@ -1553,18 +1553,19 @@ local function storage_cfg(cfg, this_replica_uuid) >>>       -- >>>       -- If a master role of the replica is not changed, then >>>       -- 'read_only' can be set right here. >>> -    cfg.listen = cfg.listen or this_replica.uri >>> -    if cfg.replication == nil and this_replicaset.master and not is_master then >>> -        cfg.replication = {this_replicaset.master.uri} >>> +    box_cfg.listen = box_cfg.listen or this_replica.uri >>> +    if box_cfg.replication == nil and this_replicaset.master >>> +       and not is_master then >>> +        box_cfg.replication = {this_replicaset.master.uri} >>>       else >>> -        cfg.replication = {} >>> +        box_cfg.replication = {} >>>       end >>>       if was_master == is_master then >>> -        cfg.read_only = not is_master >>> +        box_cfg.read_only = not is_master >>>       end >>>       if type(box.cfg) == 'function' then >>> -        cfg.instance_uuid = this_replica.uuid >>> -        cfg.replicaset_uuid = this_replicaset.uuid >>> +        box_cfg.instance_uuid = this_replica.uuid >>> +        box_cfg.replicaset_uuid = this_replicaset.uuid >> >> 1. All these box_cfg manipulations should be done under 'if not is_reload' >> I think. > Fixed. >> >>>       else >>>           local info = box.info >>>           if this_replica_uuid ~= info.uuid then >>> @@ -1607,27 +1610,27 @@ local function storage_cfg(cfg, this_replica_uuid) >>>           local_on_master_enable_prepare() >>>       end >>> >>> -    local box_cfg = table.copy(cfg) >>> -    lcfg.remove_non_box_options(box_cfg) >>> -    local ok, err = pcall(box.cfg, box_cfg) >>> -    while M.errinj.ERRINJ_CFG_DELAY do >>> -        lfiber.sleep(0.01) >>> -    end >>> -    if not ok then >>> -        M.sync_timeout = old_sync_timeout >>> -        if was_master and not is_master then >>> -            local_on_master_disable_abort() >>> +    if not is_reload then >>> +        local ok, err = true, nil >>> +        ok, err = pcall(box.cfg, box_cfg) >> >> 2. Why do you need to announce 'local ok, err' before >> their usage on the next line? > fixed. >> >> >>> +        while M.errinj.ERRINJ_CFG_DELAY do >>> +            lfiber.sleep(0.01) >>>           end >>> -        if not was_master and is_master then >>> -            local_on_master_enable_abort() >>> +        if not ok then >>> +            M.sync_timeout = old_sync_timeout >>> +            if was_master and not is_master then >>> +                local_on_master_disable_abort() >>> +            end >>> +            if not was_master and is_master then >>> +                local_on_master_enable_abort() >>> +            end >>> +            error(err) >>>           end >>> -        error(err) >>> +        log.info("Box has been configured") >>> +        local uri = luri.parse(this_replica.uri) >>> +        box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password) >>>       end >>> >>> -    log.info("Box has been configured") >>> -    local uri = luri.parse(this_replica.uri) >>> -    box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password) >>> - >>>       lreplicaset.rebind_replicasets(new_replicasets, M.replicasets) >>>       lreplicaset.outdate_replicasets(M.replicasets) >>>       M.replicasets = new_replicasets >>> @@ -1874,7 +1877,7 @@ if not rawget(_G, MODULE_INTERNALS) then >>>       rawset(_G, MODULE_INTERNALS, M) >>>   else >>>       reload_evolution.upgrade(M) >>> -    storage_cfg(M.current_cfg, M.this_replica.uuid) >>> +    storage_cfg(M.current_cfg, M.this_replica.uuid, true) >> >> 3. I see that you have stored vshard_cfg in M.current_cfg. Not a full >> config. So it does not have any box options. And it causes a question >> - why do you need to separate reload from non-reload, if reload anyway >> in such implementation is like 'box.cfg{}' call with no parameters? >> And if you do not store box_cfg options how are you going to compare >> configs when we will implement atomic cfg over cluster? > Fixed. And in a router too. > > > > Full diff: > > commit d3c35612130ff95b20245993ab5053981d3b985f > Author: AKhatskevich > Date:   Mon Jul 23 16:42:22 2018 +0300 > >     Update only vshard part of a cfg on reload > >     Box cfg could have been changed by a user and then overridden by >     an old vshard config on reload. > >     Since that commit, box part of a config is applied only when >     it is explicitly passed to a `cfg` method. > >     This change is important for the multiple routers feature. > > diff --git a/vshard/cfg.lua b/vshard/cfg.lua > index 7c9ab77..af1c3ee 100644 > --- a/vshard/cfg.lua > +++ b/vshard/cfg.lua > @@ -221,6 +221,22 @@ local cfg_template = { >      }, >  } > > +-- > +-- Split it into vshard_cfg and box_cfg parts. > +-- > +local function cfg_split(cfg) > +    local vshard_cfg = {} > +    local box_cfg = {} > +    for k, v in pairs(cfg) do > +        if cfg_template[k] then > +            vshard_cfg[k] = v > +        else > +            box_cfg[k] = v > +        end > +    end > +    return vshard_cfg, box_cfg > +end > + >  -- >  -- Names of options which cannot be changed during reconfigure. >  -- > @@ -252,24 +268,7 @@ local function cfg_check(shard_cfg, old_cfg) >      return shard_cfg >  end > > --- > --- Nullify non-box options. > --- > -local function remove_non_box_options(cfg) > -    cfg.sharding = nil > -    cfg.weights = nil > -    cfg.zone = nil > -    cfg.bucket_count = nil > -    cfg.rebalancer_disbalance_threshold = nil > -    cfg.rebalancer_max_receiving = nil > -    cfg.shard_index = nil > -    cfg.collect_bucket_garbage_interval = nil > -    cfg.collect_lua_garbage = nil > -    cfg.sync_timeout = nil > -    cfg.connection_outdate_delay = nil > -end > - >  return { >      check = cfg_check, > -    remove_non_box_options = remove_non_box_options, > +    split = cfg_split, >  } > diff --git a/vshard/router/init.lua b/vshard/router/init.lua > index d8c026b..1e8d898 100644 > --- a/vshard/router/init.lua > +++ b/vshard/router/init.lua > @@ -491,19 +491,17 @@ end >  -- Configuration >  -------------------------------------------------------------------------------- > > -local function router_cfg(cfg) > +local function router_cfg(cfg, is_reload) >      cfg = lcfg.check(cfg, M.current_cfg) > -    local new_cfg = table.copy(cfg) > +    local vshard_cfg, box_cfg = lcfg.split(cfg) >      if not M.replicasets then >          log.info('Starting router configuration') >      else >          log.info('Starting router reconfiguration') >      end > -    local new_replicasets = lreplicaset.buildall(cfg) > -    local total_bucket_count = cfg.bucket_count > -    local collect_lua_garbage = cfg.collect_lua_garbage > -    local box_cfg = table.copy(cfg) > -    lcfg.remove_non_box_options(box_cfg) > +    local new_replicasets = lreplicaset.buildall(vshard_cfg) > +    local total_bucket_count = vshard_cfg.bucket_count > +    local collect_lua_garbage = vshard_cfg.collect_lua_garbage >      log.info("Calling box.cfg()...") >      for k, v in pairs(box_cfg) do >          log.info({[k] = v}) > @@ -514,8 +512,10 @@ local function router_cfg(cfg) >      if M.errinj.ERRINJ_CFG then >          error('Error injection: cfg') >      end > -    box.cfg(box_cfg) > -    log.info("Box has been configured") > +    if not is_reload then > +        box.cfg(box_cfg) > +        log.info("Box has been configured") > +    end >      -- Move connections from an old configuration to a new one. >      -- It must be done with no yields to prevent usage both of not >      -- fully moved old replicasets, and not fully built new ones. > @@ -526,8 +526,9 @@ local function router_cfg(cfg) >          replicaset:connect_all() >      end >      lreplicaset.wait_masters_connect(new_replicasets) > -    lreplicaset.outdate_replicasets(M.replicasets, cfg.connection_outdate_delay) > -    M.connection_outdate_delay = cfg.connection_outdate_delay > +    lreplicaset.outdate_replicasets(M.replicasets, > + vshard_cfg.connection_outdate_delay) > +    M.connection_outdate_delay = vshard_cfg.connection_outdate_delay >      M.total_bucket_count = total_bucket_count >      M.collect_lua_garbage = collect_lua_garbage >      M.current_cfg = cfg > @@ -817,7 +818,7 @@ end >  if not rawget(_G, MODULE_INTERNALS) then >      rawset(_G, MODULE_INTERNALS, M) >  else > -    router_cfg(M.current_cfg) > +    router_cfg(M.current_cfg, true) >      M.module_version = M.module_version + 1 >  end > > @@ -825,7 +826,7 @@ M.discovery_f = discovery_f >  M.failover_f = failover_f > >  return { > -    cfg = router_cfg; > +    cfg = function(cfg) return router_cfg(cfg, false) end; >      info = router_info; >      buckets_info = router_buckets_info; >      call = router_call; > diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua > index 1f29323..2080769 100644 > --- a/vshard/storage/init.lua > +++ b/vshard/storage/init.lua > @@ -1500,13 +1500,14 @@ end >  -------------------------------------------------------------------------------- >  -- Configuration >  -------------------------------------------------------------------------------- > -local function storage_cfg(cfg, this_replica_uuid) > + > +local function storage_cfg(cfg, this_replica_uuid, is_reload) >      if this_replica_uuid == nil then >          error('Usage: cfg(configuration, this_replica_uuid)') >      end >      cfg = lcfg.check(cfg, M.current_cfg) > -    local new_cfg = table.copy(cfg) > -    if cfg.weights or cfg.zone then > +    local vshard_cfg, box_cfg = lcfg.split(cfg) > +    if vshard_cfg.weights or vshard_cfg.zone then >          error('Weights and zone are not allowed for storage configuration') >      end >      if M.replicasets then > @@ -1520,7 +1521,7 @@ local function storage_cfg(cfg, this_replica_uuid) > >      local this_replicaset >      local this_replica > -    local new_replicasets = lreplicaset.buildall(cfg) > +    local new_replicasets = lreplicaset.buildall(vshard_cfg) >      local min_master >      for rs_uuid, rs in pairs(new_replicasets) do >          for replica_uuid, replica in pairs(rs.replicas) do > @@ -1544,46 +1545,14 @@ local function storage_cfg(cfg, this_replica_uuid) >          log.info('I am master') >      end > > -    -- Do not change 'read_only' option here - if a master is > -    -- disabled and there are triggers on master disable, then > -    -- they would not be able to modify anything, if 'read_only' > -    -- had been set here. 'Read_only' is set in > -    -- local_on_master_disable after triggers and is unset in > -    -- local_on_master_enable before triggers. > -    -- > -    -- If a master role of the replica is not changed, then > -    -- 'read_only' can be set right here. > -    cfg.listen = cfg.listen or this_replica.uri > -    if cfg.replication == nil and this_replicaset.master and not is_master then > -        cfg.replication = {this_replicaset.master.uri} > -    else > -        cfg.replication = {} > -    end > -    if was_master == is_master then > -        cfg.read_only = not is_master > -    end > -    if type(box.cfg) == 'function' then > -        cfg.instance_uuid = this_replica.uuid > -        cfg.replicaset_uuid = this_replicaset.uuid > -    else > -        local info = box.info > -        if this_replica_uuid ~= info.uuid then > -            error(string.format('Instance UUID mismatch: already set "%s" '.. > -                                'but "%s" in arguments', info.uuid, > -                                this_replica_uuid)) > -        end > -        if this_replicaset.uuid ~= info.cluster.uuid then > -            error(string.format('Replicaset UUID mismatch: already set "%s" '.. > -                                'but "%s" in vshard config', info.cluster.uuid, > -                                this_replicaset.uuid)) > -        end > -    end > -    local total_bucket_count = cfg.bucket_count > -    local rebalancer_disbalance_threshold = cfg.rebalancer_disbalance_threshold > -    local rebalancer_max_receiving = cfg.rebalancer_max_receiving > -    local shard_index = cfg.shard_index > -    local collect_bucket_garbage_interval = cfg.collect_bucket_garbage_interval > -    local collect_lua_garbage = cfg.collect_lua_garbage > +    local total_bucket_count = vshard_cfg.bucket_count > +    local rebalancer_disbalance_threshold = > +        vshard_cfg.rebalancer_disbalance_threshold > +    local rebalancer_max_receiving = vshard_cfg.rebalancer_max_receiving > +    local shard_index = vshard_cfg.shard_index > +    local collect_bucket_garbage_interval = > +        vshard_cfg.collect_bucket_garbage_interval > +    local collect_lua_garbage = vshard_cfg.collect_lua_garbage > >      -- It is considered that all possible errors during cfg >      -- process occur only before this place. > @@ -1598,7 +1567,7 @@ local function storage_cfg(cfg, this_replica_uuid) >      -- a new sync timeout. >      -- >      local old_sync_timeout = M.sync_timeout > -    M.sync_timeout = cfg.sync_timeout > +    M.sync_timeout = vshard_cfg.sync_timeout > >      if was_master and not is_master then >          local_on_master_disable_prepare() > @@ -1607,27 +1576,61 @@ local function storage_cfg(cfg, this_replica_uuid) >          local_on_master_enable_prepare() >      end > > -    local box_cfg = table.copy(cfg) > -    lcfg.remove_non_box_options(box_cfg) > -    local ok, err = pcall(box.cfg, box_cfg) > -    while M.errinj.ERRINJ_CFG_DELAY do > -        lfiber.sleep(0.01) > -    end > -    if not ok then > -        M.sync_timeout = old_sync_timeout > -        if was_master and not is_master then > -            local_on_master_disable_abort() > +    if not is_reload then > +        -- Do not change 'read_only' option here - if a master is > +        -- disabled and there are triggers on master disable, then > +        -- they would not be able to modify anything, if 'read_only' > +        -- had been set here. 'Read_only' is set in > +        -- local_on_master_disable after triggers and is unset in > +        -- local_on_master_enable before triggers. > +        -- > +        -- If a master role of the replica is not changed, then > +        -- 'read_only' can be set right here. > +        box_cfg.listen = box_cfg.listen or this_replica.uri > +        if box_cfg.replication == nil and this_replicaset.master > +           and not is_master then > +            box_cfg.replication = {this_replicaset.master.uri} > +        else > +            box_cfg.replication = {} >          end > -        if not was_master and is_master then > -            local_on_master_enable_abort() > +        if was_master == is_master then > +            box_cfg.read_only = not is_master >          end > -        error(err) > +        if type(box.cfg) == 'function' then > +            box_cfg.instance_uuid = this_replica.uuid > +            box_cfg.replicaset_uuid = this_replicaset.uuid > +        else > +            local info = box.info > +            if this_replica_uuid ~= info.uuid then > +                error(string.format('Instance UUID mismatch: already set ' .. > +                                    '"%s" but "%s" in arguments', info.uuid, > +                                    this_replica_uuid)) > +            end > +            if this_replicaset.uuid ~= info.cluster.uuid then > +                error(string.format('Replicaset UUID mismatch: already set ' .. > +                                    '"%s" but "%s" in vshard config', > +                                    info.cluster.uuid, this_replicaset.uuid)) > +            end > +        end > +        local ok, err = pcall(box.cfg, box_cfg) > +        while M.errinj.ERRINJ_CFG_DELAY do > +            lfiber.sleep(0.01) > +        end > +        if not ok then > +            M.sync_timeout = old_sync_timeout > +            if was_master and not is_master then > +                local_on_master_disable_abort() > +            end > +            if not was_master and is_master then > +                local_on_master_enable_abort() > +            end > +            error(err) > +        end > +        log.info("Box has been configured") > +        local uri = luri.parse(this_replica.uri) > +        box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password) >      end > > -    log.info("Box has been configured") > -    local uri = luri.parse(this_replica.uri) > -    box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password) > - >      lreplicaset.rebind_replicasets(new_replicasets, M.replicasets) >      lreplicaset.outdate_replicasets(M.replicasets) >      M.replicasets = new_replicasets > @@ -1639,7 +1642,7 @@ local function storage_cfg(cfg, this_replica_uuid) >      M.shard_index = shard_index >      M.collect_bucket_garbage_interval = collect_bucket_garbage_interval >      M.collect_lua_garbage = collect_lua_garbage > -    M.current_cfg = new_cfg > +    M.current_cfg = cfg > >      if was_master and not is_master then >          local_on_master_disable() > @@ -1875,7 +1878,7 @@ if not rawget(_G, MODULE_INTERNALS) then >      rawset(_G, MODULE_INTERNALS, M) >  else >      reload_evolution.upgrade(M) > -    storage_cfg(M.current_cfg, M.this_replica.uuid) > +    storage_cfg(M.current_cfg, M.this_replica.uuid, true) >      M.module_version = M.module_version + 1 >  end > > @@ -1914,7 +1917,7 @@ return { >      rebalancing_is_in_progress = rebalancing_is_in_progress, >      recovery_wakeup = recovery_wakeup, >      call = storage_call, > -    cfg = storage_cfg, > +    cfg = function(cfg, uuid) return storage_cfg(cfg, uuid, false) end, >      info = storage_info, >      buckets_info = storage_buckets_info, >      buckets_count = storage_buckets_count, > > >