[tarantool-patches] Re: [PATCH 1/3] Update only vshard part of a cfg on reload

Alex Khatskevich avkhatskevich at tarantool.org
Tue Aug 7 16:19:57 MSK 2018


On 06.08.2018 20:03, Vladislav Shpilevoy wrote:
> Thanks for the patch! See 3 comments below.
>
>> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
>> index 102b942..40216ea 100644
>> --- a/vshard/storage/init.lua
>> +++ b/vshard/storage/init.lua
>> @@ -1553,18 +1553,19 @@ local function storage_cfg(cfg, 
>> this_replica_uuid)
>>       --
>>       -- If a master role of the replica is not changed, then
>>       -- 'read_only' can be set right here.
>> -    cfg.listen = cfg.listen or this_replica.uri
>> -    if cfg.replication == nil and this_replicaset.master and not 
>> is_master then
>> -        cfg.replication = {this_replicaset.master.uri}
>> +    box_cfg.listen = box_cfg.listen or this_replica.uri
>> +    if box_cfg.replication == nil and this_replicaset.master
>> +       and not is_master then
>> +        box_cfg.replication = {this_replicaset.master.uri}
>>       else
>> -        cfg.replication = {}
>> +        box_cfg.replication = {}
>>       end
>>       if was_master == is_master then
>> -        cfg.read_only = not is_master
>> +        box_cfg.read_only = not is_master
>>       end
>>       if type(box.cfg) == 'function' then
>> -        cfg.instance_uuid = this_replica.uuid
>> -        cfg.replicaset_uuid = this_replicaset.uuid
>> +        box_cfg.instance_uuid = this_replica.uuid
>> +        box_cfg.replicaset_uuid = this_replicaset.uuid
>
> 1. All these box_cfg manipulations should be done under 'if not 
> is_reload'
> I think.
Fixed.
>
>>       else
>>           local info = box.info
>>           if this_replica_uuid ~= info.uuid then
>> @@ -1607,27 +1610,27 @@ local function storage_cfg(cfg, 
>> this_replica_uuid)
>>           local_on_master_enable_prepare()
>>       end
>>
>> -    local box_cfg = table.copy(cfg)
>> -    lcfg.remove_non_box_options(box_cfg)
>> -    local ok, err = pcall(box.cfg, box_cfg)
>> -    while M.errinj.ERRINJ_CFG_DELAY do
>> -        lfiber.sleep(0.01)
>> -    end
>> -    if not ok then
>> -        M.sync_timeout = old_sync_timeout
>> -        if was_master and not is_master then
>> -            local_on_master_disable_abort()
>> +    if not is_reload then
>> +        local ok, err = true, nil
>> +        ok, err = pcall(box.cfg, box_cfg)
>
> 2. Why do you need to announce 'local ok, err' before
> their usage on the next line?
fixed.
>
>
>> +        while M.errinj.ERRINJ_CFG_DELAY do
>> +            lfiber.sleep(0.01)
>>           end
>> -        if not was_master and is_master then
>> -            local_on_master_enable_abort()
>> +        if not ok then
>> +            M.sync_timeout = old_sync_timeout
>> +            if was_master and not is_master then
>> +                local_on_master_disable_abort()
>> +            end
>> +            if not was_master and is_master then
>> +                local_on_master_enable_abort()
>> +            end
>> +            error(err)
>>           end
>> -        error(err)
>> +        log.info("Box has been configured")
>> +        local uri = luri.parse(this_replica.uri)
>> +        box.once("vshard:storage:1", storage_schema_v1, uri.login, 
>> uri.password)
>>       end
>>
>> -    log.info("Box has been configured")
>> -    local uri = luri.parse(this_replica.uri)
>> -    box.once("vshard:storage:1", storage_schema_v1, uri.login, 
>> uri.password)
>> -
>>       lreplicaset.rebind_replicasets(new_replicasets, M.replicasets)
>>       lreplicaset.outdate_replicasets(M.replicasets)
>>       M.replicasets = new_replicasets
>> @@ -1874,7 +1877,7 @@ if not rawget(_G, MODULE_INTERNALS) then
>>       rawset(_G, MODULE_INTERNALS, M)
>>   else
>>       reload_evolution.upgrade(M)
>> -    storage_cfg(M.current_cfg, M.this_replica.uuid)
>> +    storage_cfg(M.current_cfg, M.this_replica.uuid, true)
>
> 3. I see that you have stored vshard_cfg in M.current_cfg. Not a full
> config. So it does not have any box options. And it causes a question
> - why do you need to separate reload from non-reload, if reload anyway
> in such implementation is like 'box.cfg{}' call with no parameters?
> And if you do not store box_cfg options how are you going to compare
> configs when we will implement atomic cfg over cluster?
Fixed. And in the router too.



Full diff:

commit d3c35612130ff95b20245993ab5053981d3b985f
Author: AKhatskevich <avkhatskevich at tarantool.org>
Date:   Mon Jul 23 16:42:22 2018 +0300

     Update only vshard part of a cfg on reload

     The box cfg could have been changed by a user and then overridden
     by an old vshard config on reload.

     Since this commit, the box part of a config is applied only when
     it is explicitly passed to the `cfg` method.

     This change is important for the multiple routers feature.

diff --git a/vshard/cfg.lua b/vshard/cfg.lua
index 7c9ab77..af1c3ee 100644
--- a/vshard/cfg.lua
+++ b/vshard/cfg.lua
@@ -221,6 +221,22 @@ local cfg_template = {
      },
  }

+--
+-- Split it into vshard_cfg and box_cfg parts.
+--
+local function cfg_split(cfg)
+    local vshard_cfg = {}
+    local box_cfg = {}
+    for k, v in pairs(cfg) do
+        if cfg_template[k] then
+            vshard_cfg[k] = v
+        else
+            box_cfg[k] = v
+        end
+    end
+    return vshard_cfg, box_cfg
+end
+
  --
  -- Names of options which cannot be changed during reconfigure.
  --
@@ -252,24 +268,7 @@ local function cfg_check(shard_cfg, old_cfg)
      return shard_cfg
  end

---
--- Nullify non-box options.
---
-local function remove_non_box_options(cfg)
-    cfg.sharding = nil
-    cfg.weights = nil
-    cfg.zone = nil
-    cfg.bucket_count = nil
-    cfg.rebalancer_disbalance_threshold = nil
-    cfg.rebalancer_max_receiving = nil
-    cfg.shard_index = nil
-    cfg.collect_bucket_garbage_interval = nil
-    cfg.collect_lua_garbage = nil
-    cfg.sync_timeout = nil
-    cfg.connection_outdate_delay = nil
-end
-
  return {
      check = cfg_check,
-    remove_non_box_options = remove_non_box_options,
+    split = cfg_split,
  }
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index d8c026b..1e8d898 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -491,19 +491,17 @@ end
  -- Configuration
  --------------------------------------------------------------------------------

-local function router_cfg(cfg)
+local function router_cfg(cfg, is_reload)
      cfg = lcfg.check(cfg, M.current_cfg)
-    local new_cfg = table.copy(cfg)
+    local vshard_cfg, box_cfg = lcfg.split(cfg)
      if not M.replicasets then
          log.info('Starting router configuration')
      else
          log.info('Starting router reconfiguration')
      end
-    local new_replicasets = lreplicaset.buildall(cfg)
-    local total_bucket_count = cfg.bucket_count
-    local collect_lua_garbage = cfg.collect_lua_garbage
-    local box_cfg = table.copy(cfg)
-    lcfg.remove_non_box_options(box_cfg)
+    local new_replicasets = lreplicaset.buildall(vshard_cfg)
+    local total_bucket_count = vshard_cfg.bucket_count
+    local collect_lua_garbage = vshard_cfg.collect_lua_garbage
      log.info("Calling box.cfg()...")
      for k, v in pairs(box_cfg) do
          log.info({[k] = v})
@@ -514,8 +512,10 @@ local function router_cfg(cfg)
      if M.errinj.ERRINJ_CFG then
          error('Error injection: cfg')
      end
-    box.cfg(box_cfg)
-    log.info("Box has been configured")
+    if not is_reload then
+        box.cfg(box_cfg)
+        log.info("Box has been configured")
+    end
      -- Move connections from an old configuration to a new one.
      -- It must be done with no yields to prevent usage both of not
      -- fully moved old replicasets, and not fully built new ones.
@@ -526,8 +526,9 @@ local function router_cfg(cfg)
          replicaset:connect_all()
      end
      lreplicaset.wait_masters_connect(new_replicasets)
-    lreplicaset.outdate_replicasets(M.replicasets, 
cfg.connection_outdate_delay)
-    M.connection_outdate_delay = cfg.connection_outdate_delay
+    lreplicaset.outdate_replicasets(M.replicasets,
+ vshard_cfg.connection_outdate_delay)
+    M.connection_outdate_delay = vshard_cfg.connection_outdate_delay
      M.total_bucket_count = total_bucket_count
      M.collect_lua_garbage = collect_lua_garbage
      M.current_cfg = cfg
@@ -817,7 +818,7 @@ end
  if not rawget(_G, MODULE_INTERNALS) then
      rawset(_G, MODULE_INTERNALS, M)
  else
-    router_cfg(M.current_cfg)
+    router_cfg(M.current_cfg, true)
      M.module_version = M.module_version + 1
  end

@@ -825,7 +826,7 @@ M.discovery_f = discovery_f
  M.failover_f = failover_f

  return {
-    cfg = router_cfg;
+    cfg = function(cfg) return router_cfg(cfg, false) end;
      info = router_info;
      buckets_info = router_buckets_info;
      call = router_call;
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 1f29323..2080769 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -1500,13 +1500,14 @@ end
  --------------------------------------------------------------------------------
  -- Configuration
  --------------------------------------------------------------------------------
-local function storage_cfg(cfg, this_replica_uuid)
+
+local function storage_cfg(cfg, this_replica_uuid, is_reload)
      if this_replica_uuid == nil then
          error('Usage: cfg(configuration, this_replica_uuid)')
      end
      cfg = lcfg.check(cfg, M.current_cfg)
-    local new_cfg = table.copy(cfg)
-    if cfg.weights or cfg.zone then
+    local vshard_cfg, box_cfg = lcfg.split(cfg)
+    if vshard_cfg.weights or vshard_cfg.zone then
          error('Weights and zone are not allowed for storage 
configuration')
      end
      if M.replicasets then
@@ -1520,7 +1521,7 @@ local function storage_cfg(cfg, this_replica_uuid)

      local this_replicaset
      local this_replica
-    local new_replicasets = lreplicaset.buildall(cfg)
+    local new_replicasets = lreplicaset.buildall(vshard_cfg)
      local min_master
      for rs_uuid, rs in pairs(new_replicasets) do
          for replica_uuid, replica in pairs(rs.replicas) do
@@ -1544,46 +1545,14 @@ local function storage_cfg(cfg, this_replica_uuid)
          log.info('I am master')
      end

-    -- Do not change 'read_only' option here - if a master is
-    -- disabled and there are triggers on master disable, then
-    -- they would not be able to modify anything, if 'read_only'
-    -- had been set here. 'Read_only' is set in
-    -- local_on_master_disable after triggers and is unset in
-    -- local_on_master_enable before triggers.
-    --
-    -- If a master role of the replica is not changed, then
-    -- 'read_only' can be set right here.
-    cfg.listen = cfg.listen or this_replica.uri
-    if cfg.replication == nil and this_replicaset.master and not 
is_master then
-        cfg.replication = {this_replicaset.master.uri}
-    else
-        cfg.replication = {}
-    end
-    if was_master == is_master then
-        cfg.read_only = not is_master
-    end
-    if type(box.cfg) == 'function' then
-        cfg.instance_uuid = this_replica.uuid
-        cfg.replicaset_uuid = this_replicaset.uuid
-    else
-        local info = box.info
-        if this_replica_uuid ~= info.uuid then
-            error(string.format('Instance UUID mismatch: already set 
"%s" '..
-                                'but "%s" in arguments', info.uuid,
-                                this_replica_uuid))
-        end
-        if this_replicaset.uuid ~= info.cluster.uuid then
-            error(string.format('Replicaset UUID mismatch: already set 
"%s" '..
-                                'but "%s" in vshard config', 
info.cluster.uuid,
-                                this_replicaset.uuid))
-        end
-    end
-    local total_bucket_count = cfg.bucket_count
-    local rebalancer_disbalance_threshold = 
cfg.rebalancer_disbalance_threshold
-    local rebalancer_max_receiving = cfg.rebalancer_max_receiving
-    local shard_index = cfg.shard_index
-    local collect_bucket_garbage_interval = 
cfg.collect_bucket_garbage_interval
-    local collect_lua_garbage = cfg.collect_lua_garbage
+    local total_bucket_count = vshard_cfg.bucket_count
+    local rebalancer_disbalance_threshold =
+        vshard_cfg.rebalancer_disbalance_threshold
+    local rebalancer_max_receiving = vshard_cfg.rebalancer_max_receiving
+    local shard_index = vshard_cfg.shard_index
+    local collect_bucket_garbage_interval =
+        vshard_cfg.collect_bucket_garbage_interval
+    local collect_lua_garbage = vshard_cfg.collect_lua_garbage

      -- It is considered that all possible errors during cfg
      -- process occur only before this place.
@@ -1598,7 +1567,7 @@ local function storage_cfg(cfg, this_replica_uuid)
      -- a new sync timeout.
      --
      local old_sync_timeout = M.sync_timeout
-    M.sync_timeout = cfg.sync_timeout
+    M.sync_timeout = vshard_cfg.sync_timeout

      if was_master and not is_master then
          local_on_master_disable_prepare()
@@ -1607,27 +1576,61 @@ local function storage_cfg(cfg, this_replica_uuid)
          local_on_master_enable_prepare()
      end

-    local box_cfg = table.copy(cfg)
-    lcfg.remove_non_box_options(box_cfg)
-    local ok, err = pcall(box.cfg, box_cfg)
-    while M.errinj.ERRINJ_CFG_DELAY do
-        lfiber.sleep(0.01)
-    end
-    if not ok then
-        M.sync_timeout = old_sync_timeout
-        if was_master and not is_master then
-            local_on_master_disable_abort()
+    if not is_reload then
+        -- Do not change 'read_only' option here - if a master is
+        -- disabled and there are triggers on master disable, then
+        -- they would not be able to modify anything, if 'read_only'
+        -- had been set here. 'Read_only' is set in
+        -- local_on_master_disable after triggers and is unset in
+        -- local_on_master_enable before triggers.
+        --
+        -- If a master role of the replica is not changed, then
+        -- 'read_only' can be set right here.
+        box_cfg.listen = box_cfg.listen or this_replica.uri
+        if box_cfg.replication == nil and this_replicaset.master
+           and not is_master then
+            box_cfg.replication = {this_replicaset.master.uri}
+        else
+            box_cfg.replication = {}
          end
-        if not was_master and is_master then
-            local_on_master_enable_abort()
+        if was_master == is_master then
+            box_cfg.read_only = not is_master
          end
-        error(err)
+        if type(box.cfg) == 'function' then
+            box_cfg.instance_uuid = this_replica.uuid
+            box_cfg.replicaset_uuid = this_replicaset.uuid
+        else
+            local info = box.info
+            if this_replica_uuid ~= info.uuid then
+                error(string.format('Instance UUID mismatch: already 
set ' ..
+                                    '"%s" but "%s" in arguments', 
info.uuid,
+                                    this_replica_uuid))
+            end
+            if this_replicaset.uuid ~= info.cluster.uuid then
+                error(string.format('Replicaset UUID mismatch: already 
set ' ..
+                                    '"%s" but "%s" in vshard config',
+                                    info.cluster.uuid, 
this_replicaset.uuid))
+            end
+        end
+        local ok, err = pcall(box.cfg, box_cfg)
+        while M.errinj.ERRINJ_CFG_DELAY do
+            lfiber.sleep(0.01)
+        end
+        if not ok then
+            M.sync_timeout = old_sync_timeout
+            if was_master and not is_master then
+                local_on_master_disable_abort()
+            end
+            if not was_master and is_master then
+                local_on_master_enable_abort()
+            end
+            error(err)
+        end
+        log.info("Box has been configured")
+        local uri = luri.parse(this_replica.uri)
+        box.once("vshard:storage:1", storage_schema_v1, uri.login, 
uri.password)
      end

-    log.info("Box has been configured")
-    local uri = luri.parse(this_replica.uri)
-    box.once("vshard:storage:1", storage_schema_v1, uri.login, 
uri.password)
-
      lreplicaset.rebind_replicasets(new_replicasets, M.replicasets)
      lreplicaset.outdate_replicasets(M.replicasets)
      M.replicasets = new_replicasets
@@ -1639,7 +1642,7 @@ local function storage_cfg(cfg, this_replica_uuid)
      M.shard_index = shard_index
      M.collect_bucket_garbage_interval = collect_bucket_garbage_interval
      M.collect_lua_garbage = collect_lua_garbage
-    M.current_cfg = new_cfg
+    M.current_cfg = cfg

      if was_master and not is_master then
          local_on_master_disable()
@@ -1875,7 +1878,7 @@ if not rawget(_G, MODULE_INTERNALS) then
      rawset(_G, MODULE_INTERNALS, M)
  else
      reload_evolution.upgrade(M)
-    storage_cfg(M.current_cfg, M.this_replica.uuid)
+    storage_cfg(M.current_cfg, M.this_replica.uuid, true)
      M.module_version = M.module_version + 1
  end

@@ -1914,7 +1917,7 @@ return {
      rebalancing_is_in_progress = rebalancing_is_in_progress,
      recovery_wakeup = recovery_wakeup,
      call = storage_call,
-    cfg = storage_cfg,
+    cfg = function(cfg, uuid) return storage_cfg(cfg, uuid, false) end,
      info = storage_info,
      buckets_info = storage_buckets_info,
      buckets_count = storage_buckets_count,






More information about the Tarantool-patches mailing list