* [tarantool-patches] [PATCH 0/3] vshard reload mechanism
@ 2018-07-18 17:47 AKhatskevich
2018-07-18 17:47 ` [tarantool-patches] [PATCH 1/3] Add test on error during reconfigure AKhatskevich
` (2 more replies)
0 siblings, 3 replies; 16+ messages in thread
From: AKhatskevich @ 2018-07-18 17:47 UTC
To: v.shpilevoy, tarantool-patches
Issue1: https://github.com/tarantool/vshard/issues/112
Issue2: https://github.com/tarantool/vshard/issues/125
Branch: https://github.com/tarantool/vshard/tree/kh/gh-112-reload-mt-2
This patchset improves the vshard reload mechanism.
AKhatskevich (3):
Add test on error during reconfigure
Complete module reload
Introduce storage reload evolution
.travis.yml | 2 +-
rpm/prebuild.sh | 2 +
test/lua_libs/git_util.lua | 39 +++++++
test/lua_libs/util.lua | 36 +++++++
test/reload_evolution/storage.result | 184 +++++++++++++++++++++++++++++++++
test/reload_evolution/storage.test.lua | 64 ++++++++++++
test/reload_evolution/storage_1_a.lua | 144 ++++++++++++++++++++++++++
test/reload_evolution/storage_1_b.lua | 1 +
test/reload_evolution/storage_2_a.lua | 1 +
test/reload_evolution/storage_2_b.lua | 1 +
test/reload_evolution/suite.ini | 6 ++
test/reload_evolution/test.lua | 9 ++
test/router/reload.result | 159 ++++++++++++++++++++++++++++
test/router/reload.test.lua | 48 +++++++++
test/router/router.result | 36 ++++++-
test/router/router.test.lua | 10 ++
test/storage/reload.result | 68 ++++++++++++
test/storage/reload.test.lua | 23 +++++
test/storage/storage.result | 39 +++++++
test/storage/storage.test.lua | 12 +++
vshard/cfg.lua | 6 ++
vshard/error.lua | 5 +
vshard/replicaset.lua | 100 +++++++++++++-----
vshard/router/init.lua | 51 ++++++---
vshard/storage/init.lua | 65 +++++++++---
vshard/storage/reload_evolution.lua | 58 +++++++++++
vshard/util.lua | 20 ++++
27 files changed, 1133 insertions(+), 56 deletions(-)
create mode 100644 test/lua_libs/git_util.lua
create mode 100644 test/reload_evolution/storage.result
create mode 100644 test/reload_evolution/storage.test.lua
create mode 100755 test/reload_evolution/storage_1_a.lua
create mode 120000 test/reload_evolution/storage_1_b.lua
create mode 120000 test/reload_evolution/storage_2_a.lua
create mode 120000 test/reload_evolution/storage_2_b.lua
create mode 100644 test/reload_evolution/suite.ini
create mode 100644 test/reload_evolution/test.lua
create mode 100644 vshard/storage/reload_evolution.lua
--
2.14.1
* [tarantool-patches] [PATCH 1/3] Add test on error during reconfigure
2018-07-18 17:47 [tarantool-patches] [PATCH 0/3] vshard reload mechanism AKhatskevich
@ 2018-07-18 17:47 ` AKhatskevich
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
2018-07-18 17:47 ` [tarantool-patches] [PATCH 2/3] Complete module reload AKhatskevich
2018-07-18 17:47 ` [tarantool-patches] [PATCH 3/3] Introduce storage reload evolution AKhatskevich
2 siblings, 1 reply; 16+ messages in thread
From: AKhatskevich @ 2018-07-18 17:47 UTC
To: v.shpilevoy, tarantool-patches
In case the reconfigure process fails, the node should continue
working properly.
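A condensed sketch of the scenario the new tests exercise, using
the ERRINJ_CFG injection added by this patch:
    old_internal = table.copy(vshard.router.internal)
    vshard.router.internal.errinj.ERRINJ_CFG = true
    -- cfg fails at the injected point; internals must stay intact.
    _, err = pcall(vshard.router.cfg, cfg)
    vshard.router.internal.errinj.ERRINJ_CFG = false
    assert(util.has_same_fields(old_internal, vshard.router.internal))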
---
test/lua_libs/util.lua | 16 ++++++++++++++++
test/router/router.result | 33 +++++++++++++++++++++++++++++++++
test/router/router.test.lua | 10 ++++++++++
test/storage/storage.result | 39 +++++++++++++++++++++++++++++++++++++++
test/storage/storage.test.lua | 12 ++++++++++++
vshard/router/init.lua | 7 +++++++
vshard/storage/init.lua | 9 +++++++++
7 files changed, 126 insertions(+)
diff --git a/test/lua_libs/util.lua b/test/lua_libs/util.lua
index f2d3b48..aeb2342 100644
--- a/test/lua_libs/util.lua
+++ b/test/lua_libs/util.lua
@@ -69,9 +69,25 @@ local function wait_master(test_run, replicaset, master)
log.info('Slaves are connected to a master "%s"', master)
end
+-- Check that data has at least all fields as an ethalon.
+local function has_same_fields(ethalon, data)
+ assert(type(ethalon) == 'table' and type(data) == 'table')
+ local diff = {}
+ for k, v in pairs(ethalon) do
+ if v ~= data[k] then
+ table.insert(diff, k)
+ end
+ end
+ if #diff > 0 then
+ return false, diff
+ end
+ return true
+end
+
return {
check_error = check_error,
shuffle_masters = shuffle_masters,
collect_timeouts = collect_timeouts,
wait_master = wait_master,
+ has_same_fields = has_same_fields,
}
diff --git a/test/router/router.result b/test/router/router.result
index 15f4fd0..7ec3e15 100644
--- a/test/router/router.result
+++ b/test/router/router.result
@@ -1156,6 +1156,39 @@ util.check_error(vshard.router.cfg, non_dynamic_cfg)
---
- Non-dynamic option shard_index cannot be reconfigured
...
+-- Error during reconfigure process.
+vshard.router.route(1):callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+vshard.router.internal.errinj.ERRINJ_CFG = true
+---
+...
+old_internal = table.copy(vshard.router.internal)
+---
+...
+_, err = pcall(vshard.router.cfg, cfg)
+---
+...
+err:match('Error injection:.*')
+---
+- 'Error injection: cfg'
+...
+vshard.router.internal.errinj.ERRINJ_CFG = false
+---
+...
+util.has_same_fields(old_internal, vshard.router.internal)
+---
+- true
+...
+vshard.router.route(1):callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
_ = test_run:cmd("switch default")
---
...
diff --git a/test/router/router.test.lua b/test/router/router.test.lua
index 8006e5d..f36016e 100644
--- a/test/router/router.test.lua
+++ b/test/router/router.test.lua
@@ -444,6 +444,16 @@ non_dynamic_cfg = table.copy(cfg)
non_dynamic_cfg.shard_index = 'non_default_name'
util.check_error(vshard.router.cfg, non_dynamic_cfg)
+-- Error during reconfigure process.
+vshard.router.route(1):callro('echo', {'some_data'})
+vshard.router.internal.errinj.ERRINJ_CFG = true
+old_internal = table.copy(vshard.router.internal)
+_, err = pcall(vshard.router.cfg, cfg)
+err:match('Error injection:.*')
+vshard.router.internal.errinj.ERRINJ_CFG = false
+util.has_same_fields(old_internal, vshard.router.internal)
+vshard.router.route(1):callro('echo', {'some_data'})
+
_ = test_run:cmd("switch default")
test_run:drop_cluster(REPLICASET_2)
diff --git a/test/storage/storage.result b/test/storage/storage.result
index 4399ff0..ff07fe9 100644
--- a/test/storage/storage.result
+++ b/test/storage/storage.result
@@ -732,6 +732,45 @@ util.check_error(vshard.storage.cfg, non_dynamic_cfg, names.storage_1_a)
---
- Non-dynamic option bucket_count cannot be reconfigured
...
+-- Error during reconfigure process.
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+vshard.storage.internal.errinj.ERRINJ_CFG = true
+---
+...
+old_internal = table.copy(vshard.storage.internal)
+---
+...
+_, err = pcall(vshard.storage.cfg, cfg, names.storage_1_a)
+---
+...
+err:match('Error injection:.*')
+---
+- 'Error injection: cfg'
+...
+vshard.storage.internal.errinj.ERRINJ_CFG = false
+---
+...
+util.has_same_fields(old_internal, vshard.storage.internal)
+---
+- true
+...
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
_ = test_run:cmd("switch default")
---
...
diff --git a/test/storage/storage.test.lua b/test/storage/storage.test.lua
index 72564e1..04bb608 100644
--- a/test/storage/storage.test.lua
+++ b/test/storage/storage.test.lua
@@ -182,6 +182,18 @@ non_dynamic_cfg = table.copy(cfg)
non_dynamic_cfg.bucket_count = require('vshard.consts').DEFAULT_BUCKET_COUNT + 1
util.check_error(vshard.storage.cfg, non_dynamic_cfg, names.storage_1_a)
+-- Error during reconfigure process.
+_, rs = next(vshard.storage.internal.replicasets)
+rs:callro('echo', {'some_data'})
+vshard.storage.internal.errinj.ERRINJ_CFG = true
+old_internal = table.copy(vshard.storage.internal)
+_, err = pcall(vshard.storage.cfg, cfg, names.storage_1_a)
+err:match('Error injection:.*')
+vshard.storage.internal.errinj.ERRINJ_CFG = false
+util.has_same_fields(old_internal, vshard.storage.internal)
+_, rs = next(vshard.storage.internal.replicasets)
+rs:callro('echo', {'some_data'})
+
_ = test_run:cmd("switch default")
test_run:drop_cluster(REPLICASET_2)
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index 4531f3a..a143070 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -11,6 +11,7 @@ local M = rawget(_G, '__module_vshard_router')
if not M then
M = {
errinj = {
+ ERRINJ_CFG = false,
ERRINJ_FAILOVER_CHANGE_CFG = false,
ERRINJ_RELOAD = false,
ERRINJ_LONG_DISCOVERY = false,
@@ -486,6 +487,12 @@ local function router_cfg(cfg)
for k, v in pairs(cfg) do
log.info({[k] = v})
end
+ -- It is assumed that all possible errors during the cfg
+ -- process occur before this point.
+ -- This check should be placed as late as possible.
+ if M.errinj.ERRINJ_CFG then
+ error('Error injection: cfg')
+ end
box.cfg(cfg)
log.info("Box has been configured")
M.total_bucket_count = total_bucket_count
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index ff204a4..052e94f 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -33,6 +33,7 @@ if not M then
-- Bucket count stored on all replicasets.
total_bucket_count = 0,
errinj = {
+ ERRINJ_CFG = false,
ERRINJ_BUCKET_FIND_GARBAGE_DELAY = false,
ERRINJ_RELOAD = false,
ERRINJ_CFG_DELAY = false,
@@ -1560,6 +1561,14 @@ local function storage_cfg(cfg, this_replica_uuid)
local shard_index = cfg.shard_index
local collect_bucket_garbage_interval = cfg.collect_bucket_garbage_interval
local collect_lua_garbage = cfg.collect_lua_garbage
+
+ -- It is assumed that all possible errors during the cfg
+ -- process occur before this point.
+ -- This check should be placed as late as possible.
+ if M.errinj.ERRINJ_CFG then
+ error('Error injection: cfg')
+ end
+
--
-- Sync timeout is a special case - it must be updated before
-- all other options to allow a user to demote a master with
--
2.14.1
* [tarantool-patches] [PATCH 2/3] Complete module reload
2018-07-18 17:47 [tarantool-patches] [PATCH 0/3] vshard reload mechanism AKhatskevich
2018-07-18 17:47 ` [tarantool-patches] [PATCH 1/3] Add test on error during reconfigure AKhatskevich
@ 2018-07-18 17:47 ` AKhatskevich
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
2018-07-18 17:47 ` [tarantool-patches] [PATCH 3/3] Introduce storage reload evolution AKhatskevich
2 siblings, 1 reply; 16+ messages in thread
From: AKhatskevich @ 2018-07-18 17:47 UTC
To: v.shpilevoy, tarantool-patches
In case one needs to upgrade vshard to a new version, this commit
improves the reload mechanism to support a wider variety of
possible changes between two versions.
Changes:
* introduce cfg option `connection_outdate_delay`
* improve reload mechanism
* add `util.async_task` method, which runs a function after a
delay (see the sketch after this list)
* delete replicaset:rebind_connections method as it is replaced
with `rebind_replicasets` which updates all replicasets at once
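A sketch of `util.async_task` in use; the task runs in a separate
fiber, so the caller is not blocked:
    local log = require('log')
    local util = require('vshard.util')
    -- Logs the message roughly half a second later.
    util.async_task(0.5, log.info, 'half a second passed')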
Reload mechanism:
* reload all vshard modules
* create new `replicaset` and `replica` objects
* reuse old netbox connections in new replica objects if
possible
* update router/storage.internal table
* after `connection_outdate_delay` seconds, disable old instances
of `replicaset` and `replica` objects (example below)
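Example (a sketch; the option is optional and validated as a
non-negative number):
    cfg.connection_outdate_delay = 0.3
    vshard.router.cfg(cfg)
    -- Old replicaset/replica objects stay usable for 0.3 seconds
    -- after this call, then report OBJECT_IS_OUTDATED errors.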
Reload works for modules:
* vshard.router
* vshard.storage
Here is a module reload algorithm:
* old vshard is working
* delete old vshard src
* install new vshard
* call: package.loaded['vshard.router'] = nil
* call: old_router = vshard.router -- Save working router copy.
* call: vshard.router = require('vshard.router')
* if require fails: continue using old_router
* if require succeeds: use vshard.router
In case the reload process fails, the old router/storage module and
the old replicaset and replica objects continue working properly. If
the reload succeeds, all old objects are deprecated.
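The same algorithm as a minimal Lua sketch:
    local old_router = vshard.router -- Save working router copy.
    package.loaded['vshard.router'] = nil
    local ok, router = pcall(require, 'vshard.router')
    if ok then
        vshard.router = router
    else
        -- Reload failed; the old module keeps working.
        vshard.router = old_router
    end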
Extra changes:
* introduce MODULE_INTERNALS, which stores the name of the module's
internal data table in the global namespace (sketch below)
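The pattern, condensed from the router diff below:
    local MODULE_INTERNALS = '__module_vshard_router'
    local M = rawget(_G, MODULE_INTERNALS)
    if not M then
        -- First load: create the internals and pin them in _G
        -- so that they survive future reloads.
        M = {}
        rawset(_G, MODULE_INTERNALS, M)
    end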
Part of <shut git>112
---
test/router/reload.result | 159 +++++++++++++++++++++++++++++++++++++++++++
test/router/reload.test.lua | 48 +++++++++++++
test/router/router.result | 3 +-
test/storage/reload.result | 68 ++++++++++++++++++
test/storage/reload.test.lua | 23 +++++++
vshard/cfg.lua | 6 ++
vshard/error.lua | 5 ++
vshard/replicaset.lua | 100 ++++++++++++++++++++-------
vshard/router/init.lua | 44 ++++++++----
vshard/storage/init.lua | 45 ++++++++----
vshard/util.lua | 20 ++++++
11 files changed, 466 insertions(+), 55 deletions(-)
diff --git a/test/router/reload.result b/test/router/reload.result
index 47f3c2e..3fbbe6e 100644
--- a/test/router/reload.result
+++ b/test/router/reload.result
@@ -174,6 +174,165 @@ vshard.router.module_version()
check_reloaded()
---
...
+--
+-- Outdate old replicaset and replica objects.
+--
+rs = vshard.router.route(1)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+package.loaded["vshard.router"] = nil
+---
+...
+_ = require('vshard.router')
+---
+...
+-- Make sure the outdate async task has had CPU time.
+fiber.sleep(0.005)
+---
+...
+rs.callro(rs, 'echo', {'some_data'})
+---
+- null
+- type: ShardingError
+ name: OBJECT_IS_OUTDATED
+ message: Object is outdated after module reload/reconfigure. Use new instance.
+ code: 20
+...
+vshard.router = require('vshard.router')
+---
+...
+rs = vshard.router.route(1)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+-- Test `connection_outdate_delay`.
+old_connection_delay = cfg.connection_outdate_delay
+---
+...
+cfg.connection_outdate_delay = 0.3
+---
+...
+vshard.router.cfg(cfg)
+---
+...
+cfg.connection_outdate_delay = old_connection_delay
+---
+...
+vshard.router.internal.connection_outdate_delay = nil
+---
+...
+vshard.router = require('vshard.router')
+---
+...
+rs_new = vshard.router.route(1)
+---
+...
+rs_old = rs
+---
+...
+_, replica_old = next(rs_old.replicas)
+---
+...
+rs_new:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+-- Check old objects are still valid.
+rs_old:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+replica_old.conn ~= nil
+---
+- true
+...
+fiber.sleep(0.2)
+---
+...
+rs_old:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+replica_old.conn ~= nil
+---
+- true
+...
+replica_old.outdated == nil
+---
+- true
+...
+fiber.sleep(0.2)
+---
+...
+rs_old:callro('echo', {'some_data'})
+---
+- null
+- type: ShardingError
+ name: OBJECT_IS_OUTDATED
+ message: Object is outdated after module reload/reconfigure. Use new instance.
+ code: 20
+...
+replica_old.conn == nil
+---
+- true
+...
+replica_old.outdated == true
+---
+- true
+...
+rs_new:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+-- Error during reconfigure process.
+_ = vshard.router.route(1):callro('echo', {'some_data'})
+---
+...
+vshard.router.internal.errinj.ERRINJ_CFG = true
+---
+...
+old_internal = table.copy(vshard.router.internal)
+---
+...
+package.loaded["vshard.router"] = nil
+---
+...
+_, err = pcall(require, 'vshard.router')
+---
+...
+err:match('Error injection:.*')
+---
+- 'Error injection: cfg'
+...
+vshard.router.internal.errinj.ERRINJ_CFG = false
+---
+...
+util.has_same_fields(old_internal, vshard.router.internal)
+---
+- true
+...
+_ = vshard.router.route(1):callro('echo', {'some_data'})
+---
+...
test_run:switch('default')
---
- true
diff --git a/test/router/reload.test.lua b/test/router/reload.test.lua
index af2939d..8d2b2cf 100644
--- a/test/router/reload.test.lua
+++ b/test/router/reload.test.lua
@@ -86,6 +86,54 @@ _ = require('vshard.router')
vshard.router.module_version()
check_reloaded()
+--
+-- Outdate old replicaset and replica objects.
+--
+rs = vshard.router.route(1)
+rs:callro('echo', {'some_data'})
+package.loaded["vshard.router"] = nil
+_ = require('vshard.router')
+-- Make sure the outdate async task has had CPU time.
+fiber.sleep(0.005)
+rs.callro(rs, 'echo', {'some_data'})
+vshard.router = require('vshard.router')
+rs = vshard.router.route(1)
+rs:callro('echo', {'some_data'})
+-- Test `connection_outdate_delay`.
+old_connection_delay = cfg.connection_outdate_delay
+cfg.connection_outdate_delay = 0.3
+vshard.router.cfg(cfg)
+cfg.connection_outdate_delay = old_connection_delay
+vshard.router.internal.connection_outdate_delay = nil
+vshard.router = require('vshard.router')
+rs_new = vshard.router.route(1)
+rs_old = rs
+_, replica_old = next(rs_old.replicas)
+rs_new:callro('echo', {'some_data'})
+-- Check old objects are still valid.
+rs_old:callro('echo', {'some_data'})
+replica_old.conn ~= nil
+fiber.sleep(0.2)
+rs_old:callro('echo', {'some_data'})
+replica_old.conn ~= nil
+replica_old.outdated == nil
+fiber.sleep(0.2)
+rs_old:callro('echo', {'some_data'})
+replica_old.conn == nil
+replica_old.outdated == true
+rs_new:callro('echo', {'some_data'})
+
+-- Error during reconfigure process.
+_ = vshard.router.route(1):callro('echo', {'some_data'})
+vshard.router.internal.errinj.ERRINJ_CFG = true
+old_internal = table.copy(vshard.router.internal)
+package.loaded["vshard.router"] = nil
+_, err = pcall(require, 'vshard.router')
+err:match('Error injection:.*')
+vshard.router.internal.errinj.ERRINJ_CFG = false
+util.has_same_fields(old_internal, vshard.router.internal)
+_ = vshard.router.route(1):callro('echo', {'some_data'})
+
test_run:switch('default')
test_run:cmd('stop server router_1')
test_run:cmd('cleanup server router_1')
diff --git a/test/router/router.result b/test/router/router.result
index 7ec3e15..ab939ce 100644
--- a/test/router/router.result
+++ b/test/router/router.result
@@ -1024,11 +1024,10 @@ error_messages
- - Use replicaset:callro(...) instead of replicaset.callro(...)
- Use replicaset:connect_master(...) instead of replicaset.connect_master(...)
- Use replicaset:connect_replica(...) instead of replicaset.connect_replica(...)
- - Use replicaset:rebind_connections(...) instead of replicaset.rebind_connections(...)
- Use replicaset:down_replica_priority(...) instead of replicaset.down_replica_priority(...)
- Use replicaset:call(...) instead of replicaset.call(...)
- - Use replicaset:up_replica_priority(...) instead of replicaset.up_replica_priority(...)
- Use replicaset:connect(...) instead of replicaset.connect(...)
+ - Use replicaset:up_replica_priority(...) instead of replicaset.up_replica_priority(...)
- Use replicaset:callrw(...) instead of replicaset.callrw(...)
- Use replicaset:connect_all(...) instead of replicaset.connect_all(...)
...
diff --git a/test/storage/reload.result b/test/storage/reload.result
index 531d984..0281e27 100644
--- a/test/storage/reload.result
+++ b/test/storage/reload.result
@@ -174,6 +174,74 @@ vshard.storage.module_version()
check_reloaded()
---
...
+--
+-- Outdate old replicaset and replica objects.
+--
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+package.loaded["vshard.storage"] = nil
+---
+...
+_ = require('vshard.storage')
+---
+...
+rs.callro(rs, 'echo', {'some_data'})
+---
+- null
+- type: ShardingError
+ name: OBJECT_IS_OUTDATED
+ message: Object is outdated after module reload/reconfigure. Use new instance.
+ code: 20
+...
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+rs.callro(rs, 'echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+-- Error during reload process.
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+vshard.storage.internal.errinj.ERRINJ_CFG = true
+---
+...
+old_internal = table.copy(vshard.storage.internal)
+---
+...
+package.loaded["vshard.storage"] = nil
+---
+...
+_, err = pcall(require, 'vshard.storage')
+---
+...
+err:match('Error injection:.*')
+---
+- 'Error injection: cfg'
+...
+vshard.storage.internal.errinj.ERRINJ_CFG = false
+---
+...
+util.has_same_fields(old_internal, vshard.storage.internal)
+---
+- true
+...
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+_ = rs:callro('echo', {'some_data'})
+---
+...
test_run:switch('default')
---
- true
diff --git a/test/storage/reload.test.lua b/test/storage/reload.test.lua
index 64c3a60..b51a364 100644
--- a/test/storage/reload.test.lua
+++ b/test/storage/reload.test.lua
@@ -87,6 +87,29 @@ _ = require('vshard.storage')
vshard.storage.module_version()
check_reloaded()
+--
+-- Outdate old replicaset and replica objects.
+--
+_, rs = next(vshard.storage.internal.replicasets)
+package.loaded["vshard.storage"] = nil
+_ = require('vshard.storage')
+rs.callro(rs, 'echo', {'some_data'})
+_, rs = next(vshard.storage.internal.replicasets)
+rs.callro(rs, 'echo', {'some_data'})
+
+-- Error during reload process.
+_, rs = next(vshard.storage.internal.replicasets)
+rs:callro('echo', {'some_data'})
+vshard.storage.internal.errinj.ERRINJ_CFG = true
+old_internal = table.copy(vshard.storage.internal)
+package.loaded["vshard.storage"] = nil
+_, err = pcall(require, 'vshard.storage')
+err:match('Error injection:.*')
+vshard.storage.internal.errinj.ERRINJ_CFG = false
+util.has_same_fields(old_internal, vshard.storage.internal)
+_, rs = next(vshard.storage.internal.replicasets)
+_ = rs:callro('echo', {'some_data'})
+
test_run:switch('default')
test_run:drop_cluster(REPLICASET_2)
test_run:drop_cluster(REPLICASET_1)
diff --git a/vshard/cfg.lua b/vshard/cfg.lua
index d5429af..87d0fc8 100644
--- a/vshard/cfg.lua
+++ b/vshard/cfg.lua
@@ -217,6 +217,10 @@ local cfg_template = {
type = 'non-negative number', name = 'Sync timeout', is_optional = true,
default = consts.DEFAULT_SYNC_TIMEOUT
}},
+ {'connection_outdate_delay', {
+ type = 'non-negative number', name = 'Object outdate timeout',
+ is_optional = true, default = nil
+ }},
}
--
@@ -264,6 +268,8 @@ local function remove_non_box_options(cfg)
cfg.collect_bucket_garbage_interval = nil
cfg.collect_lua_garbage = nil
cfg.sync_timeout = nil
+ cfg.connection_outdate_delay = nil
end
return {
diff --git a/vshard/error.lua b/vshard/error.lua
index cf2f9d2..f79107b 100644
--- a/vshard/error.lua
+++ b/vshard/error.lua
@@ -100,6 +100,11 @@ local error_message_template = {
[19] = {
name = 'REPLICASET_IS_LOCKED',
msg = 'Replicaset is locked'
+ },
+ [20] = {
+ name = 'OBJECT_IS_OUTDATED',
+ msg = 'Object is outdated after module reload/reconfigure. ' ..
+ 'Use new instance.'
}
}
diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
index 99f59aa..ec6e95b 100644
--- a/vshard/replicaset.lua
+++ b/vshard/replicaset.lua
@@ -48,7 +48,8 @@ local lerror = require('vshard.error')
local fiber = require('fiber')
local luri = require('uri')
local ffi = require('ffi')
-local gsc = require('vshard.util').generate_self_checker
+local util = require('vshard.util')
+local gsc = util.generate_self_checker
--
-- on_connect() trigger for net.box
@@ -337,27 +338,39 @@ local function replicaset_tostring(replicaset)
master)
end
+local outdate_replicasets
--
--- Rebind connections of old replicas to new ones.
+-- Copy netbox connections from old replica objects to new ones
+-- and outdate the old objects.
+-- @param replicasets New replicasets.
+-- @param old_replicasets Replicasets and replicas to be outdated.
+-- @param outdate_delay Number of seconds; delay after which the
+-- old objects are outdated.
--
-local function replicaset_rebind_connections(replicaset)
- for _, replica in pairs(replicaset.replicas) do
- local old_replica = replica.old_replica
- if old_replica then
- local conn = old_replica.conn
- replica.conn = conn
- replica.down_ts = old_replica.down_ts
- replica.net_timeout = old_replica.net_timeout
- replica.net_sequential_ok = old_replica.net_sequential_ok
- replica.net_sequential_fail = old_replica.net_sequential_fail
- if conn then
- conn.replica = replica
- conn.replicaset = replicaset
- old_replica.conn = nil
+local function rebind_replicasets(replicasets, old_replicasets, outdate_delay)
+ for replicaset_uuid, replicaset in pairs(replicasets) do
+ local old_replicaset = old_replicasets and
+ old_replicasets[replicaset_uuid]
+ for replica_uuid, replica in pairs(replicaset.replicas) do
+ local old_replica = old_replicaset and
+ old_replicaset.replicas[replica_uuid]
+ if old_replica then
+ local conn = old_replica.conn
+ replica.conn = conn
+ replica.down_ts = old_replica.down_ts
+ replica.net_timeout = old_replica.net_timeout
+ replica.net_sequential_ok = old_replica.net_sequential_ok
+ replica.net_sequential_fail = old_replica.net_sequential_fail
+ if conn then
+ conn.replica = replica
+ conn.replicaset = replicaset
+ end
end
- replica.old_replica = nil
end
end
+ if old_replicasets then
+ util.async_task(outdate_delay, outdate_replicasets, old_replicasets)
+ end
end
--
@@ -369,7 +382,6 @@ local replicaset_mt = {
connect_master = replicaset_connect_master;
connect_all = replicaset_connect_all;
connect_replica = replicaset_connect_to_replica;
- rebind_connections = replicaset_rebind_connections;
down_replica_priority = replicaset_down_replica_priority;
up_replica_priority = replicaset_up_replica_priority;
call = replicaset_master_call;
@@ -412,6 +424,49 @@ for name, func in pairs(replica_mt.__index) do
end
replica_mt.__index = index
+--
+-- Meta-methods of outdated objects.
+-- They redefine only the methods from the corresponding
+-- metatables, so the user can still access data fields of the
+-- old objects.
+--
+local function outdated_warning(...)
+ return nil, lerror.vshard(lerror.code.OBJECT_IS_OUTDATED)
+end
+
+local outdated_replicaset_mt = {
+ __index = {
+ outdated = true
+ }
+}
+for fname, func in pairs(replicaset_mt.__index) do
+ outdated_replicaset_mt.__index[fname] = outdated_warning
+end
+
+local outdated_replica_mt = {
+ __index = {
+ outdated = true
+ }
+}
+for fname, func in pairs(replica_mt.__index) do
+ outdated_replica_mt.__index[fname] = outdated_warning
+end
+
+--
+-- Outdate replicaset and replica objects:
+-- * Set outdated metatables.
+-- * Remove connections.
+--
+outdate_replicasets = function(replicasets)
+ for _, replicaset in pairs(replicasets) do
+ setmetatable(replicaset, outdated_replicaset_mt)
+ for _, replica in pairs(replicaset.replicas) do
+ setmetatable(replica, outdated_replica_mt)
+ replica.conn = nil
+ end
+ end
+ log.info('Old replicaset and replica objects are outdated.')
+end
+
--
-- Calculate for each replicaset its etalon bucket count.
-- Iterative algorithm is used to learn the best balance in a
@@ -503,7 +558,7 @@ end
--
-- Update/build replicasets from configuration
--
-local function buildall(sharding_cfg, old_replicasets)
+local function buildall(sharding_cfg)
local new_replicasets = {}
local weights = sharding_cfg.weights
local zone = sharding_cfg.zone
@@ -515,8 +570,6 @@ local function buildall(sharding_cfg, old_replicasets)
end
local curr_ts = fiber.time()
for replicaset_uuid, replicaset in pairs(sharding_cfg.sharding) do
- local old_replicaset = old_replicasets and
- old_replicasets[replicaset_uuid]
local new_replicaset = setmetatable({
replicas = {},
uuid = replicaset_uuid,
@@ -526,8 +579,6 @@ local function buildall(sharding_cfg, old_replicasets)
}, replicaset_mt)
local priority_list = {}
for replica_uuid, replica in pairs(replicaset.replicas) do
- local old_replica = old_replicaset and
- old_replicaset.replicas[replica_uuid]
-- The old replica is saved in the new object to
-- rebind its connection at the end of a
-- router/storage reconfiguration.
@@ -535,7 +586,7 @@ local function buildall(sharding_cfg, old_replicasets)
uri = replica.uri, name = replica.name, uuid = replica_uuid,
zone = replica.zone, net_timeout = consts.CALL_TIMEOUT_MIN,
net_sequential_ok = 0, net_sequential_fail = 0,
- down_ts = curr_ts, old_replica = old_replica,
+ down_ts = curr_ts,
}, replica_mt)
new_replicaset.replicas[replica_uuid] = new_replica
if replica.master then
@@ -596,4 +647,5 @@ return {
buildall = buildall,
calculate_etalon_balance = cluster_calculate_etalon_balance,
wait_masters_connect = wait_masters_connect,
+ rebind_replicasets = rebind_replicasets,
}
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index a143070..a8f6bbc 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -1,5 +1,17 @@
local log = require('log')
local lfiber = require('fiber')
+
+local MODULE_INTERNALS = '__module_vshard_router'
+-- Reload dependencies in case this module is reloaded manually.
+if rawget(_G, MODULE_INTERNALS) then
+ local vshard_modules = {
+ 'vshard.consts', 'vshard.error', 'vshard.cfg',
+ 'vshard.hash', 'vshard.replicaset', 'vshard.util',
+ }
+ for _, module in pairs(vshard_modules) do
+ package.loaded[module] = nil
+ end
+end
local consts = require('vshard.consts')
local lerror = require('vshard.error')
local lcfg = require('vshard.cfg')
@@ -7,7 +19,7 @@ local lhash = require('vshard.hash')
local lreplicaset = require('vshard.replicaset')
local util = require('vshard.util')
-local M = rawget(_G, '__module_vshard_router')
+local M = rawget(_G, MODULE_INTERNALS)
if not M then
M = {
errinj = {
@@ -16,6 +28,8 @@ if not M then
ERRINJ_RELOAD = false,
ERRINJ_LONG_DISCOVERY = false,
},
+ -- Time to outdate old objects on reload.
+ connection_outdate_delay = nil,
-- Bucket map cache.
route_map = {},
-- All known replicasets used for bucket re-balancing
@@ -479,12 +493,13 @@ local function router_cfg(cfg)
else
log.info('Starting router reconfiguration')
end
- local new_replicasets = lreplicaset.buildall(cfg, M.replicasets)
+ local new_replicasets = lreplicaset.buildall(cfg)
local total_bucket_count = cfg.bucket_count
local collect_lua_garbage = cfg.collect_lua_garbage
- lcfg.remove_non_box_options(cfg)
+ local box_cfg = table.deepcopy(cfg)
+ lcfg.remove_non_box_options(box_cfg)
log.info("Calling box.cfg()...")
- for k, v in pairs(cfg) do
+ for k, v in pairs(box_cfg) do
log.info({[k] = v})
end
-- It is assumed that all possible errors during the cfg
@@ -493,18 +508,18 @@ local function router_cfg(cfg)
if M.errinj.ERRINJ_CFG then
error('Error injection: cfg')
end
- box.cfg(cfg)
+ box.cfg(box_cfg)
log.info("Box has been configured")
+ M.connection_outdate_delay = cfg.connection_outdate_delay
M.total_bucket_count = total_bucket_count
M.collect_lua_garbage = collect_lua_garbage
- M.replicasets = new_replicasets
M.current_cfg = new_cfg
-- Move connections from an old configuration to a new one.
-- It must be done with no yields to prevent usage both of not
-- fully moved old replicasets, and not fully built new ones.
- for _, replicaset in pairs(new_replicasets) do
- replicaset:rebind_connections()
- end
+ lreplicaset.rebind_replicasets(new_replicasets, M.replicasets,
+ M.connection_outdate_delay)
+ M.replicasets = new_replicasets
-- Now the new replicasets are fully built. Can establish
-- connections and yield.
for _, replicaset in pairs(new_replicasets) do
@@ -793,15 +808,16 @@ end
-- About functions, saved in M, and reloading see comment in
-- storage/init.lua.
--
-M.discovery_f = discovery_f
-M.failover_f = failover_f
-
-if not rawget(_G, '__module_vshard_router') then
- rawset(_G, '__module_vshard_router', M)
+if not rawget(_G, MODULE_INTERNALS) then
+ rawset(_G, MODULE_INTERNALS, M)
else
+ router_cfg(M.current_cfg)
M.module_version = M.module_version + 1
end
+M.discovery_f = discovery_f
+M.failover_f = failover_f
+
return {
cfg = router_cfg;
info = router_info;
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 052e94f..bf560e6 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -2,20 +2,34 @@ local log = require('log')
local luri = require('uri')
local lfiber = require('fiber')
local netbox = require('net.box') -- for net.box:self()
+local trigger = require('internal.trigger')
+
+local MODULE_INTERNALS = '__module_vshard_storage'
+-- Reload dependencies in case this module is reloaded manually.
+if rawget(_G, MODULE_INTERNALS) then
+ local vshard_modules = {
+ 'vshard.consts', 'vshard.error', 'vshard.cfg',
+ 'vshard.replicaset', 'vshard.util',
+ }
+ for _, module in pairs(vshard_modules) do
+ package.loaded[module] = nil
+ end
+end
local consts = require('vshard.consts')
local lerror = require('vshard.error')
-local util = require('vshard.util')
local lcfg = require('vshard.cfg')
local lreplicaset = require('vshard.replicaset')
-local trigger = require('internal.trigger')
+local util = require('vshard.util')
-local M = rawget(_G, '__module_vshard_storage')
+local M = rawget(_G, MODULE_INTERNALS)
if not M then
--
-- The module is loaded for the first time.
--
M = {
---------------- Common module attributes ----------------
+ -- The last passed configuration.
+ current_cfg = nil,
--
-- All known replicasets used for bucket re-balancing.
-- See format in replicaset.lua.
@@ -1497,7 +1511,7 @@ local function storage_cfg(cfg, this_replica_uuid)
local this_replicaset
local this_replica
- local new_replicasets = lreplicaset.buildall(cfg, M.replicasets)
+ local new_replicasets = lreplicaset.buildall(cfg)
local min_master
for rs_uuid, rs in pairs(new_replicasets) do
for replica_uuid, replica in pairs(rs.replicas) do
@@ -1576,7 +1590,6 @@ local function storage_cfg(cfg, this_replica_uuid)
--
local old_sync_timeout = M.sync_timeout
M.sync_timeout = cfg.sync_timeout
- lcfg.remove_non_box_options(cfg)
if was_master and not is_master then
local_on_master_disable_prepare()
@@ -1585,7 +1598,9 @@ local function storage_cfg(cfg, this_replica_uuid)
local_on_master_enable_prepare()
end
- local ok, err = pcall(box.cfg, cfg)
+ local box_cfg = table.deepcopy(cfg)
+ lcfg.remove_non_box_options(box_cfg)
+ local ok, err = pcall(box.cfg, box_cfg)
while M.errinj.ERRINJ_CFG_DELAY do
lfiber.sleep(0.01)
end
@@ -1604,10 +1619,8 @@ local function storage_cfg(cfg, this_replica_uuid)
local uri = luri.parse(this_replica.uri)
box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password)
+ lreplicaset.rebind_replicasets(new_replicasets, M.replicasets)
M.replicasets = new_replicasets
- for _, replicaset in pairs(new_replicasets) do
- replicaset:rebind_connections()
- end
M.this_replicaset = this_replicaset
M.this_replica = this_replica
M.total_bucket_count = total_bucket_count
@@ -1846,6 +1859,14 @@ end
-- restarted (or is restarted from M.background_f, which is not
-- changed) and continues use old func1 and func2.
--
+
+if not rawget(_G, MODULE_INTERNALS) then
+ rawset(_G, MODULE_INTERNALS, M)
+else
+ storage_cfg(M.current_cfg, M.this_replica.uuid)
+ M.module_version = M.module_version + 1
+end
+
M.recovery_f = recovery_f
M.collect_garbage_f = collect_garbage_f
M.rebalancer_f = rebalancer_f
@@ -1861,12 +1882,6 @@ M.rebalancer_build_routes = rebalancer_build_routes
M.rebalancer_calculate_metrics = rebalancer_calculate_metrics
M.cached_find_sharded_spaces = find_sharded_spaces
-if not rawget(_G, '__module_vshard_storage') then
- rawset(_G, '__module_vshard_storage', M)
-else
- M.module_version = M.module_version + 1
-end
-
return {
sync = sync,
bucket_force_create = bucket_force_create,
diff --git a/vshard/util.lua b/vshard/util.lua
index ce79930..fb875ce 100644
--- a/vshard/util.lua
+++ b/vshard/util.lua
@@ -100,9 +100,29 @@ end
-- Update latest versions of function
M.reloadable_fiber_f = reloadable_fiber_f
+local function sync_task(delay, task, ...)
+ if delay then
+ fiber.sleep(delay)
+ end
+ task(...)
+end
+
+--
+-- Run a function in a new fiber without blocking the current
+-- one.
+-- @param delay Delay in seconds before the task is executed.
+-- @param task Function to be executed.
+-- @param ... Arguments to be passed to `task`.
+--
+local function async_task(delay, task, ...)
+ assert(delay == nil or type(delay) == 'number')
+ fiber.create(sync_task, delay, task, ...)
+end
+
return {
tuple_extract_key = tuple_extract_key,
reloadable_fiber_f = reloadable_fiber_f,
generate_self_checker = generate_self_checker,
+ async_task = async_task,
internal = M,
}
--
2.14.1
* [tarantool-patches] [PATCH 3/3] Introduce storage reload evolution
2018-07-18 17:47 [tarantool-patches] [PATCH 0/3] vshard reload mechanism AKhatskevich
2018-07-18 17:47 ` [tarantool-patches] [PATCH 1/3] Add test on error during reconfigure AKhatskevich
2018-07-18 17:47 ` [tarantool-patches] [PATCH 2/3] Complete module reload AKhatskevich
@ 2018-07-18 17:47 ` AKhatskevich
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
2 siblings, 1 reply; 16+ messages in thread
From: AKhatskevich @ 2018-07-18 17:47 UTC
To: v.shpilevoy, tarantool-patches
Changes:
1. Introduce storage reload evolution.
2. Set up cross-version reload testing.
1:
This mechanism updates Lua objects on reload in case they were
changed in a new vshard.storage version.
Starting with this commit, any change to vshard.storage.M has to be
reflected in vshard.storage.reload_evolution to guarantee a correct
reload.
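For example, a migration step might look like this (the field name
is hypothetical; real steps are appended to the `migrations` array
in vshard/storage/reload_evolution.lua):
    migrations[#migrations + 1] = function(M)
        -- Hypothetical: a newer version added a counter to M.
        if M.bucket_gc_count == nil then
            M.bucket_gc_count = 0
        end
    end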
2:
The testing uses git infrastructure and is performed in the following
way:
1. Copy old version of vshard to a temp folder.
2. Run vshard on this code.
3. Checkout the latest version of the vshard sources.
4. Reload vshard storage.
5. Make sure it works (Perform simple tests).
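The version juggling is done with the new test/lua_libs/git_util.lua
helper, roughly as follows (condensed from the test below):
    evolution_log = git_util.log_hashes('', '',
        'vshard/storage/reload_evolution.lua', util.SOURCEDIR)
    -- Start the cluster one commit before reload_evolution appeared.
    git_util.exec_cmd('', 'checkout',
        evolution_log[#evolution_log] .. '~1', '', vshard_copy_path)
    -- Later, check out the latest commit and reload the storage.
    git_util.exec_cmd('', 'checkout', evolution_log[1], '', vshard_copy_path)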
Notes:
* this patch contains some legacy-driven decisions:
1. The SOURCEDIR path is retrieved differently in case of a
packpack build.
2. The git directory in the `reload_evolution/storage` test
is copied in a way that works on CentOS 7 and with the `ro`
mode of SOURCEDIR.
Closes <shut git>112 125
---
.travis.yml | 2 +-
rpm/prebuild.sh | 2 +
test/lua_libs/git_util.lua | 39 +++++++
test/lua_libs/util.lua | 20 ++++
test/reload_evolution/storage.result | 184 +++++++++++++++++++++++++++++++++
test/reload_evolution/storage.test.lua | 64 ++++++++++++
test/reload_evolution/storage_1_a.lua | 144 ++++++++++++++++++++++++++
test/reload_evolution/storage_1_b.lua | 1 +
test/reload_evolution/storage_2_a.lua | 1 +
test/reload_evolution/storage_2_b.lua | 1 +
test/reload_evolution/suite.ini | 6 ++
test/reload_evolution/test.lua | 9 ++
vshard/storage/init.lua | 11 ++
vshard/storage/reload_evolution.lua | 58 +++++++++++
14 files changed, 541 insertions(+), 1 deletion(-)
create mode 100644 test/lua_libs/git_util.lua
create mode 100644 test/reload_evolution/storage.result
create mode 100644 test/reload_evolution/storage.test.lua
create mode 100755 test/reload_evolution/storage_1_a.lua
create mode 120000 test/reload_evolution/storage_1_b.lua
create mode 120000 test/reload_evolution/storage_2_a.lua
create mode 120000 test/reload_evolution/storage_2_b.lua
create mode 100644 test/reload_evolution/suite.ini
create mode 100644 test/reload_evolution/test.lua
create mode 100644 vshard/storage/reload_evolution.lua
diff --git a/.travis.yml b/.travis.yml
index 54bfe44..eff4a51 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,7 +41,7 @@ env:
script:
- git describe --long
- git clone https://github.com/packpack/packpack.git packpack
- - packpack/packpack
+ - packpack/packpack -e PACKPACK_GIT_SOURCEDIR=/source/
before_deploy:
- ls -l build/
diff --git a/rpm/prebuild.sh b/rpm/prebuild.sh
index 768b22b..554032b 100755
--- a/rpm/prebuild.sh
+++ b/rpm/prebuild.sh
@@ -1 +1,3 @@
curl -s https://packagecloud.io/install/repositories/tarantool/1_9/script.rpm.sh | sudo bash
+sudo yum -y install python-devel python-pip
+sudo pip install tarantool msgpack
diff --git a/test/lua_libs/git_util.lua b/test/lua_libs/git_util.lua
new file mode 100644
index 0000000..e2c17d0
--- /dev/null
+++ b/test/lua_libs/git_util.lua
@@ -0,0 +1,39 @@
+--
+-- Lua bridge for some of the git commands.
+--
+local os = require('os')
+
+local temp_file = 'some_strange_rare_unique_file_name_for_git_util'
+local function exec_cmd(options, cmd, args, files, dir, fout)
+ files = files or ''
+ options = options or ''
+ args = args or ''
+ local shell_cmd
+ shell_cmd = string.format('git %s %s %s %s', options, cmd, args, files)
+ if fout then
+ shell_cmd = shell_cmd .. ' >' .. fout
+ end
+ if dir then
+ shell_cmd = string.format('cd %s && %s', dir, shell_cmd)
+ end
+ local res = os.execute(shell_cmd)
+ assert(res == 0, 'Git cmd error: ' .. res)
+end
+
+local function log_hashes(options, args, files, dir)
+ args = args .. " --format='%h'"
+ local local_temp_file = string.format('%s/%s', os.getenv('PWD'), temp_file)
+ exec_cmd(options, 'log', args, files, dir, local_temp_file)
+ local lines = {}
+ for line in io.lines(local_temp_file) do
+ table.insert(lines, line)
+ end
+ os.remove(local_temp_file)
+ return lines
+end
+
+
+return {
+ exec_cmd = exec_cmd,
+ log_hashes = log_hashes
+}
diff --git a/test/lua_libs/util.lua b/test/lua_libs/util.lua
index aeb2342..108510e 100644
--- a/test/lua_libs/util.lua
+++ b/test/lua_libs/util.lua
@@ -1,5 +1,6 @@
local fiber = require('fiber')
local log = require('log')
+local fio = require('fio')
local function check_error(func, ...)
local pstatus, status, err = pcall(func, ...)
@@ -84,10 +85,29 @@ local function has_same_fields(ethalon, data)
return true
end
+-- Git directory of the project. Used in evolution tests to
+-- fetch old versions of vshard.
+local SOURCEDIR = os.getenv('PACKPACK_GIT_SOURCEDIR')
+if not SOURCEDIR then
+ SOURCEDIR = os.getenv('SOURCEDIR')
+end
+if not SOURCEDIR then
+ local script_path = debug.getinfo(1).source:match("@?(.*/)")
+ script_path = fio.abspath(script_path)
+ SOURCEDIR = fio.abspath(script_path .. '/../../../')
+end
+
+local BUILDDIR = os.getenv('BUILDDIR')
+if not BUILDDIR then
+ BUILDDIR = SOURCEDIR
+end
+
return {
check_error = check_error,
shuffle_masters = shuffle_masters,
collect_timeouts = collect_timeouts,
wait_master = wait_master,
has_same_fields = has_same_fields,
+ SOURCEDIR = SOURCEDIR,
+ BUILDDIR = BUILDDIR,
}
diff --git a/test/reload_evolution/storage.result b/test/reload_evolution/storage.result
new file mode 100644
index 0000000..2cf21fd
--- /dev/null
+++ b/test/reload_evolution/storage.result
@@ -0,0 +1,184 @@
+test_run = require('test_run').new()
+---
+...
+git_util = require('git_util')
+---
+...
+util = require('util')
+---
+...
+vshard_copy_path = util.BUILDDIR .. '/test/var/vshard_git_tree_copy'
+---
+...
+evolution_log = git_util.log_hashes('', '', 'vshard/storage/reload_evolution.lua', util.SOURCEDIR)
+---
+...
+-- Clean up the directory after a previous build.
+_ = os.execute('rm -rf ' .. vshard_copy_path)
+---
+...
+-- 1. `git worktree` cannot be used because PACKPACK mounts
+-- `/source/` in `ro` mode.
+-- 2. Plain `cp -rf` cannot be used due to slightly different
+-- behavior on CentOS 7.
+_ = os.execute('mkdir ' .. vshard_copy_path)
+---
+...
+_ = os.execute("cd " .. util.SOURCEDIR .. ' && cp -rf `ls -A --ignore=build` ' .. vshard_copy_path)
+---
+...
+-- Check out the commit preceding the first reload_evolution one.
+git_util.exec_cmd('', 'checkout', '-f', '', vshard_copy_path)
+---
+...
+git_util.exec_cmd('', 'checkout', evolution_log[#evolution_log] .. '~1', '', vshard_copy_path)
+---
+...
+REPLICASET_1 = { 'storage_1_a', 'storage_1_b' }
+---
+...
+REPLICASET_2 = { 'storage_2_a', 'storage_2_b' }
+---
+...
+test_run:create_cluster(REPLICASET_1, 'reload_evolution')
+---
+...
+test_run:create_cluster(REPLICASET_2, 'reload_evolution')
+---
+...
+util = require('util')
+---
+...
+util.wait_master(test_run, REPLICASET_1, 'storage_1_a')
+---
+...
+util.wait_master(test_run, REPLICASET_2, 'storage_2_a')
+---
+...
+test_run:switch('storage_1_a')
+---
+- true
+...
+vshard.storage.internal.reload_evolution_version
+---
+- null
+...
+vshard.storage.bucket_force_create(1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
+---
+- true
+...
+box.space.customer:insert({1, 1, 'customer_name'})
+---
+- [1, 1, 'customer_name']
+...
+test_run:switch('storage_2_a')
+---
+- true
+...
+fiber = require('fiber')
+---
+...
+vshard.storage.bucket_force_create(vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
+---
+- true
+...
+while test_run:grep_log('storage_2_a', 'The cluster is balanced ok') == nil do vshard.storage.rebalancer_wakeup() fiber.sleep(0.1) end
+---
+...
+test_run:switch('default')
+---
+- true
+...
+git_util.exec_cmd('', 'checkout', evolution_log[1], '', vshard_copy_path)
+---
+...
+test_run:switch('storage_1_a')
+---
+- true
+...
+package.loaded["vshard.storage"] = nil
+---
+...
+vshard.storage = require("vshard.storage")
+---
+...
+test_run:grep_log('storage_1_a', 'vshard.storage.reload_evolution: upgraded to') ~= nil
+---
+- true
+...
+vshard.storage.internal.reload_evolution_version
+---
+- 1
+...
+-- Make sure storage operates well.
+vshard.storage.bucket_force_drop(2)
+---
+- true
+...
+vshard.storage.bucket_force_create(2)
+---
+- true
+...
+vshard.storage.buckets_info()[2]
+---
+- status: active
+ id: 2
+...
+vshard.storage.call(1, 'read', 'customer_lookup', {1})
+---
+- true
+- accounts: []
+ customer_id: 1
+ name: customer_name
+...
+vshard.storage.bucket_send(1, replicaset2_uuid)
+---
+- true
+...
+vshard.storage.garbage_collector_wakeup()
+---
+...
+fiber = require('fiber')
+---
+...
+while box.space._bucket:get({1}) do fiber.sleep(0.01) end
+---
+...
+test_run:switch('storage_2_a')
+---
+- true
+...
+vshard.storage.bucket_send(1, replicaset1_uuid)
+---
+- true
+...
+test_run:switch('storage_1_a')
+---
+- true
+...
+vshard.storage.call(1, 'read', 'customer_lookup', {1})
+---
+- true
+- accounts: []
+ customer_id: 1
+ name: customer_name
+...
+-- Check info() does not fail.
+vshard.storage.info() ~= nil
+---
+- true
+...
+test_run:switch('default')
+---
+- true
+...
+test_run:drop_cluster(REPLICASET_2)
+---
+...
+test_run:drop_cluster(REPLICASET_1)
+---
+...
+test_run:cmd('clear filter')
+---
+- true
+...
diff --git a/test/reload_evolution/storage.test.lua b/test/reload_evolution/storage.test.lua
new file mode 100644
index 0000000..fc1bd0c
--- /dev/null
+++ b/test/reload_evolution/storage.test.lua
@@ -0,0 +1,64 @@
+test_run = require('test_run').new()
+
+git_util = require('git_util')
+util = require('util')
+vshard_copy_path = util.BUILDDIR .. '/test/var/vshard_git_tree_copy'
+evolution_log = git_util.log_hashes('', '', 'vshard/storage/reload_evolution.lua', util.SOURCEDIR)
+-- Clean up the directory after a previous build.
+_ = os.execute('rm -rf ' .. vshard_copy_path)
+-- 1. `git worktree` cannot be used because PACKPACK mounts
+-- `/source/` in `ro` mode.
+-- 2. Plain `cp -rf` cannot be used due to slightly different
+-- behavior on CentOS 7.
+_ = os.execute('mkdir ' .. vshard_copy_path)
+_ = os.execute("cd " .. util.SOURCEDIR .. ' && cp -rf `ls -A --ignore=build` ' .. vshard_copy_path)
+-- Check out the commit preceding the first reload_evolution one.
+git_util.exec_cmd('', 'checkout', '-f', '', vshard_copy_path)
+git_util.exec_cmd('', 'checkout', evolution_log[#evolution_log] .. '~1', '', vshard_copy_path)
+
+REPLICASET_1 = { 'storage_1_a', 'storage_1_b' }
+REPLICASET_2 = { 'storage_2_a', 'storage_2_b' }
+test_run:create_cluster(REPLICASET_1, 'reload_evolution')
+test_run:create_cluster(REPLICASET_2, 'reload_evolution')
+util = require('util')
+util.wait_master(test_run, REPLICASET_1, 'storage_1_a')
+util.wait_master(test_run, REPLICASET_2, 'storage_2_a')
+
+test_run:switch('storage_1_a')
+vshard.storage.internal.reload_evolution_version
+vshard.storage.bucket_force_create(1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
+box.space.customer:insert({1, 1, 'customer_name'})
+
+test_run:switch('storage_2_a')
+fiber = require('fiber')
+vshard.storage.bucket_force_create(vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
+while test_run:grep_log('storage_2_a', 'The cluster is balanced ok') == nil do vshard.storage.rebalancer_wakeup() fiber.sleep(0.1) end
+
+test_run:switch('default')
+git_util.exec_cmd('', 'checkout', evolution_log[1], '', vshard_copy_path)
+
+test_run:switch('storage_1_a')
+package.loaded["vshard.storage"] = nil
+vshard.storage = require("vshard.storage")
+test_run:grep_log('storage_1_a', 'vshard.storage.reload_evolution: upgraded to') ~= nil
+vshard.storage.internal.reload_evolution_version
+-- Make sure storage operates well.
+vshard.storage.bucket_force_drop(2)
+vshard.storage.bucket_force_create(2)
+vshard.storage.buckets_info()[2]
+vshard.storage.call(1, 'read', 'customer_lookup', {1})
+vshard.storage.bucket_send(1, replicaset2_uuid)
+vshard.storage.garbage_collector_wakeup()
+fiber = require('fiber')
+while box.space._bucket:get({1}) do fiber.sleep(0.01) end
+test_run:switch('storage_2_a')
+vshard.storage.bucket_send(1, replicaset1_uuid)
+test_run:switch('storage_1_a')
+vshard.storage.call(1, 'read', 'customer_lookup', {1})
+-- Check info() does not fail.
+vshard.storage.info() ~= nil
+
+test_run:switch('default')
+test_run:drop_cluster(REPLICASET_2)
+test_run:drop_cluster(REPLICASET_1)
+test_run:cmd('clear filter')
diff --git a/test/reload_evolution/storage_1_a.lua b/test/reload_evolution/storage_1_a.lua
new file mode 100755
index 0000000..3e03f8f
--- /dev/null
+++ b/test/reload_evolution/storage_1_a.lua
@@ -0,0 +1,144 @@
+#!/usr/bin/env tarantool
+
+require('strict').on()
+
+-- Get instance name
+local fio = require('fio')
+local NAME = fio.basename(arg[0], '.lua')
+local fiber = require('fiber')
+local util = require('util')
+
+-- Run one storage on a different vshard version.
+-- To do that, the vshard src should be placed in
+-- BUILDDIR/test/var/vshard_git_tree_copy/.
+if NAME == 'storage_1_a' then
+ local script_path = debug.getinfo(1).source:match("@?(.*/)")
+ vshard_copy = util.BUILDDIR .. '/test/var/vshard_git_tree_copy'
+ package.path = string.format(
+ '%s/?.lua;%s/?/init.lua;%s',
+ vshard_copy, vshard_copy, package.path
+ )
+end
+
+-- Check if we are running under test-run
+if os.getenv('ADMIN') then
+ test_run = require('test_run').new()
+ require('console').listen(os.getenv('ADMIN'))
+end
+
+-- Call a configuration provider
+cfg = require('localcfg')
+-- Name to uuid map
+names = {
+ ['storage_1_a'] = '8a274925-a26d-47fc-9e1b-af88ce939412',
+ ['storage_1_b'] = '3de2e3e1-9ebe-4d0d-abb1-26d301b84633',
+ ['storage_2_a'] = '1e02ae8a-afc0-4e91-ba34-843a356b8ed7',
+ ['storage_2_b'] = '001688c3-66f8-4a31-8e19-036c17d489c2',
+}
+
+replicaset1_uuid = 'cbf06940-0790-498b-948d-042b62cf3d29'
+replicaset2_uuid = 'ac522f65-aa94-4134-9f64-51ee384f1a54'
+replicasets = {replicaset1_uuid, replicaset2_uuid}
+
+-- Start the database with sharding
+vshard = require('vshard')
+vshard.storage.cfg(cfg, names[NAME])
+
+box.once("testapp:schema:1", function()
+ local customer = box.schema.space.create('customer')
+ customer:format({
+ {'customer_id', 'unsigned'},
+ {'bucket_id', 'unsigned'},
+ {'name', 'string'},
+ })
+ customer:create_index('customer_id', {parts = {'customer_id'}})
+ customer:create_index('bucket_id', {parts = {'bucket_id'}, unique = false})
+
+ local account = box.schema.space.create('account')
+ account:format({
+ {'account_id', 'unsigned'},
+ {'customer_id', 'unsigned'},
+ {'bucket_id', 'unsigned'},
+ {'balance', 'unsigned'},
+ {'name', 'string'},
+ })
+ account:create_index('account_id', {parts = {'account_id'}})
+ account:create_index('customer_id', {parts = {'customer_id'}, unique = false})
+ account:create_index('bucket_id', {parts = {'bucket_id'}, unique = false})
+ box.snapshot()
+
+ box.schema.func.create('customer_lookup')
+ box.schema.role.grant('public', 'execute', 'function', 'customer_lookup')
+ box.schema.func.create('customer_add')
+ box.schema.role.grant('public', 'execute', 'function', 'customer_add')
+ box.schema.func.create('echo')
+ box.schema.role.grant('public', 'execute', 'function', 'echo')
+ box.schema.func.create('sleep')
+ box.schema.role.grant('public', 'execute', 'function', 'sleep')
+ box.schema.func.create('raise_luajit_error')
+ box.schema.role.grant('public', 'execute', 'function', 'raise_luajit_error')
+ box.schema.func.create('raise_client_error')
+ box.schema.role.grant('public', 'execute', 'function', 'raise_client_error')
+end)
+
+function customer_add(customer)
+ box.begin()
+ box.space.customer:insert({customer.customer_id, customer.bucket_id,
+ customer.name})
+ for _, account in ipairs(customer.accounts) do
+ box.space.account:insert({
+ account.account_id,
+ customer.customer_id,
+ customer.bucket_id,
+ 0,
+ account.name
+ })
+ end
+ box.commit()
+ return true
+end
+
+function customer_lookup(customer_id)
+ if type(customer_id) ~= 'number' then
+ error('Usage: customer_lookup(customer_id)')
+ end
+
+ local customer = box.space.customer:get(customer_id)
+ if customer == nil then
+ return nil
+ end
+ customer = {
+ customer_id = customer.customer_id;
+ name = customer.name;
+ }
+ local accounts = {}
+ for _, account in box.space.account.index.customer_id:pairs(customer_id) do
+ table.insert(accounts, {
+ account_id = account.account_id;
+ name = account.name;
+ balance = account.balance;
+ })
+ end
+ customer.accounts = accounts;
+ return customer
+end
+
+function echo(...)
+ return ...
+end
+
+function sleep(time)
+ fiber.sleep(time)
+ return true
+end
+
+function raise_luajit_error()
+ assert(1 == 2)
+end
+
+function raise_client_error()
+ box.error(box.error.UNKNOWN)
+end
diff --git a/test/reload_evolution/storage_1_b.lua b/test/reload_evolution/storage_1_b.lua
new file mode 120000
index 0000000..02572da
--- /dev/null
+++ b/test/reload_evolution/storage_1_b.lua
@@ -0,0 +1 @@
+storage_1_a.lua
\ No newline at end of file
diff --git a/test/reload_evolution/storage_2_a.lua b/test/reload_evolution/storage_2_a.lua
new file mode 120000
index 0000000..02572da
--- /dev/null
+++ b/test/reload_evolution/storage_2_a.lua
@@ -0,0 +1 @@
+storage_1_a.lua
\ No newline at end of file
diff --git a/test/reload_evolution/storage_2_b.lua b/test/reload_evolution/storage_2_b.lua
new file mode 120000
index 0000000..02572da
--- /dev/null
+++ b/test/reload_evolution/storage_2_b.lua
@@ -0,0 +1 @@
+storage_1_a.lua
\ No newline at end of file
diff --git a/test/reload_evolution/suite.ini b/test/reload_evolution/suite.ini
new file mode 100644
index 0000000..bb5435b
--- /dev/null
+++ b/test/reload_evolution/suite.ini
@@ -0,0 +1,6 @@
+[default]
+core = tarantool
+description = Reload evolution tests
+script = test.lua
+is_parallel = False
+lua_libs = ../lua_libs/util.lua ../lua_libs/git_util.lua ../../example/localcfg.lua
diff --git a/test/reload_evolution/test.lua b/test/reload_evolution/test.lua
new file mode 100644
index 0000000..ad0543a
--- /dev/null
+++ b/test/reload_evolution/test.lua
@@ -0,0 +1,9 @@
+#!/usr/bin/env tarantool
+
+require('strict').on()
+
+box.cfg{
+ listen = os.getenv("LISTEN"),
+}
+
+require('console').listen(os.getenv('ADMIN'))
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index bf560e6..1740c98 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -10,6 +10,7 @@ if rawget(_G, MODULE_INTERNALS) then
local vshard_modules = {
'vshard.consts', 'vshard.error', 'vshard.cfg',
'vshard.replicaset', 'vshard.util',
+ 'vshard.storage.reload_evolution'
}
for _, module in pairs(vshard_modules) do
package.loaded[module] = nil
@@ -20,12 +21,16 @@ local lerror = require('vshard.error')
local lcfg = require('vshard.cfg')
local lreplicaset = require('vshard.replicaset')
local util = require('vshard.util')
+local reload_evolution = require('vshard.storage.reload_evolution')
local M = rawget(_G, MODULE_INTERNALS)
if not M then
--
-- The module is loaded for the first time.
--
+ -- !!!WARNING: any change of this table must be reflected in
+ -- `vshard.storage.reload_evolution` module to guarantee
+ -- reloadability of the module.
M = {
---------------- Common module attributes ----------------
-- The last passed configuration.
@@ -105,6 +110,11 @@ if not M then
-- a destination replicaset must drop already received
-- data.
rebalancer_sending_bucket = 0,
+
+ ------------------------- Reload -------------------------
+ -- Version of the loaded module. This number is used on
+ -- reload to determine which upgrade scripts to run.
+ reload_evolution_version = reload_evolution.version,
}
end
@@ -1863,6 +1873,7 @@ end
if not rawget(_G, MODULE_INTERNALS) then
rawset(_G, MODULE_INTERNALS, M)
else
+ reload_evolution.upgrade(M)
storage_cfg(M.current_cfg, M.this_replica.uuid)
M.module_version = M.module_version + 1
end
diff --git a/vshard/storage/reload_evolution.lua b/vshard/storage/reload_evolution.lua
new file mode 100644
index 0000000..cfac888
--- /dev/null
+++ b/vshard/storage/reload_evolution.lua
@@ -0,0 +1,58 @@
+--
+-- This module is used to upgrade the vshard.storage on the fly.
+-- It updates internal Lua structures in case they are changed
+-- in a commit.
+--
+local log = require('log')
+
+--
+-- Array of upgrade functions.
+-- migrations[i] is a function which upgrades the module from
+-- version `i - 1` to version `i`.
+--
+local migrations = {}
+
+-- Initialize the reload_evolution mechanism.
+migrations[#migrations + 1] = function (M)
+ -- Code to update Lua objects.
+end
+
+--
+-- Perform an upgrade based on the version stored in `M`
+-- (the module internals).
+-- @param M Old module internals which should be updated.
+--
+local function upgrade(M)
+ local start_version = M.reload_evolution_version or 0
+ if start_version > #migrations then
+ local err_msg = string.format(
+ 'vshard.storage.reload_evolution: ' ..
+ 'auto-downgrade is not implemented; ' ..
+ 'loaded version is %d, upgrade script version is %d',
+ start_version, #migrations
+ )
+ log.error(err_msg)
+ error(err_msg)
+ end
+ for i = start_version, #migrations do
+ local ok, err = pcall(migrations[i], M)
+ if ok then
+ log.info('vshard.storage.reload_evolution: upgraded to %d version',
+ i)
+ else
+ local err_msg = string.format(
+ 'vshard.storage.reload_evolution: ' ..
+ 'error during upgrade to %d version: %s', i, err
+ )
+ log.error(err_msg)
+ error(err_msg)
+ end
+ -- Update the version just after upgrade to have an
+ -- actual version in case of an error.
+ M.reload_evolution_version = i
+ end
+end
+
+return {
+ version = #migrations,
+ upgrade = upgrade,
+}
--
2.14.1
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 1/3] Add test on error during reconfigure
2018-07-18 17:47 ` [tarantool-patches] [PATCH 1/3] Add test on error during reconfigure AKhatskevich
@ 2018-07-19 15:14 ` Vladislav Shpilevoy
2018-07-19 20:33 ` Alex Khatskevich
0 siblings, 1 reply; 16+ messages in thread
From: Vladislav Shpilevoy @ 2018-07-19 15:14 UTC (permalink / raw)
To: tarantool-patches, AKhatskevich
Hi! Thanks for the patch! See 3 comments below.
On 18/07/2018 20:47, AKhatskevich wrote:
> In case reconfigure process fails, the node should continue
> work properly.
> ---
> test/lua_libs/util.lua | 16 ++++++++++++++++
> test/router/router.result | 33 +++++++++++++++++++++++++++++++++
> test/router/router.test.lua | 10 ++++++++++
> test/storage/storage.result | 39 +++++++++++++++++++++++++++++++++++++++
> test/storage/storage.test.lua | 12 ++++++++++++
> vshard/router/init.lua | 7 +++++++
> vshard/storage/init.lua | 9 +++++++++
> 7 files changed, 126 insertions(+)
>
> diff --git a/test/lua_libs/util.lua b/test/lua_libs/util.lua
> index f2d3b48..aeb2342 100644
> --- a/test/lua_libs/util.lua
> +++ b/test/lua_libs/util.lua
> @@ -69,9 +69,25 @@ local function wait_master(test_run, replicaset, master)
> log.info('Slaves are connected to a master "%s"', master)
> end
>
> +-- Check that data has at least all fields as an ethalon.
1. Typo here and in other places: ethalon -> etalon.
2. Please, describe the function in doxygen style since
it is not a trivial one-liner.
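For example, something in this spirit (only a sketch):

    --
    -- Check that `data` has at least all the fields of `etalon`
    -- with the same values.
    -- @param etalon Table with the reference fields.
    -- @param data Table to be checked.
    -- @retval true Data contains all the etalon fields.
    -- @retval false, diff Array of the field names which differ.
    --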
> +local function has_same_fields(ethalon, data)
> + assert(type(ethalon) == 'table' and type(data) == 'table')
> + local diff = {}
> + for k, v in pairs(ethalon) do
> + if v ~= data[k] then
> + table.insert(diff, k)
> + end
> + end
> + if #diff > 0 then
> + return false, diff
> + end
> + return true
> +end
> +
> return {
> check_error = check_error,
> shuffle_masters = shuffle_masters,
> collect_timeouts = collect_timeouts,
> wait_master = wait_master,
> + has_same_fields = has_same_fields,
> }
> diff --git a/test/router/router.result b/test/router/router.result
> index 15f4fd0..7ec3e15 100644
> --- a/test/router/router.result
> +++ b/test/router/router.result
> @@ -1156,6 +1156,39 @@ util.check_error(vshard.router.cfg, non_dynamic_cfg)
> ---
> - Non-dynamic option shard_index cannot be reconfigured
> ...
> +-- Error during reconfigure process.
> +vshard.router.route(1):callro('echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +vshard.router.internal.errinj.ERRINJ_CFG = true
> +---
> +...
> +old_internal = table.copy(vshard.router.internal)
> +---
> +...
> +_, err = pcall(vshard.router.cfg, cfg)
> +---
> +...
> +err:match('Error injection:.*')
3. P l e a s e. Again. Do not use pcall + match. Use
util.check_error.
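A sketch of what it should look like (util.check_error pcalls the
function and returns the error, so the message lands in the .result
file directly):

    util.check_error(vshard.router.cfg, cfg)
    ---
    - 'Error injection: cfg'
    ...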
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 3/3] Introduce storage reload evolution
2018-07-18 17:47 ` [tarantool-patches] [PATCH 3/3] Introduce storage reload evolution AKhatskevich
@ 2018-07-19 15:14 ` Vladislav Shpilevoy
2018-07-20 11:32 ` Alex Khatskevich
0 siblings, 1 reply; 16+ messages in thread
From: Vladislav Shpilevoy @ 2018-07-19 15:14 UTC (permalink / raw)
To: tarantool-patches, AKhatskevich
Thanks for the patch! See 17 comments below.
On 18/07/2018 20:47, AKhatskevich wrote:
> Changes:
> 1. Introduce storage reload evolution.
> 2. Setup cross-version reload testing.
>
> 1:
> This mechanism updates Lua objects on reload in case they are
> changed in a new vshard.storage version.
>
> Since this commit, any change in vshard.storage.M has to be
> reflected in vshard.storage.reload_evolution to guarantee
> correct reload.
>
> 2:
> The testing uses git infrastructure and is performed in the following
> way:
> 1. Copy old version of vshard to a temp folder.
> 2. Run vshard on this code.
> 3. Checkout the latest version of the vshard sources.
> 4. Reload vshard storage.
> 5. Make sure it works (Perform simple tests).
>
> Notes:
> * this patch contains some legacy-driven decisions:
> 1. SOURCEDIR path retrieved differentpy in case of
1. Typo: differentpy.
> packpack build.
> 2. git directory in the `reload_evolution/storage` test
> is copied with respect to Centos 7 and `ro` mode of
> SOURCEDIR.
>
> Closes <shut git>112 125
2. Stray 'shut'.
> ---
> .travis.yml | 2 +-
> rpm/prebuild.sh | 2 +
> test/lua_libs/git_util.lua | 39 +++++++
> test/lua_libs/util.lua | 20 ++++
> test/reload_evolution/storage.result | 184 +++++++++++++++++++++++++++++++++
> test/reload_evolution/storage.test.lua | 64 ++++++++++++
> test/reload_evolution/storage_1_a.lua | 144 ++++++++++++++++++++++++++
> test/reload_evolution/storage_1_b.lua | 1 +
> test/reload_evolution/storage_2_a.lua | 1 +
> test/reload_evolution/storage_2_b.lua | 1 +
> test/reload_evolution/suite.ini | 6 ++
> test/reload_evolution/test.lua | 9 ++
> vshard/storage/init.lua | 11 ++
> vshard/storage/reload_evolution.lua | 58 +++++++++++
> 14 files changed, 541 insertions(+), 1 deletion(-)
> create mode 100644 test/lua_libs/git_util.lua
> create mode 100644 test/reload_evolution/storage.result
> create mode 100644 test/reload_evolution/storage.test.lua
> create mode 100755 test/reload_evolution/storage_1_a.lua
> create mode 120000 test/reload_evolution/storage_1_b.lua
> create mode 120000 test/reload_evolution/storage_2_a.lua
> create mode 120000 test/reload_evolution/storage_2_b.lua
> create mode 100644 test/reload_evolution/suite.ini
> create mode 100644 test/reload_evolution/test.lua
> create mode 100644 vshard/storage/reload_evolution.lua
>
> diff --git a/rpm/prebuild.sh b/rpm/prebuild.sh
> index 768b22b..554032b 100755
> --- a/rpm/prebuild.sh
> +++ b/rpm/prebuild.sh
> @@ -1 +1,3 @@
> curl -s https://packagecloud.io/install/repositories/tarantool/1_9/script.rpm.sh | sudo bash
> +sudo yum -y install python-devel python-pip
> +sudo pip install tarantool msgpack
3. Why do you need it? As I understand, the 'curl' above installs all
the needed things. Besides, pip install downloads Tarantool 1.9, which
is not necessary.
> diff --git a/test/lua_libs/git_util.lua b/test/lua_libs/git_util.lua
> new file mode 100644
> index 0000000..e2c17d0
> --- /dev/null
> +++ b/test/lua_libs/git_util.lua
> @@ -0,0 +1,39 @@
> +--
> +-- Lua bridge for some of the git commands.
> +--
> +local os = require('os')
> +
> +local temp_file = 'some_strange_rare_unique_file_name_for_git_util'
> +local function exec_cmd(options, cmd, args, files, dir, fout)
4. You can remove 'options' arg (it is always '' as I see).
> + files = files or ''
> + options = options or ''
> + args = args or ''
5. files, args, options are always non-nil.
> + local shell_cmd
> + shell_cmd = string.format('git %s %s %s %s', options, cmd, args, files)
6. Why do you need to declare shell_cmd in advance? Just do
local shell_cmd = ...
> + if fout then
7. 'fout' is always nil.
> + shell_cmd = shell_cmd .. ' >' .. fout
> + end
> + if dir then
8. 'dir' is always non-nil.
> + shell_cmd = string.format('cd %s && %s', dir, shell_cmd)
> + end
> + local res = os.execute(shell_cmd)
> + assert(res == 0, 'Git cmd error: ' .. res)
> +end
> +
> +local function log_hashes(options, args, files, dir)
> + args = args .. " --format='%h'"
> + local local_temp_file = string.format('%s/%s', os.getenv('PWD'), temp_file)
9. Instead of writing output into a temporary file use
http://pgl.yoyo.org/luai/i/io.popen.
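E.g. (an untested sketch):

    local f = io.popen(shell_cmd)
    local lines = {}
    for line in f:lines() do
        table.insert(lines, line)
    end
    f:close()
    return lines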
> + exec_cmd(options, 'log', args, files, dir, local_temp_file)
> + local lines = {}
> + for line in io.lines(local_temp_file) do
> + table.insert(lines, line)
> + end
> + os.remove(local_temp_file)
> + return lines
> +end
> +
> +
> +return {
> + exec_cmd = exec_cmd,
> + log_hashes = log_hashes
> +}
> diff --git a/test/reload_evolution/storage.result b/test/reload_evolution/storage.result
> new file mode 100644
> index 0000000..2cf21fd
> --- /dev/null
> +++ b/test/reload_evolution/storage.result
> @@ -0,0 +1,184 @@
> +test_run = require('test_run').new()
> +---
> +...
> +git_util = require('git_util')
> +---
> +...
> +util = require('util')
> +---
> +...
> +vshard_copy_path = util.BUILDDIR .. '/test/var/vshard_git_tree_copy'
> +---
> +...
> +evolution_log = git_util.log_hashes('', '', 'vshard/storage/reload_evolution.lua', util.SOURCEDIR)
> +---
> +...
> +-- Cleanup the directory after a previous build.
> +_ = os.execute('rm -rf ' .. vshard_copy_path)
> +---
> +...
> +-- 1. `git worktree` cannot be used because PACKPACK mounts
> +-- `/source/` in `ro` mode.
> +-- 2. Just `cp -rf` cannot be used due to a little different
> +-- behavior in Centos 7.
> +_ = os.execute('mkdir ' .. vshard_copy_path)
> +---
> +...
> +_ = os.execute("cd " .. util.SOURCEDIR .. ' && cp -rf `ls -A --ignore=build` ' .. vshard_copy_path)
> +---
> +...
> +-- Checkout the first commit with a reload_evolution mechanism.
> +git_util.exec_cmd('', 'checkout', '-f', '', vshard_copy_path)
> +---
> +...
> +git_util.exec_cmd('', 'checkout', evolution_log[#evolution_log] .. '~1', '', vshard_copy_path)
> +---
> +...
> +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' }
> +---
> +...
> +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' }
> +---
> +...
> +test_run:create_cluster(REPLICASET_1, 'reload_evolution')
> +---
> +...
> +test_run:create_cluster(REPLICASET_2, 'reload_evolution')
> +---
> +...
> +util = require('util')
> +---
> +...
> +util.wait_master(test_run, REPLICASET_1, 'storage_1_a')
> +---
> +...
> +util.wait_master(test_run, REPLICASET_2, 'storage_2_a')
> +---
> +...
> +test_run:switch('storage_1_a')
> +---
> +- true
> +...
> +vshard.storage.internal.reload_evolution_version
> +---
> +- null
> +...
> +vshard.storage.bucket_force_create(1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
> +---
> +- true
> +...
> +box.space.customer:insert({1, 1, 'customer_name'})
> +---
> +- [1, 1, 'customer_name']
> +...
> +test_run:switch('storage_2_a')
> +---
> +- true
> +...
> +fiber = require('fiber')
> +---
> +...
> +vshard.storage.bucket_force_create(vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
> +---
> +- true
> +...
> +while test_run:grep_log('storage_2_a', 'The cluster is balanced ok') == nil do vshard.storage.rebalancer_wakeup() fiber.sleep(0.1) end
> +---
> +...
> +test_run:switch('default')
> +---
> +- true
> +...
> +git_util.exec_cmd('', 'checkout', evolution_log[1], '', vshard_copy_path)
> +---
> +...
> +test_run:switch('storage_1_a')
> +---
> +- true
> +...
> +package.loaded["vshard.storage"] = nil
> +---
> +...
> +vshard.storage = require("vshard.storage")
> +---
> +...
> +test_run:grep_log('storage_1_a', 'vshard.storage.reload_evolution: upgraded to') ~= nil
> +---
> +- true
> +...
> +vshard.storage.internal.reload_evolution_version
> +---
> +- 1
> +...
> +-- Make sure storage operates well.
> +vshard.storage.bucket_force_drop(2)
> +---
> +- true
> +...
> +vshard.storage.bucket_force_create(2)
10. This test is too simple. Force_drop/create merely do DML on _bucket.
Let's test the rebalancer by creating a dis-balance. For example, put +10
buckets on storage_1 and -10 buckets on storage_2, and wait for the balance.
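Something like this (a sketch; `first` and `n` are arbitrary names here):

    -- On storage_2: drop n buckets to create a dis-balance.
    for i = first, first + n - 1 do box.space._bucket:delete{i} end
    -- On storage_1: recreate the same n buckets.
    vshard.storage.bucket_force_create(first, n)
    -- Wait until the rebalancer repairs the balance.
    wait_rebalancer_state('The cluster is balanced ok', test_run)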
> diff --git a/test/reload_evolution/storage_1_a.lua b/test/reload_evolution/storage_1_a.lua
> new file mode 100755
> index 0000000..3e03f8f
> --- /dev/null
> +++ b/test/reload_evolution/storage_1_a.lua
> @@ -0,0 +1,144 @@
> +#!/usr/bin/env tarantool
11. Just use a symbolic link to the existing storage_1_a. Same
for other storages.
> diff --git a/test/reload_evolution/suite.ini b/test/reload_evolution/suite.ini
> new file mode 100644
> index 0000000..bb5435b
> --- /dev/null
> +++ b/test/reload_evolution/suite.ini
12. You do not need a new test suite for one storage test. Please,
put it into test/storage/
> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
> index bf560e6..1740c98 100644
> --- a/vshard/storage/init.lua
> +++ b/vshard/storage/init.lua
> @@ -105,6 +110,11 @@ if not M then
> -- a destination replicaset must drop already received
> -- data.
> rebalancer_sending_bucket = 0,
> +
> + ------------------------- Reload -------------------------
> + -- Version of the loaded module. This number is used on
> + -- reload to determine which upgrade scripts to run.
> + reload_evolution_version = reload_evolution.version,
13. Use 'version' name.
> }
> end
>
> diff --git a/vshard/storage/reload_evolution.lua b/vshard/storage/reload_evolution.lua
> new file mode 100644
> index 0000000..cfac888
> --- /dev/null
> +++ b/vshard/storage/reload_evolution.lua
> @@ -0,0 +1,58 @@
> +--
> +-- This module is used to upgrade the vshard.storage on the fly.
> +-- It updates internal Lua structures in case they are changed
> +-- in a commit.
> +--
> +local log = require('log')
> +
> +--
> +-- Array of upgrade functions.
> +-- magrations[version] = function which upgrades module version
14. Typo: magrations.
> +-- from `version` to `version + 1`.
15. Not +1. I think we should use real version numbers:
0.1.1, 0.1.2, etc., similar to tarantool.
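E.g. (a sketch of the proposal):

    migrations['0.1.1'] = function(M) --[[ update M ]] end
    migrations['0.1.2'] = function(M) --[[ update M ]] end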
> +--
> +local migrations = {}
> +
> +-- Initialize reload_upgrade mechanism
> +migrations[#migrations + 1] = function (M)
> + -- Code to update Lua objects.
> +end
> +
> +--
> +-- Perform an update based on a version stored in `M` (internals).
> +-- @param M Old module internals which should be updated.
> +--
> +local function upgrade(M)
> + local start_version = M.reload_evolution_version or 1
> + if start_version > #migrations then
> + local err_msg = string.format(
> + 'vshard.storage.reload_evolution: ' ..
> + 'auto-downgrade is not implemented; ' ..
> + 'loaded version is %d, upgrade script version is %d',
> + start_version, #migrations
16. Did you test it? I do not see a test.
> + )
> + log.error(err_msg)
> + error(err_msg)
> + end
> + for i = start_version, #migrations do
> + local ok, err = pcall(migrations[i], M)
> + if ok then
> + log.info('vshard.storage.reload_evolution: upgraded to %d version',
> + i)
> + else
> + local err_msg = string.format(
> + 'vshard.storage.reload_evolution: ' ..
> + 'error during upgrade to %d version: %s', i, err
> + )
> + log.error(err_msg)
> + error(err_msg)
> + end
> + -- Update the version just after upgrade to have an
> + -- actual version in case of an error.
> + M.reload_evolution_version = i
> + end
> +end
> +
> +return {
> + version = #migrations,
17. Where do you use it?
> + upgrade = upgrade,
> +}
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 2/3] Complete module reload
2018-07-18 17:47 ` [tarantool-patches] [PATCH 2/3] Complete module reload AKhatskevich
@ 2018-07-19 15:14 ` Vladislav Shpilevoy
2018-07-19 20:32 ` Alex Khatskevich
0 siblings, 1 reply; 16+ messages in thread
From: Vladislav Shpilevoy @ 2018-07-19 15:14 UTC (permalink / raw)
To: tarantool-patches, AKhatskevich
Thanks for the patch! See 13 comments below.
On 18/07/2018 20:47, AKhatskevich wrote:
> In case one need to upgrade vshard to a new version, this commit
> improves reload mechanism to allow to do that for a wider variety of
> possible changes (between two versions).
>
> Changes:
> * introduce cfg option `connection_outdate_delay`
> * improve reload mechanism
> * add `util.async_task` method, which runs a function after a
> delay
> * delete replicaset:rebind_connections method as it is replaced
> with `rebind_replicasets` which updates all replicasets at once
>
> Reload mechanism:
> * reload all vshard modules
> * create new `replicaset` and `replica` objects
> * reuse old netbox connections in new replica objects if
> possible
> * update router/storage.internal table
> * after a `connection_outdate_delay` disable old instances of
> `replicaset` and `replica` objects
>
> Reload works for modules:
> * vshard.router
> * vshard.storage
>
> Here is a module reload algorithm:
> * old vshard is working
> * delete old vshard src
> * install new vshard
> * call: package.loaded['vshard.router'] = nil
> * call: old_router = vshard.router -- Save working router copy.
> * call: vshard.router = require('vshard.router')
> * if require fails: continue using old_router
> * if require succeeds: use vshard.router
>
> In case reload process fails, old router/storage module, replicaset and
> replica objects continue working properly. If reload succeeds, all old
> objects would be deprecated.
>
> Extra changes:
> * introduce MODULE_INTERNALS which stores name of the module
> internal data in the global namespace
>
> Part of <shut git>112
1. Stray '<shut git>'.
> ---
> test/router/reload.result | 159 +++++++++++++++++++++++++++++++++++++++++++
> test/router/reload.test.lua | 48 +++++++++++++
> test/router/router.result | 3 +-
> test/storage/reload.result | 68 ++++++++++++++++++
> test/storage/reload.test.lua | 23 +++++++
> vshard/cfg.lua | 6 ++
> vshard/error.lua | 5 ++
> vshard/replicaset.lua | 100 ++++++++++++++++++++-------
> vshard/router/init.lua | 44 ++++++++----
> vshard/storage/init.lua | 45 ++++++++----
> vshard/util.lua | 20 ++++++
> 11 files changed, 466 insertions(+), 55 deletions(-)
>
> diff --git a/test/router/reload.result b/test/router/reload.result
> index 47f3c2e..3fbbe6e 100644
> --- a/test/router/reload.result
> +++ b/test/router/reload.result
> @@ -174,6 +174,165 @@ vshard.router.module_version()
> check_reloaded()
> ---
> ...
> +--
> +-- Outdate old replicaset and replica objets.
> +--
> +rs = vshard.router.route(1)
> +---
> +...
> +rs:callro('echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +package.loaded["vshard.router"] = nil
> +---
> +...
> +_ = require('vshard.router')
> +---
> +...
> +-- Make sure outdate async task has had cpu time.
> +fiber.sleep(0.005)
2. As I asked earlier, please, avoid constant timeouts.
When you want to wait for something, use 'while'.
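E.g. (a sketch, where cond is whatever the test actually awaits):

    while not cond() do fiber.sleep(0.01) end

instead of a fixed fiber.sleep(0.005).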
> +---
> +...
> +rs.callro(rs, 'echo', {'some_data'})
> +---
> +- null
> +- type: ShardingError
> + name: OBJECT_IS_OUTDATED
> + message: Object is outdated after module reload/reconfigure. Use new instance.
> + code: 20
> +...
> +vshard.router = require('vshard.router')
> +---
> +...
> +rs = vshard.router.route(1)
> +---
> +...
> +rs:callro('echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +-- Test `connection_outdate_delay`.
> +old_connection_delay = cfg.connection_outdate_delay
> +---
> +...
> +cfg.connection_outdate_delay = 0.3
> +---
> +...
> +vshard.router.cfg(cfg)
> +---
> +...
> +cfg.connection_outdate_delay = old_connection_delay
> +---
> +...
> +vshard.router.internal.connection_outdate_delay = nil
> +---
> +...
> +vshard.router = require('vshard.router')
3. You have already done it a few lines above.
> +---
> +...
> +rs_new = vshard.router.route(1)
> +---
> +...
> +rs_old = rs
> +---
> +...
> +_, replica_old = next(rs_old.replicas)
> +---
> +...
> +rs_new:callro('echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +-- Check old objets are still valid.
> +rs_old:callro('echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +replica_old.conn ~= nil
> +---
> +- true
> +...
> +fiber.sleep(0.2)
> +---
> +...
> +rs_old:callro('echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +replica_old.conn ~= nil
> +---
> +- true
> +...
> +replica_old.outdated == nil
> +---
> +- true
> +...
> +fiber.sleep(0.2)
> +---
> +...
> +rs_old:callro('echo', {'some_data'})
> +---
> +- null
> +- type: ShardingError
> + name: OBJECT_IS_OUTDATED
> + message: Object is outdated after module reload/reconfigure. Use new instance.
> + code: 20
> +...
> +replica_old.conn == nil
> +---
> +- true
> +...
> +replica_old.outdated == true
> +---
> +- true
> +...
> +rs_new:callro('echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +-- Error during reconfigure process.
4. You added this test in the previous commit.
> +_ = vshard.router.route(1):callro('echo', {'some_data'})
> +---
> +...
> +vshard.router.internal.errinj.ERRINJ_CFG = true
> +---
> +...
> +old_internal = table.copy(vshard.router.internal)
> +---
> +...
> +package.loaded["vshard.router"] = nil
> +---
> +...
> +_, err = pcall(require, 'vshard.router')
> +---
> +...
> +err:match('Error injection:.*')
> +---
> +- 'Error injection: cfg'
> +...
> +vshard.router.internal.errinj.ERRINJ_CFG = false
> +---
> +...
> +util.has_same_fields(old_internal, vshard.router.internal)
> +---
> +- true
> +...
> +_ = vshard.router.route(1):callro('echo', {'some_data'})
> +---
> +...
> test_run:switch('default')
> ---
> - true
> diff --git a/test/storage/reload.result b/test/storage/reload.result
> index 531d984..0281e27 100644
> --- a/test/storage/reload.result
> +++ b/test/storage/reload.result
> @@ -174,6 +174,74 @@ vshard.storage.module_version()
> check_reloaded()
> ---
> ...
> +--
> +-- Outdate old replicaset and replica objets.
> +--
> +_, rs = next(vshard.storage.internal.replicasets)
> +---
> +...
> +package.loaded["vshard.storage"] = nil
> +---
> +...
> +_ = require('vshard.storage')
> +---
> +...
> +rs.callro(rs, 'echo', {'some_data'})
> +---
> +- null
> +- type: ShardingError
> + name: OBJECT_IS_OUTDATED
> + message: Object is outdated after module reload/reconfigure. Use new instance.
> + code: 20
> +...
> +_, rs = next(vshard.storage.internal.replicasets)
> +---
> +...
> +rs.callro(rs, 'echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +-- Error during reload process.
> +_, rs = next(vshard.storage.internal.replicasets)
> +---
> +...
> +rs:callro('echo', {'some_data'})
> +---
> +- some_data
> +- null
> +- null
> +...
> +vshard.storage.internal.errinj.ERRINJ_CFG = true
5. Same as 4. We have already added this test in the previous
commit, haven't we?
> +---
> +...
> +old_internal = table.copy(vshard.storage.internal)
> +---
> +...
> +package.loaded["vshard.storage"] = nil
> +---
> +...
> +_, err = pcall(require, 'vshard.storage')
> +---
> +...
> +err:match('Error injection:.*')
> +---
> +- 'Error injection: cfg'
> +...
> +vshard.storage.internal.errinj.ERRINJ_CFG = false
> +---
> +...
> +util.has_same_fields(old_internal, vshard.storage.internal)
> +---
> +- true
> +...
> +_, rs = next(vshard.storage.internal.replicasets)
> +---
> +...
> +_ = rs:callro('echo', {'some_data'})
> +---
> +...
> test_run:switch('default')
> ---
> - true
> diff --git a/vshard/cfg.lua b/vshard/cfg.lua
> index d5429af..87d0fc8 100644
> --- a/vshard/cfg.lua
> +++ b/vshard/cfg.lua
> @@ -217,6 +217,10 @@ local cfg_template = {
> type = 'non-negative number', name = 'Sync timeout', is_optional = true,
> default = consts.DEFAULT_SYNC_TIMEOUT
> }},
> + {'connection_outdate_delay', {
> + type = 'non-negative number', name = 'Object outdate timeout',
> + is_optional = true, default = nil
6. default = nil makes no sense for Lua tables.
> + }},
> }
>
> --
> @@ -264,6 +268,8 @@ local function remove_non_box_options(cfg)
> cfg.collect_bucket_garbage_interval = nil
> cfg.collect_lua_garbage = nil
> cfg.sync_timeout = nil
> + cfg.sync_timeout = nil
7. Why do you nullify sync_timeout a second time?
> + cfg.connection_outdate_delay = nil
> end
>
> return {
> diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
> index 99f59aa..ec6e95b 100644
> --- a/vshard/replicaset.lua
> +++ b/vshard/replicaset.lua
> @@ -412,6 +424,49 @@ for name, func in pairs(replica_mt.__index) do
> end
> replica_mt.__index = index
>
> +--
> +-- Meta-methods of outdated objects
8. Put the dot at the end of the sentence.
> +-- They define only arrtibutes from corresponding metatables to
9. Typo: arrtibutes.
> +-- make user able to access fields of old objects.
> +--
> +local function outdated_warning(...)
> + return nil, lerror.vshard(lerror.code.OBJECT_IS_OUTDATED)
> +end
> +
> +local outdated_replicaset_mt = {
> + __index = {
> + outdated = true
> + }
> +}
10. outdated_replicaset/replica_mt are identical. Please,
make one mt object named 'outdated_mt'.
> +for fname, func in pairs(replicaset_mt.__index) do
> + outdated_replicaset_mt.__index[fname] = outdated_warning
> +end
11. As I remember, my proposal was not to duplicate each method here,
but to return, on any __index, a callable object that invokes
outdated_warning when called.
You can ask, what to do with 'outdated' attribute then, but
you can set it directly in the object in outdate_replicasets()
function.
What is more, please, use 'is_outdated' since this value is a
flag. And add it to the description on top of the file, where
all attributes are documented.
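A sketch of the idea:

    local outdated_mt = {
        __index = function(self, key)
            -- Any method looked up on an outdated object
            -- reports the error when called.
            return outdated_warning
        end,
    }

Setting 'is_outdated' directly in the object makes a plain index find
the flag before __index fires.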
> +
> +local outdated_replica_mt = {
> + __index = {
> + outdated = true
> + }
> +}
> +for fname, func in pairs(replica_mt.__index) do
> + outdated_replica_mt.__index[fname] = outdated_warning
> +end
> +
> +--
> +-- Outdate replicaset and replica objects:
> +-- * Set outdated_metatables.
> +-- * Remove connections.
> +--
> +outdate_replicasets = function(replicasets)
> + for _, replicaset in pairs(replicasets) do
> + setmetatable(replicaset, outdated_replicaset_mt)
> + for _, replica in pairs(replicaset.replicas) do
> + setmetatable(replica, outdated_replica_mt)
> + replica.conn = nil
Here you can put 'replica.is_outdated = true'.
> + end
Here you can put 'replicaset.is_outdated = true'.
> + end
> + log.info('Old replicaset and replica objects are outdated.')
> +end
> +
> --
> -- Calculate for each replicaset its etalon bucket count.
> -- Iterative algorithm is used to learn the best balance in a
> diff --git a/vshard/router/init.lua b/vshard/router/init.lua
> index a143070..a8f6bbc 100644
> --- a/vshard/router/init.lua
> +++ b/vshard/router/init.lua
> @@ -479,12 +493,13 @@ local function router_cfg(cfg)
> else
> log.info('Starting router reconfiguration')
> end
> - local new_replicasets = lreplicaset.buildall(cfg, M.replicasets)
> + local new_replicasets = lreplicaset.buildall(cfg)
> local total_bucket_count = cfg.bucket_count
> local collect_lua_garbage = cfg.collect_lua_garbage
> - lcfg.remove_non_box_options(cfg)
> + local box_cfg = table.deepcopy(cfg)
12. As I remember, I asked to use table.copy when
possible. And this case looks appropriate.
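I.e. (a sketch):

    local box_cfg = table.copy(cfg)

A shallow copy looks sufficient here, since remove_non_box_options()
only nils out top-level options.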
> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
> index 052e94f..bf560e6 100644
> --- a/vshard/storage/init.lua
> +++ b/vshard/storage/init.lua
> @@ -2,20 +2,34 @@ local log = require('log')
> local luri = require('uri')
> local lfiber = require('fiber')
> local netbox = require('net.box') -- for net.box:self()
> +local trigger = require('internal.trigger')
> +
> +local MODULE_INTERNALS = '__module_vshard_storage'
> +-- Reload requirements, in case this module is reloaded manually.
> +if rawget(_G, MODULE_INTERNALS) then
> + local vshard_modules = {
> + 'vshard.consts', 'vshard.error', 'vshard.cfg',
> + 'vshard.replicaset', 'vshard.util',
> + }
> + for _, module in pairs(vshard_modules) do
> + package.loaded[module] = nil
> + end
> +end
> local consts = require('vshard.consts')
> local lerror = require('vshard.error')
> -local util = require('vshard.util')
> local lcfg = require('vshard.cfg')
> local lreplicaset = require('vshard.replicaset')
> -local trigger = require('internal.trigger')
> +local util = require('vshard.util')
>
> -local M = rawget(_G, '__module_vshard_storage')
> +local M = rawget(_G, MODULE_INTERNALS)
> if not M then
> --
> -- The module is loaded for the first time.
> --
> M = {
> ---------------- Common module attributes ----------------
> + -- The last passed configuration.
> + current_cfg = nil,
13. Please, add the same assignment to the router module
initialization.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 2/3] Complete module reload
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
@ 2018-07-19 20:32 ` Alex Khatskevich
2018-07-20 11:34 ` Alex Khatskevich
0 siblings, 1 reply; 16+ messages in thread
From: Alex Khatskevich @ 2018-07-19 20:32 UTC (permalink / raw)
To: Vladislav Shpilevoy, tarantool-patches
>>
>> Part of <shut git>112
>
> 1. Stray '<shut git>'.
Ok
>
>> ---
>> test/router/reload.result | 159 +++++++++++++++++++++++++++++++++++++++++++
>> test/router/reload.test.lua | 48 +++++++++++++
>> test/router/router.result | 3 +-
>> test/storage/reload.result | 68 ++++++++++++++++++
>> test/storage/reload.test.lua | 23 +++++++
>> vshard/cfg.lua | 6 ++
>> vshard/error.lua | 5 ++
>> vshard/replicaset.lua | 100 ++++++++++++++++++++-------
>> vshard/router/init.lua | 44 ++++++++----
>> vshard/storage/init.lua | 45 ++++++++----
>> vshard/util.lua | 20 ++++++
>> 11 files changed, 466 insertions(+), 55 deletions(-)
>>
>> diff --git a/test/router/reload.result b/test/router/reload.result
>> index 47f3c2e..3fbbe6e 100644
>> --- a/test/router/reload.result
>> +++ b/test/router/reload.result
>> @@ -174,6 +174,165 @@ vshard.router.module_version()
>> ...
>> +-- Make sure outdate async task has had cpu time.
>> +fiber.sleep(0.005)
>
> 2. As I asked earlier, please, avoid constant timeouts.
> When you want to wait for something, use 'while'.
Fixed
>> +---
>> +...
>> +vshard.router = require('vshard.router')
>
> 3. You have already done it a few lines above.
fixed
>> +-- Error during reconfigure process.
>
> 4. You added this test in the previous commit.
Thanks
>
>> +vshard.storage.internal.errinj.ERRINJ_CFG = true
>
> 5. Same as 4. We have already added this test in the previous
> commit, haven't we?
thanks
>>
>> diff --git a/vshard/cfg.lua b/vshard/cfg.lua
>> index d5429af..87d0fc8 100644
>> --- a/vshard/cfg.lua
>> +++ b/vshard/cfg.lua
>> @@ -217,6 +217,10 @@ local cfg_template = {
>> type = 'non-negative number', name = 'Sync timeout', is_optional = true,
>> default = consts.DEFAULT_SYNC_TIMEOUT
>> }},
>> + {'connection_outdate_delay', {
>> + type = 'non-negative number', name = 'Object outdate timeout',
>> + is_optional = true, default = nil
>
> 6. default = nil makes no sense for Lua tables.
deleted
>
>> + }},
>> }
>> --
>> @@ -264,6 +268,8 @@ local function remove_non_box_options(cfg)
>> cfg.collect_bucket_garbage_interval = nil
>> cfg.collect_lua_garbage = nil
>> cfg.sync_timeout = nil
>> + cfg.sync_timeout = nil
>
> 7. Why do you nullify sync_timeout a second time?
A rebase consequence...
>
>> + cfg.connection_outdate_delay = nil
>> end
>> return {
>> diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
>> index 99f59aa..ec6e95b 100644
>> --- a/vshard/replicaset.lua
>> +++ b/vshard/replicaset.lua
>> @@ -412,6 +424,49 @@ for name, func in pairs(replica_mt.__index) do
>> end
>> replica_mt.__index = index
>> +--
>> +-- Meta-methods of outdated objects
>
> 8. Put the dot at the end of the sentence.
fixed
>
>> +-- They define only arrtibutes from corresponding metatables to
>
> 9. Typo: arrtibutes.
fixed
>
>> +-- make user able to access fields of old objects.
>> +--
>> +local function outdated_warning(...)
>> + return nil, lerror.vshard(lerror.code.OBJECT_IS_OUTDATED)
>> +end
>> +
>> +local outdated_replicaset_mt = {
>> + __index = {
>> + outdated = true
>> + }
>> +}
>
> 10. outdated_replicaset/replica_mt are identical. Please,
> make one mt object named 'outdated_mt'.
>
>> +for fname, func in pairs(replicaset_mt.__index) do
>> + outdated_replicaset_mt.__index[fname] = outdated_warning
>> +end
>
> 11. As I remember, my proposal was not to duplicate each method here,
> but to return, on any __index, a callable object that invokes
> outdated_warning when called.
>
> You can ask, what to do with 'outdated' attribute then, but
> you can set it directly in the object in outdate_replicasets()
> function.
>
> What is more, please, use 'is_outdated' since this value is a
> flag. And add it to the description on top of the file, where
> all attributes are documented.
>
> Here you can put 'replica.is_outdated = true'.
>
> Here you can put 'replicaset.is_outdated = true'.
10, 11 - to be discussed.
There are two reasons to do it that way:
1. Replacing data objects with function wrappers does not make any
sense, because the wrapper is not called in the end. It just leads to a
strange error if the object is indexed somewhere.
2. Strange error messages sometimes appear in background fiber logs
because of reason 1.
>
>> + end
>> + log.info('Old replicaset and replica objects are outdated.')
>> +end
>> +
>> --
>> -- Calculate for each replicaset its etalon bucket count.
>> -- Iterative algorithm is used to learn the best balance in a
>> diff --git a/vshard/router/init.lua b/vshard/router/init.lua
>> index a143070..a8f6bbc 100644
>> --- a/vshard/router/init.lua
>> +++ b/vshard/router/init.lua
>> @@ -479,12 +493,13 @@ local function router_cfg(cfg)
>> else
>> log.info('Starting router reconfiguration')
>> end
>> - local new_replicasets = lreplicaset.buildall(cfg, M.replicasets)
>> + local new_replicasets = lreplicaset.buildall(cfg)
>> local total_bucket_count = cfg.bucket_count
>> local collect_lua_garbage = cfg.collect_lua_garbage
>> - lcfg.remove_non_box_options(cfg)
>> + local box_cfg = table.deepcopy(cfg)
>
> 12. As I remember, I asked to use table.copy when
> possible. And this case looks appropriate.
ok
>> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
>> index 052e94f..bf560e6 100644
>> --- a/vshard/storage/init.lua
>> +++ b/vshard/storage/init.lua
>> @@ -2,20 +2,34 @@ local log = require('log')
>> local luri = require('uri')
>> local lfiber = require('fiber')
>> local netbox = require('net.box') -- for net.box:self()
>> +local trigger = require('internal.trigger')
>> +
>> +local MODULE_INTERNALS = '__module_vshard_storage'
>> +-- Reload requirements, in case this module is reloaded manually.
>> +if rawget(_G, MODULE_INTERNALS) then
>> + local vshard_modules = {
>> + 'vshard.consts', 'vshard.error', 'vshard.cfg',
>> + 'vshard.replicaset', 'vshard.util',
>> + }
>> + for _, module in pairs(vshard_modules) do
>> + package.loaded[module] = nil
>> + end
>> +end
>> local consts = require('vshard.consts')
>> local lerror = require('vshard.error')
>> -local util = require('vshard.util')
>> local lcfg = require('vshard.cfg')
>> local lreplicaset = require('vshard.replicaset')
>> -local trigger = require('internal.trigger')
>> +local util = require('vshard.util')
>> -local M = rawget(_G, '__module_vshard_storage')
>> +local M = rawget(_G, MODULE_INTERNALS)
>> if not M then
>> --
>> -- The module is loaded for the first time.
>> --
>> M = {
>> ---------------- Common module attributes ----------------
>> + -- The last passed configuration.
>> + current_cfg = nil,
>
> 13. Please, add the same assignment to the router module
> initialization.
added
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 1/3] Add test on error during reconfigure
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
@ 2018-07-19 20:33 ` Alex Khatskevich
2018-07-20 11:34 ` Alex Khatskevich
0 siblings, 1 reply; 16+ messages in thread
From: Alex Khatskevich @ 2018-07-19 20:33 UTC (permalink / raw)
To: Vladislav Shpilevoy, tarantool-patches
>> +-- Check that data has at least all fields as an ethalon.
>
> 1. Typo here and in other places: ethalon -> etalon.
Fixed
>
> 2. Please, describe the function in doxygen style since
> it is not a trivial one-liner.
Added; however, the comment is bigger than the function...
>> +...
>> +err:match('Error injection:.*')
>
> 3. P l e a s e. Again. Do not use pcall + match. Use
> util.check_error.
Ok. Sorry. It is a very old commit.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 3/3] Introduce storage reload evolution
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
@ 2018-07-20 11:32 ` Alex Khatskevich
2018-07-20 14:15 ` Vladislav Shpilevoy
0 siblings, 1 reply; 16+ messages in thread
From: Alex Khatskevich @ 2018-07-20 11:32 UTC (permalink / raw)
To: Vladislav Shpilevoy, tarantool-patches
>> * this patch contains some legacy-driven decisions:
>> 1. SOURCEDIR path retrieved differentpy in case of
>
> 1. Typo: differentpy.
fixed
>> is copied with respect to Centos 7 and `ro` mode of
>> SOURCEDIR.
>>
>> Closes <shut git>112 125
>
> 2. Stray 'shut'.
ok
>
>>
>> diff --git a/rpm/prebuild.sh b/rpm/prebuild.sh
>> index 768b22b..554032b 100755
>> --- a/rpm/prebuild.sh
>> +++ b/rpm/prebuild.sh
>> @@ -1 +1,3 @@
>> curl -s https://packagecloud.io/install/repositories/tarantool/1_9/script.rpm.sh | sudo bash
>> +sudo yum -y install python-devel python-pip
>> +sudo pip install tarantool msgpack
>
> 3. Why do you need it? As I understand, the 'curl' above installs all
> the needed things. Besides, pip install downloads Tarantool 1.9, which
> is not necessary.
I cannot find those dependencies in the 'curl' above.
Without them, packpack on my laptop cannot pass the tests.
What about Tarantool 1.9? I cannot find that dependency in
python-tarantool.
>
>> diff --git a/test/lua_libs/git_util.lua b/test/lua_libs/git_util.lua
>> new file mode 100644
>> index 0000000..e2c17d0
>> --- /dev/null
>> +++ b/test/lua_libs/git_util.lua
>> @@ -0,0 +1,39 @@
>> +--
>> +-- Lua bridge for some of the git commands.
>> +--
>> +local os = require('os')
>> +
>> +local temp_file = 'some_strange_rare_unique_file_name_for_git_util'
>> +local function exec_cmd(options, cmd, args, files, dir, fout)
>
> 4. You can remove 'options' arg (it is always '' as I see).
Options are passed as a Lua table.
>
>> + files = files or ''
>> + options = options or ''
>> + args = args or ''
>
> 5. files, args, options are always non-nil.
> 6. Why do you need to declare shell_cmd in advance? Just do
>
> local shell_cmd = ...
Refactored
>
>> + if fout then
>
> 7. 'fout' is always nil.
Not always (e.g. in `log_hashes`).
>
>> + shell_cmd = shell_cmd .. ' >' .. fout
>> + end
>> + if dir then
>
> 8. 'dir' is always non-nil.
I would like to keep this check, because this small utility is not
created especially for this test.
Let it be a little more general than the current test case.
>> + shell_cmd = string.format('cd %s && %s', dir, shell_cmd)
>> + end
>> + local res = os.execute(shell_cmd)
>> + assert(res == 0, 'Git cmd error: ' .. res)
>> +end
>> +
>> +local function log_hashes(options, args, files, dir)
>> + args = args .. " --format='%h'"
>> + local local_temp_file = string.format('%s/%s', os.getenv('PWD'), temp_file)
>
> 9. Instead of writing output into a temporary file use
> http://pgl.yoyo.org/luai/i/io.popen.
Thanks for the advice, I was looking for something similar. However,
`popen` has a huge disadvantage: it does not perform a join (wait for
the command to finish). That makes me write even more workarounds than
writing the output to a temp file does.
>> + exec_cmd(options, 'log', args, files, dir, local_temp_file)
>> + local lines = {}
>> + for line in io.lines(local_temp_file) do
>> + table.insert(lines, line)
>> + end
>> + os.remove(local_temp_file)
>> + return lines
>> +end
>> +
>> +
>> +return {
>> + exec_cmd = exec_cmd,
>> + log_hashes = log_hashes
>> +}
>> diff --git a/test/reload_evolution/storage.result b/test/reload_evolution/storage.result
>> new file mode 100644
>> index 0000000..2cf21fd
>> --- /dev/null
>> +++ b/test/reload_evolution/storage.result
>> @@ -0,0 +1,184 @@
>> +test_run = require('test_run').new()
>>
>> +test_run:grep_log('storage_1_a', 'vshard.storage.reload_evolution: upgraded to') ~= nil
>> +---
>> +- true
>> +...
>> +vshard.storage.internal.reload_evolution_version
>> +---
>> +- 1
>> +...
>> +-- Make sure storage operates well.
>> +vshard.storage.bucket_force_drop(2)
>> +---
>> +- true
>> +...
>> +vshard.storage.bucket_force_create(2)
>
> 10. This test is too simple. Force_drop/create merely do DML on
> _bucket. Let's test the rebalancer by creating a dis-balance. For
> example, put +10 buckets on storage_1 and -10 buckets on storage_2,
> and wait for the balance.
Done.
I also changed the tests so that the tested storage is storage_2_a
(the rebalancer lives there).
>
>> diff --git a/test/reload_evolution/storage_1_a.lua b/test/reload_evolution/storage_1_a.lua
>> new file mode 100755
>> index 0000000..3e03f8f
>> --- /dev/null
>> +++ b/test/reload_evolution/storage_1_a.lua
>> @@ -0,0 +1,144 @@
>> +#!/usr/bin/env tarantool
>
> 11. Just use a symbolic link to the existing storage_1_a. Same
> for other storages.
>
>> diff --git a/test/reload_evolution/suite.ini b/test/reload_evolution/suite.ini
>> new file mode 100644
>> index 0000000..bb5435b
>> --- /dev/null
>> +++ b/test/reload_evolution/suite.ini
>
> 12. You do not need a new test suite for one storage test. Please,
> put it into test/storage/
11-12:
I have modified storage_1_a to change the Lua path.
It is important to change the path here, because the default vshard
initialization is performed here.
>
>> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
>> index bf560e6..1740c98 100644
>> --- a/vshard/storage/init.lua
>> +++ b/vshard/storage/init.lua
>> @@ -105,6 +110,11 @@ if not M then
>> -- a destination replicaset must drop already received
>> -- data.
>> rebalancer_sending_bucket = 0,
>> +
>> + ------------------------- Reload -------------------------
>> + -- Version of the loaded module. This number is used on
>> + -- reload to determine which upgrade scripts to run.
>> + reload_evolution_version = reload_evolution.version,
>
> 13. Use 'version' name.
But it is not a version. A version is a git tag.
We already have module_version and the git tag. To keep everything
clear, I would like to name the variable this way or similarly.
>
>> }
>> end
>> diff --git a/vshard/storage/reload_evolution.lua b/vshard/storage/reload_evolution.lua
>> new file mode 100644
>> index 0000000..cfac888
>> --- /dev/null
>> +++ b/vshard/storage/reload_evolution.lua
>> @@ -0,0 +1,58 @@
>> +--
>> +-- This module is used to upgrade the vshard.storage on the fly.
>> +-- It updates internal Lua structures in case they are changed
>> +-- in a commit.
>> +--
>> +local log = require('log')
>> +
>> +--
>> +-- Array of upgrade functions.
>> +-- magrations[version] = function which upgrades module version
>
> 14. Typo: magrations.
fixed
>
>> +-- from `version` to `version + 1`.
>
> 15. Not +1. I think we should use real version numbers:
> 0.1.1, 0.1.2, etc., similar to tarantool.
Disagree. That would force us to:
1. Tag a commit each time M is changed.
2. Maintain identical git tags and reload_evolution_version names.
With the introduced approach, all you need to do is append a callback
to the `migrations` array, for example (a sketch with a hypothetical
`some_new_field`):
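    -- A sketch; `some_new_field` is a hypothetical attribute
    -- added to M in some future commit.
    migrations[#migrations + 1] = function(M)
        if M.some_new_field == nil then
            M.some_new_field = {}
        end
    end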
One thing I worry about is hotfixes made outside of the main branch and
a reload after them on the main vshard branch. But this problem does
not seem solvable in the general case.
>
>> +--
>> +local migrations = {}
>> +
>> +-- Initialize reload_upgrade mechanism
>> +migrations[#migrations + 1] = function (M)
>> + -- Code to update Lua objects.
>> +end
>> +
>> +--
>> +-- Perform an update based on a version stored in `M` (internals).
>> +-- @param M Old module internals which should be updated.
>> +--
>> +local function upgrade(M)
>> + local start_version = M.reload_evolution_version or 1
>> + if start_version > #migrations then
>> + local err_msg = string.format(
>> + 'vshard.storage.reload_evolution: ' ..
>> + 'auto-downgrade is not implemented; ' ..
>> + 'loaded version is %d, upgrade script version is %d',
>> + start_version, #migrations
>
> 16. Did you test it? I do not see a test.
It is a very simple check, so I added it as a unit test.
>
>> + )
>> + log.error(err_msg)
>> + error(err_msg)
>> + end
>> + for i = start_version, #migrations do
>> + local ok, err = pcall(migrations[i], M)
>> + if ok then
>> + log.info('vshard.storage.reload_evolution: upgraded to %d version',
>> + i)
>> + else
>> + local err_msg = string.format(
>> + 'vshard.storage.reload_evolution: ' ..
>> + 'error during upgrade to %d version: %s', i, err
>> + )
>> + log.error(err_msg)
>> + error(err_msg)
>> + end
>> + -- Update the version just after upgrade to have an
>> + -- actual version in case of an error.
>> + M.reload_evolution_version = i
>> + end
>> +end
>> +
>> +return {
>> + version = #migrations,
>
> 17. Where do you use it?
It is the default value for storage.internal.M.reload_evolution_version.
Here is a new diff:
commit bce659db650b4a88f4285d90ec9d7726e036878f
Author: AKhatskevich <avkhatskevich@tarantool.org>
Date: Fri Jun 29 20:34:26 2018 +0300
Introduce storage reload evolution
Changes:
1. Introduce storage reload evolution.
2. Setup cross-version reload testing.
1:
This mechanism updates Lua objects on reload in case they are
changed in a new vshard.storage version.
Since this commit, any change in vshard.storage.M has to be
reflected in vshard.storage.reload_evolution to guarantee
correct reload.
2:
The testing uses git infrastructure and is performed in the following
way:
1. Copy old version of vshard to a temp folder.
2. Run vshard on this code.
3. Checkout the latest version of the vshard sources.
4. Reload vshard storage.
5. Make sure it works (Perform simple tests).
Notes:
* this patch contains some legacy-driven decisions:
1. SOURCEDIR path retrieved differently in case of
packpack build.
2. git directory in the `reload_evolution/storage` test
is copied with respect to Centos 7 and `ro` mode of
SOURCEDIR.
Closes #112 #125
diff --git a/.travis.yml b/.travis.yml
index 54bfe44..eff4a51 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,7 +41,7 @@ env:
script:
- git describe --long
- git clone https://github.com/packpack/packpack.git packpack
- - packpack/packpack
+ - packpack/packpack -e PACKPACK_GIT_SOURCEDIR=/source/
before_deploy:
- ls -l build/
diff --git a/rpm/prebuild.sh b/rpm/prebuild.sh
index 768b22b..554032b 100755
--- a/rpm/prebuild.sh
+++ b/rpm/prebuild.sh
@@ -1 +1,3 @@
curl -s https://packagecloud.io/install/repositories/tarantool/1_9/script.rpm.sh | sudo bash
+sudo yum -y install python-devel python-pip
+sudo pip install tarantool msgpack
diff --git a/test/lua_libs/git_util.lua b/test/lua_libs/git_util.lua
new file mode 100644
index 0000000..a75bb08
--- /dev/null
+++ b/test/lua_libs/git_util.lua
@@ -0,0 +1,51 @@
+--
+-- Lua bridge for some of the git commands.
+--
+local os = require('os')
+
+local temp_file = 'some_strange_rare_unique_file_name_for_git_util'
+
+--
+-- Exec a git command.
+-- @param params Table of parameters:
+-- * options - git options.
+-- * cmd - git command.
+-- * args - command arguments.
+-- * dir - working directory.
+-- * fout - write output to the file.
+local function exec_cmd(params)
+ local fout = params.fout
+ local shell_cmd = {'git'}
+ for _, param in pairs({'options', 'cmd', 'args'}) do
+ table.insert(shell_cmd, params[param])
+ end
+ if fout then
+ table.insert(shell_cmd, ' >' .. fout)
+ end
+ shell_cmd = table.concat(shell_cmd, ' ')
+ if params.dir then
+ shell_cmd = string.format('cd %s && %s', params.dir, shell_cmd)
+ end
+ local res = os.execute(shell_cmd)
+ assert(res == 0, 'Git cmd error: ' .. res)
+end
+
+local function log_hashes(params)
+ params.args = "--format='%h' " .. params.args
+ local local_temp_file = string.format('%s/%s', os.getenv('PWD'), temp_file)
+ params.fout = local_temp_file
+ params.cmd = 'log'
+ exec_cmd(params)
+ local lines = {}
+ for line in io.lines(local_temp_file) do
+ table.insert(lines, line)
+ end
+ os.remove(local_temp_file)
+ return lines
+end
+
+
+return {
+ exec_cmd = exec_cmd,
+ log_hashes = log_hashes
+}
diff --git a/test/lua_libs/util.lua b/test/lua_libs/util.lua
index 2d866df..e9bbd9a 100644
--- a/test/lua_libs/util.lua
+++ b/test/lua_libs/util.lua
@@ -1,5 +1,6 @@
local fiber = require('fiber')
local log = require('log')
+local fio = require('fio')
local function check_error(func, ...)
local pstatus, status, err = pcall(func, ...)
@@ -89,10 +90,29 @@ local function has_same_fields(etalon, data)
return true
end
+-- Git directory of the project. Used in evolution tests to
+-- fetch old versions of vshard.
+local SOURCEDIR = os.getenv('PACKPACK_GIT_SOURCEDIR')
+if not SOURCEDIR then
+ SOURCEDIR = os.getenv('SOURCEDIR')
+end
+if not SOURCEDIR then
+ local script_path = debug.getinfo(1).source:match("@?(.*/)")
+ script_path = fio.abspath(script_path)
+ SOURCEDIR = fio.abspath(script_path .. '/../../../')
+end
+
+local BUILDDIR = os.getenv('BUILDDIR')
+if not BUILDDIR then
+ BUILDDIR = SOURCEDIR
+end
+
return {
check_error = check_error,
shuffle_masters = shuffle_masters,
collect_timeouts = collect_timeouts,
wait_master = wait_master,
has_same_fields = has_same_fields,
+ SOURCEDIR = SOURCEDIR,
+ BUILDDIR = BUILDDIR,
}
diff --git a/test/reload_evolution/storage.result b/test/reload_evolution/storage.result
new file mode 100644
index 0000000..4ffbc26
--- /dev/null
+++ b/test/reload_evolution/storage.result
@@ -0,0 +1,248 @@
+test_run = require('test_run').new()
+---
+...
+git_util = require('git_util')
+---
+...
+util = require('util')
+---
+...
+vshard_copy_path = util.BUILDDIR .. '/test/var/vshard_git_tree_copy'
+---
+...
+evolution_log = git_util.log_hashes({args='vshard/storage/reload_evolution.lua', dir=util.SOURCEDIR})
+---
+...
+-- Cleanup the directory after a previous build.
+_ = os.execute('rm -rf ' .. vshard_copy_path)
+---
+...
+-- 1. `git worktree` cannot be used because PACKPACK mounts
+-- `/source/` in `ro` mode.
+-- 2. Just `cp -rf` cannot be used due to a little different
+-- behavior in Centos 7.
+_ = os.execute('mkdir ' .. vshard_copy_path)
+---
+...
+_ = os.execute("cd " .. util.SOURCEDIR .. ' && cp -rf `ls -A
--ignore=build` ' .. vshard_copy_path)
+---
+...
+-- Checkout the first commit with a reload_evolution mechanism.
+git_util.exec_cmd({cmd='checkout', args='-f', dir=vshard_copy_path})
+---
+...
+git_util.exec_cmd({cmd='checkout', args=evolution_log[#evolution_log] .. '~1', dir=vshard_copy_path})
+---
+...
+REPLICASET_1 = { 'storage_1_a', 'storage_1_b' }
+---
+...
+REPLICASET_2 = { 'storage_2_a', 'storage_2_b' }
+---
+...
+test_run:create_cluster(REPLICASET_1, 'reload_evolution')
+---
+...
+test_run:create_cluster(REPLICASET_2, 'reload_evolution')
+---
+...
+util = require('util')
+---
+...
+util.wait_master(test_run, REPLICASET_1, 'storage_1_a')
+---
+...
+util.wait_master(test_run, REPLICASET_2, 'storage_2_a')
+---
+...
+test_run:switch('storage_1_a')
+---
+- true
+...
+vshard.storage.bucket_force_create(1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
+---
+- true
+...
+bucket_id_to_move = vshard.consts.DEFAULT_BUCKET_COUNT
+---
+...
+test_run:switch('storage_2_a')
+---
+- true
+...
+fiber = require('fiber')
+---
+...
+vshard.storage.bucket_force_create(vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
+---
+- true
+...
+bucket_id_to_move = vshard.consts.DEFAULT_BUCKET_COUNT
+---
+...
+vshard.storage.internal.reload_evolution_version
+---
+- null
+...
+box.space.customer:insert({bucket_id_to_move, 1, 'customer_name'})
+---
+- [3000, 1, 'customer_name']
+...
+while test_run:grep_log('storage_2_a', 'The cluster is balanced ok') == nil do vshard.storage.rebalancer_wakeup() fiber.sleep(0.1) end
+---
+...
+test_run:switch('default')
+---
+- true
+...
+git_util.exec_cmd({cmd='checkout', args=evolution_log[1], dir=vshard_copy_path})
+---
+...
+test_run:switch('storage_2_a')
+---
+- true
+...
+package.loaded["vshard.storage"] = nil
+---
+...
+vshard.storage = require("vshard.storage")
+---
+...
+test_run:grep_log('storage_2_a', 'vshard.storage.reload_evolution: upgraded to') ~= nil
+---
+- true
+...
+vshard.storage.internal.reload_evolution_version
+---
+- 1
+...
+-- Make sure storage operates well.
+vshard.storage.bucket_force_drop(2000)
+---
+- true
+...
+vshard.storage.bucket_force_create(2000)
+---
+- true
+...
+vshard.storage.buckets_info()[2000]
+---
+- status: active
+ id: 2000
+...
+vshard.storage.call(bucket_id_to_move, 'read', 'customer_lookup', {1})
+---
+- true
+- null
+...
+vshard.storage.bucket_send(bucket_id_to_move, replicaset1_uuid)
+---
+- true
+...
+vshard.storage.garbage_collector_wakeup()
+---
+...
+fiber = require('fiber')
+---
+...
+while box.space._bucket:get({bucket_id_to_move}) do fiber.sleep(0.01) end
+---
+...
+test_run:switch('storage_1_a')
+---
+- true
+...
+vshard.storage.bucket_send(bucket_id_to_move, replicaset2_uuid)
+---
+- true
+...
+test_run:switch('storage_2_a')
+---
+- true
+...
+vshard.storage.call(bucket_id_to_move, 'read', 'customer_lookup', {1})
+---
+- true
+- null
+...
+-- Check info() does not fail.
+vshard.storage.info() ~= nil
+---
+- true
+...
+--
+-- Send buckets to create a disbalance. Wait until the rebalancer
+-- repairs it. Similar to `tests/rebalancer/rebalancer.test.lua`.
+--
+vshard.storage.rebalancer_disable()
+---
+...
+move_start = vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1
+---
+...
+move_cnt = 100
+---
+...
+assert(move_start + move_cnt < vshard.consts.DEFAULT_BUCKET_COUNT)
+---
+- true
+...
+for i = move_start, move_start + move_cnt - 1 do box.space._bucket:delete{i} end
+---
+...
+box.space._bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
+---
+- 1400
+...
+test_run:switch('storage_1_a')
+---
+- true
+...
+move_start = vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1
+---
+...
+move_cnt = 100
+---
+...
+vshard.storage.bucket_force_create(move_start, move_cnt)
+---
+- true
+...
+box.space._bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
+---
+- 1600
+...
+test_run:switch('storage_2_a')
+---
+- true
+...
+vshard.storage.rebalancer_enable()
+---
+...
+vshard.storage.rebalancer_wakeup()
+---
+...
+wait_rebalancer_state("Rebalance routes are sent", test_run)
+---
+...
+wait_rebalancer_state('The cluster is balanced ok', test_run)
+---
+...
+box.space._bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
+---
+- 1500
+...
+test_run:switch('default')
+---
+- true
+...
+test_run:drop_cluster(REPLICASET_2)
+---
+...
+test_run:drop_cluster(REPLICASET_1)
+---
+...
+test_run:cmd('clear filter')
+---
+- true
+...
diff --git a/test/reload_evolution/storage.test.lua b/test/reload_evolution/storage.test.lua
new file mode 100644
index 0000000..5deb136
--- /dev/null
+++ b/test/reload_evolution/storage.test.lua
@@ -0,0 +1,88 @@
+test_run = require('test_run').new()
+
+git_util = require('git_util')
+util = require('util')
+vshard_copy_path = util.BUILDDIR .. '/test/var/vshard_git_tree_copy'
+evolution_log = git_util.log_hashes({args='vshard/storage/reload_evolution.lua', dir=util.SOURCEDIR})
+-- Cleanup the directory after a previous build.
+_ = os.execute('rm -rf ' .. vshard_copy_path)
+-- 1. `git worktree` cannot be used because PACKPACK mounts
+-- `/source/` in `ro` mode.
+-- 2. Just `cp -rf` cannot be used due to a little different
+-- behavior in Centos 7.
+_ = os.execute('mkdir ' .. vshard_copy_path)
+_ = os.execute("cd " .. util.SOURCEDIR .. ' && cp -rf `ls -A
--ignore=build` ' .. vshard_copy_path)
+-- Checkout the first commit with a reload_evolution mechanism.
+git_util.exec_cmd({cmd='checkout', args='-f', dir=vshard_copy_path})
+git_util.exec_cmd({cmd='checkout', args=evolution_log[#evolution_log] .. '~1', dir=vshard_copy_path})
+
+REPLICASET_1 = { 'storage_1_a', 'storage_1_b' }
+REPLICASET_2 = { 'storage_2_a', 'storage_2_b' }
+test_run:create_cluster(REPLICASET_1, 'reload_evolution')
+test_run:create_cluster(REPLICASET_2, 'reload_evolution')
+util = require('util')
+util.wait_master(test_run, REPLICASET_1, 'storage_1_a')
+util.wait_master(test_run, REPLICASET_2, 'storage_2_a')
+
+test_run:switch('storage_1_a')
+vshard.storage.bucket_force_create(1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
+bucket_id_to_move = vshard.consts.DEFAULT_BUCKET_COUNT
+
+test_run:switch('storage_2_a')
+fiber = require('fiber')
+vshard.storage.bucket_force_create(vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1, vshard.consts.DEFAULT_BUCKET_COUNT / 2)
+bucket_id_to_move = vshard.consts.DEFAULT_BUCKET_COUNT
+vshard.storage.internal.reload_evolution_version
+box.space.customer:insert({bucket_id_to_move, 1, 'customer_name'})
+while test_run:grep_log('storage_2_a', 'The cluster is balanced ok') == nil do vshard.storage.rebalancer_wakeup() fiber.sleep(0.1) end
+
+test_run:switch('default')
+git_util.exec_cmd({cmd='checkout', args=evolution_log[1], dir=vshard_copy_path})
+
+test_run:switch('storage_2_a')
+package.loaded["vshard.storage"] = nil
+vshard.storage = require("vshard.storage")
+test_run:grep_log('storage_2_a', 'vshard.storage.reload_evolution: upgraded to') ~= nil
+vshard.storage.internal.reload_evolution_version
+-- Make sure storage operates well.
+vshard.storage.bucket_force_drop(2000)
+vshard.storage.bucket_force_create(2000)
+vshard.storage.buckets_info()[2000]
+vshard.storage.call(bucket_id_to_move, 'read', 'customer_lookup', {1})
+vshard.storage.bucket_send(bucket_id_to_move, replicaset1_uuid)
+vshard.storage.garbage_collector_wakeup()
+fiber = require('fiber')
+while box.space._bucket:get({bucket_id_to_move}) do fiber.sleep(0.01) end
+test_run:switch('storage_1_a')
+vshard.storage.bucket_send(bucket_id_to_move, replicaset2_uuid)
+test_run:switch('storage_2_a')
+vshard.storage.call(bucket_id_to_move, 'read', 'customer_lookup', {1})
+-- Check info() does not fail.
+vshard.storage.info() ~= nil
+
+--
+-- Send buckets to create a disbalance. Wait until the rebalancer
+-- repairs it. Similar to `tests/rebalancer/rebalancer.test.lua`.
+--
+vshard.storage.rebalancer_disable()
+move_start = vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1
+move_cnt = 100
+assert(move_start + move_cnt < vshard.consts.DEFAULT_BUCKET_COUNT)
+for i = move_start, move_start + move_cnt - 1 do box.space._bucket:delete{i} end
+box.space._bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
+test_run:switch('storage_1_a')
+move_start = vshard.consts.DEFAULT_BUCKET_COUNT / 2 + 1
+move_cnt = 100
+vshard.storage.bucket_force_create(move_start, move_cnt)
+box.space._bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
+test_run:switch('storage_2_a')
+vshard.storage.rebalancer_enable()
+vshard.storage.rebalancer_wakeup()
+wait_rebalancer_state("Rebalance routes are sent", test_run)
+wait_rebalancer_state('The cluster is balanced ok', test_run)
+box.space._bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
+
+test_run:switch('default')
+test_run:drop_cluster(REPLICASET_2)
+test_run:drop_cluster(REPLICASET_1)
+test_run:cmd('clear filter')
diff --git a/test/reload_evolution/storage_1_a.lua b/test/reload_evolution/storage_1_a.lua
new file mode 100755
index 0000000..d8bda60
--- /dev/null
+++ b/test/reload_evolution/storage_1_a.lua
@@ -0,0 +1,154 @@
+#!/usr/bin/env tarantool
+
+require('strict').on()
+
+
+local fio = require('fio')
+
+-- Get instance name
+local fio = require('fio')
+local log = require('log')
+local NAME = fio.basename(arg[0], '.lua')
+local fiber = require('fiber')
+local util = require('util')
+
+-- Run one storage on a different vshard version.
+-- To do that, place vshard src to
+-- BUILDDIR/test/var/vshard_git_tree_copy/.
+if NAME == 'storage_2_a' then
+ local script_path = debug.getinfo(1).source:match("@?(.*/)")
+ vshard_copy = util.BUILDDIR .. '/test/var/vshard_git_tree_copy'
+ package.path = string.format(
+ '%s/?.lua;%s/?/init.lua;%s',
+ vshard_copy, vshard_copy, package.path
+ )
+end
+
+-- Check if we are running under test-run
+if os.getenv('ADMIN') then
+ test_run = require('test_run').new()
+ require('console').listen(os.getenv('ADMIN'))
+end
+
+-- Call a configuration provider
+cfg = require('localcfg')
+-- Name to uuid map
+names = {
+ ['storage_1_a'] = '8a274925-a26d-47fc-9e1b-af88ce939412',
+ ['storage_1_b'] = '3de2e3e1-9ebe-4d0d-abb1-26d301b84633',
+ ['storage_2_a'] = '1e02ae8a-afc0-4e91-ba34-843a356b8ed7',
+ ['storage_2_b'] = '001688c3-66f8-4a31-8e19-036c17d489c2',
+}
+
+replicaset1_uuid = 'cbf06940-0790-498b-948d-042b62cf3d29'
+replicaset2_uuid = 'ac522f65-aa94-4134-9f64-51ee384f1a54'
+replicasets = {replicaset1_uuid, replicaset2_uuid}
+
+-- Start the database with sharding
+vshard = require('vshard')
+vshard.storage.cfg(cfg, names[NAME])
+
+box.once("testapp:schema:1", function()
+ local customer = box.schema.space.create('customer')
+ customer:format({
+ {'customer_id', 'unsigned'},
+ {'bucket_id', 'unsigned'},
+ {'name', 'string'},
+ })
+ customer:create_index('customer_id', {parts = {'customer_id'}})
+    customer:create_index('bucket_id', {parts = {'bucket_id'}, unique = false})
+
+ local account = box.schema.space.create('account')
+ account:format({
+ {'account_id', 'unsigned'},
+ {'customer_id', 'unsigned'},
+ {'bucket_id', 'unsigned'},
+ {'balance', 'unsigned'},
+ {'name', 'string'},
+ })
+ account:create_index('account_id', {parts = {'account_id'}})
+    account:create_index('customer_id', {parts = {'customer_id'}, unique = false})
+    account:create_index('bucket_id', {parts = {'bucket_id'}, unique = false})
+ box.snapshot()
+
+ box.schema.func.create('customer_lookup')
+    box.schema.role.grant('public', 'execute', 'function', 'customer_lookup')
+ box.schema.func.create('customer_add')
+ box.schema.role.grant('public', 'execute', 'function', 'customer_add')
+ box.schema.func.create('echo')
+ box.schema.role.grant('public', 'execute', 'function', 'echo')
+ box.schema.func.create('sleep')
+ box.schema.role.grant('public', 'execute', 'function', 'sleep')
+ box.schema.func.create('raise_luajit_error')
+    box.schema.role.grant('public', 'execute', 'function', 'raise_luajit_error')
+    box.schema.func.create('raise_client_error')
+    box.schema.role.grant('public', 'execute', 'function', 'raise_client_error')
+end)
+
+function customer_add(customer)
+ box.begin()
+ box.space.customer:insert({customer.customer_id, customer.bucket_id,
+ customer.name})
+ for _, account in ipairs(customer.accounts) do
+ box.space.account:insert({
+ account.account_id,
+ customer.customer_id,
+ customer.bucket_id,
+ 0,
+ account.name
+ })
+ end
+ box.commit()
+ return true
+end
+
+function customer_lookup(customer_id)
+ if type(customer_id) ~= 'number' then
+ error('Usage: customer_lookup(customer_id)')
+ end
+
+ local customer = box.space.customer:get(customer_id)
+ if customer == nil then
+ return nil
+ end
+ customer = {
+ customer_id = customer.customer_id;
+ name = customer.name;
+ }
+ local accounts = {}
+    for _, account in box.space.account.index.customer_id:pairs(customer_id) do
+ table.insert(accounts, {
+ account_id = account.account_id;
+ name = account.name;
+ balance = account.balance;
+ })
+ end
+ customer.accounts = accounts;
+ return customer
+end
+
+function echo(...)
+ return ...
+end
+
+function sleep(time)
+ fiber.sleep(time)
+ return true
+end
+
+function raise_luajit_error()
+ assert(1 == 2)
+end
+
+function raise_client_error()
+ box.error(box.error.UNKNOWN)
+end
+
+function wait_rebalancer_state(state, test_run)
+ log.info(string.rep('a', 1000))
+ vshard.storage.rebalancer_wakeup()
+ while not test_run:grep_log(NAME, state, 1000) do
+ fiber.sleep(0.1)
+ vshard.storage.rebalancer_wakeup()
+ end
+end
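A note on the helper above: the log.info(string.rep('a', 1000)) call pads the
log, seemingly to push any stale occurrence of `state` out of the 1000-byte
window that grep_log scans, so only a fresh message satisfies the wait.
Typical usage, as in the test above:

    -- Block until the rebalancer on this node reports the state.
    wait_rebalancer_state('The cluster is balanced ok', test_run)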
diff --git a/test/reload_evolution/storage_1_b.lua b/test/reload_evolution/storage_1_b.lua
new file mode 120000
index 0000000..02572da
--- /dev/null
+++ b/test/reload_evolution/storage_1_b.lua
@@ -0,0 +1 @@
+storage_1_a.lua
\ No newline at end of file
diff --git a/test/reload_evolution/storage_2_a.lua b/test/reload_evolution/storage_2_a.lua
new file mode 120000
index 0000000..02572da
--- /dev/null
+++ b/test/reload_evolution/storage_2_a.lua
@@ -0,0 +1 @@
+storage_1_a.lua
\ No newline at end of file
diff --git a/test/reload_evolution/storage_2_b.lua b/test/reload_evolution/storage_2_b.lua
new file mode 120000
index 0000000..02572da
--- /dev/null
+++ b/test/reload_evolution/storage_2_b.lua
@@ -0,0 +1 @@
+storage_1_a.lua
\ No newline at end of file
diff --git a/test/reload_evolution/suite.ini b/test/reload_evolution/suite.ini
new file mode 100644
index 0000000..bb5435b
--- /dev/null
+++ b/test/reload_evolution/suite.ini
@@ -0,0 +1,6 @@
+[default]
+core = tarantool
+description = Reload evolution tests
+script = test.lua
+is_parallel = False
+lua_libs = ../lua_libs/util.lua ../lua_libs/git_util.lua ../../example/localcfg.lua
diff --git a/test/reload_evolution/test.lua b/test/reload_evolution/test.lua
new file mode 100644
index 0000000..ad0543a
--- /dev/null
+++ b/test/reload_evolution/test.lua
@@ -0,0 +1,9 @@
+#!/usr/bin/env tarantool
+
+require('strict').on()
+
+box.cfg{
+ listen = os.getenv("LISTEN"),
+}
+
+require('console').listen(os.getenv('ADMIN'))
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 07bd00c..3bec09f 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -10,6 +10,7 @@ if rawget(_G, MODULE_INTERNALS) then
local vshard_modules = {
'vshard.consts', 'vshard.error', 'vshard.cfg',
'vshard.replicaset', 'vshard.util',
+ 'vshard.storage.reload_evolution'
}
for _, module in pairs(vshard_modules) do
package.loaded[module] = nil
@@ -20,12 +21,16 @@ local lerror = require('vshard.error')
local lcfg = require('vshard.cfg')
local lreplicaset = require('vshard.replicaset')
local util = require('vshard.util')
+local reload_evolution = require('vshard.storage.reload_evolution')
local M = rawget(_G, MODULE_INTERNALS)
if not M then
--
-- The module is loaded for the first time.
--
+ -- !!!WARNING: any change of this table must be reflected in
+ -- `vshard.storage.reload_evolution` module to guarantee
+ -- reloadability of the module.
M = {
---------------- Common module attributes ----------------
-- The last passed configuration.
@@ -105,6 +110,11 @@ if not M then
-- a destination replicaset must drop already received
-- data.
rebalancer_sending_bucket = 0,
+
+ ------------------------- Reload -------------------------
+ -- Version of the loaded module. This number is used on
+ -- reload to determine which upgrade scripts to run.
+ reload_evolution_version = reload_evolution.version,
}
end
@@ -1863,6 +1873,7 @@ end
if not rawget(_G, MODULE_INTERNALS) then
rawset(_G, MODULE_INTERNALS, M)
else
+ reload_evolution.upgrade(M)
storage_cfg(M.current_cfg, M.this_replica.uuid)
M.module_version = M.module_version + 1
end
diff --git a/vshard/storage/reload_evolution.lua b/vshard/storage/reload_evolution.lua
new file mode 100644
index 0000000..f25ad49
--- /dev/null
+++ b/vshard/storage/reload_evolution.lua
@@ -0,0 +1,58 @@
+--
+-- This module is used to upgrade the vshard.storage on the fly.
+-- It updates internal Lua structures in case they are changed
+-- in a commit.
+--
+local log = require('log')
+
+--
+-- Array of upgrade functions.
+-- migrations[version] = function which upgrades module version
+-- from `version` to `version + 1`.
+--
+local migrations = {}
+
+-- Initialize reload_upgrade mechanism
+migrations[#migrations + 1] = function (M)
+ -- Code to update Lua objects.
+end
+
+--
+-- Perform an update based on a version stored in `M` (internals).
+-- @param M Old module internals which should be updated.
+--
+local function upgrade(M)
+ local start_version = M.reload_evolution_version or 1
+ if start_version > #migrations then
+ local err_msg = string.format(
+ 'vshard.storage.reload_evolution: ' ..
+ 'auto-downgrade is not implemented; ' ..
+ 'loaded version is %d, upgrade script version is %d',
+ start_version, #migrations
+ )
+ log.error(err_msg)
+ error(err_msg)
+ end
+ for i = start_version, #migrations do
+ local ok, err = pcall(migrations[i], M)
+ if ok then
+            log.info('vshard.storage.reload_evolution: upgraded to %d version',
+                     i)
+ else
+ local err_msg = string.format(
+ 'vshard.storage.reload_evolution: ' ..
+ 'error during upgrade to %d version: %s', i, err
+ )
+ log.error(err_msg)
+ error(err_msg)
+ end
+ -- Update the version just after upgrade to have an
+ -- actual version in case of an error.
+ M.reload_evolution_version = i
+ end
+end
+
+return {
+ version = #migrations,
+ upgrade = upgrade,
+}
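A later change to the M table would be accompanied by one more callback
appended to `migrations`. A hypothetical sketch (the `bucket_gc_count` field
is invented purely for illustration):

    -- Suppose a future commit adds a counter M.bucket_gc_count to
    -- the storage internals; the matching migration initializes
    -- it on reload:
    migrations[#migrations + 1] = function(M)
        if M.bucket_gc_count == nil then
            M.bucket_gc_count = 0
        end
    end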
* [tarantool-patches] Re: [PATCH 2/3] Complete module reload
2018-07-19 20:32 ` Alex Khatskevich
@ 2018-07-20 11:34 ` Alex Khatskevich
2018-07-20 14:15 ` Vladislav Shpilevoy
0 siblings, 1 reply; 16+ messages in thread
From: Alex Khatskevich @ 2018-07-20 11:34 UTC (permalink / raw)
To: Vladislav Shpilevoy, tarantool-patches
Here is a full diff:
commit cd3740a45322458cde10e67f8018bc4787f443aa
Author: AKhatskevich <avkhatskevich@tarantool.org>
Date: Sat Jun 9 17:23:40 2018 +0300
Complete module reload
In case one needs to upgrade vshard to a new version, this commit
improves the reload mechanism to make that possible for a wider
variety of changes (between two versions).
Changes:
* introduce cfg option `connection_outdate_delay`
* improve reload mechanism
* add `util.async_task` method, which runs a function after a
delay
* delete replicaset:rebind_connections method as it is replaced
with `rebind_replicasets` which updates all replicasets at once
Reload mechanism:
* reload all vshard modules
* create new `replicaset` and `replica` objects
* reuse old netbox connections in new replica objects if
possible
* update router/storage.internal table
* after a `connection_outdate_delay` disable old instances of
`replicaset` and `replica` objects
Reload works for modules:
* vshard.router
* vshard.storage
Here is a module reload algorithm:
* old vshard is working
* delete old vshard src
* install new vshard
* call: package.loaded['vshard.router'] = nil
* call: old_router = vshard.router -- Save working router copy.
* call: vshard.router = require('vshard.router')
* if require fails: continue using old_router
* if require succeeds: use vshard.router
In case reload process fails, old router/storage module, replicaset and
replica objects continue working properly. If reload succeeds, all old
objects would be deprecated.
Extra changes:
* introduce MODULE_INTERNALS which stores name of the module
internal data in the global namespace
Part of #112
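The algorithm above maps almost one-to-one onto Lua; a minimal sketch,
assuming the new vshard sources are already installed on package.path:

    local old_router = vshard.router -- Save working router copy.
    package.loaded['vshard.router'] = nil
    local ok, new_router = pcall(require, 'vshard.router')
    if ok then
        vshard.router = new_router
    else
        -- Require failed: continue using the old module.
        vshard.router = old_router
    end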
diff --git a/test/router/reload.result b/test/router/reload.result
index 47f3c2e..71f82b2 100644
--- a/test/router/reload.result
+++ b/test/router/reload.result
@@ -174,6 +174,132 @@ vshard.router.module_version()
check_reloaded()
---
...
+--
+-- Outdate old replicaset and replica objets.
+--
+rs = vshard.router.route(1)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+package.loaded["vshard.router"] = nil
+---
+...
+_ = require('vshard.router')
+---
+...
+-- Make sure outdate async task has had cpu time.
+while not rs.outdated do fiber.sleep(0.001) end
+---
+...
+rs.callro(rs, 'echo', {'some_data'})
+---
+- null
+- type: ShardingError
+ name: OBJECT_IS_OUTDATED
+  message: Object is outdated after module reload/reconfigure. Use new instance.
+ code: 20
+...
+vshard.router = require('vshard.router')
+---
+...
+rs = vshard.router.route(1)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+-- Test `connection_outdate_delay`.
+old_connection_delay = cfg.connection_outdate_delay
+---
+...
+cfg.connection_outdate_delay = 0.3
+---
+...
+vshard.router.cfg(cfg)
+---
+...
+cfg.connection_outdate_delay = old_connection_delay
+---
+...
+vshard.router.internal.connection_outdate_delay = nil
+---
+...
+rs_new = vshard.router.route(1)
+---
+...
+rs_old = rs
+---
+...
+_, replica_old = next(rs_old.replicas)
+---
+...
+rs_new:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+-- Check old objets are still valid.
+rs_old:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+replica_old.conn ~= nil
+---
+- true
+...
+fiber.sleep(0.2)
+---
+...
+rs_old:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+replica_old.conn ~= nil
+---
+- true
+...
+replica_old.outdated == nil
+---
+- true
+...
+fiber.sleep(0.2)
+---
+...
+rs_old:callro('echo', {'some_data'})
+---
+- null
+- type: ShardingError
+ name: OBJECT_IS_OUTDATED
+  message: Object is outdated after module reload/reconfigure. Use new instance.
+ code: 20
+...
+replica_old.conn == nil
+---
+- true
+...
+replica_old.outdated == true
+---
+- true
+...
+rs_new:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
test_run:switch('default')
---
- true
diff --git a/test/router/reload.test.lua b/test/router/reload.test.lua
index af2939d..d77ebe7 100644
--- a/test/router/reload.test.lua
+++ b/test/router/reload.test.lua
@@ -86,6 +86,42 @@ _ = require('vshard.router')
vshard.router.module_version()
check_reloaded()
+--
+-- Outdate old replicaset and replica objets.
+--
+rs = vshard.router.route(1)
+rs:callro('echo', {'some_data'})
+package.loaded["vshard.router"] = nil
+_ = require('vshard.router')
+-- Make sure outdate async task has had cpu time.
+while not rs.outdated do fiber.sleep(0.001) end
+rs.callro(rs, 'echo', {'some_data'})
+vshard.router = require('vshard.router')
+rs = vshard.router.route(1)
+rs:callro('echo', {'some_data'})
+-- Test `connection_outdate_delay`.
+old_connection_delay = cfg.connection_outdate_delay
+cfg.connection_outdate_delay = 0.3
+vshard.router.cfg(cfg)
+cfg.connection_outdate_delay = old_connection_delay
+vshard.router.internal.connection_outdate_delay = nil
+rs_new = vshard.router.route(1)
+rs_old = rs
+_, replica_old = next(rs_old.replicas)
+rs_new:callro('echo', {'some_data'})
+-- Check old objets are still valid.
+rs_old:callro('echo', {'some_data'})
+replica_old.conn ~= nil
+fiber.sleep(0.2)
+rs_old:callro('echo', {'some_data'})
+replica_old.conn ~= nil
+replica_old.outdated == nil
+fiber.sleep(0.2)
+rs_old:callro('echo', {'some_data'})
+replica_old.conn == nil
+replica_old.outdated == true
+rs_new:callro('echo', {'some_data'})
+
test_run:switch('default')
test_run:cmd('stop server router_1')
test_run:cmd('cleanup server router_1')
diff --git a/test/router/router.result b/test/router/router.result
index 4919962..45394e1 100644
--- a/test/router/router.result
+++ b/test/router/router.result
@@ -1024,11 +1024,10 @@ error_messages
 - - Use replicaset:callro(...) instead of replicaset.callro(...)
   - Use replicaset:connect_master(...) instead of replicaset.connect_master(...)
   - Use replicaset:connect_replica(...) instead of replicaset.connect_replica(...)
-  - Use replicaset:rebind_connections(...) instead of replicaset.rebind_connections(...)
   - Use replicaset:down_replica_priority(...) instead of replicaset.down_replica_priority(...)
   - Use replicaset:call(...) instead of replicaset.call(...)
-  - Use replicaset:up_replica_priority(...) instead of replicaset.up_replica_priority(...)
   - Use replicaset:connect(...) instead of replicaset.connect(...)
+  - Use replicaset:up_replica_priority(...) instead of replicaset.up_replica_priority(...)
   - Use replicaset:callrw(...) instead of replicaset.callrw(...)
   - Use replicaset:connect_all(...) instead of replicaset.connect_all(...)
...
diff --git a/test/storage/reload.result b/test/storage/reload.result
index 531d984..c890a9f 100644
--- a/test/storage/reload.result
+++ b/test/storage/reload.result
@@ -174,6 +174,35 @@ vshard.storage.module_version()
check_reloaded()
---
...
+--
+-- Outdate old replicaset and replica objets.
+--
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+package.loaded["vshard.storage"] = nil
+---
+...
+_ = require('vshard.storage')
+---
+...
+rs.callro(rs, 'echo', {'some_data'})
+---
+- null
+- type: ShardingError
+ name: OBJECT_IS_OUTDATED
+  message: Object is outdated after module reload/reconfigure. Use new instance.
+ code: 20
+...
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+rs.callro(rs, 'echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
test_run:switch('default')
---
- true
diff --git a/test/storage/reload.test.lua b/test/storage/reload.test.lua
index 64c3a60..b351813 100644
--- a/test/storage/reload.test.lua
+++ b/test/storage/reload.test.lua
@@ -87,6 +87,16 @@ _ = require('vshard.storage')
vshard.storage.module_version()
check_reloaded()
+--
+-- Outdate old replicaset and replica objets.
+--
+_, rs = next(vshard.storage.internal.replicasets)
+package.loaded["vshard.storage"] = nil
+_ = require('vshard.storage')
+rs.callro(rs, 'echo', {'some_data'})
+_, rs = next(vshard.storage.internal.replicasets)
+rs.callro(rs, 'echo', {'some_data'})
+
test_run:switch('default')
test_run:drop_cluster(REPLICASET_2)
test_run:drop_cluster(REPLICASET_1)
diff --git a/vshard/cfg.lua b/vshard/cfg.lua
index d5429af..bba12cc 100644
--- a/vshard/cfg.lua
+++ b/vshard/cfg.lua
@@ -217,6 +217,10 @@ local cfg_template = {
type = 'non-negative number', name = 'Sync timeout',
is_optional = true,
default = consts.DEFAULT_SYNC_TIMEOUT
}},
+ {'connection_outdate_delay', {
+ type = 'non-negative number', name = 'Object outdate timeout',
+ is_optional = true
+ }},
}
--
@@ -264,6 +268,7 @@ local function remove_non_box_options(cfg)
cfg.collect_bucket_garbage_interval = nil
cfg.collect_lua_garbage = nil
cfg.sync_timeout = nil
+ cfg.connection_outdate_delay = nil
end
return {
diff --git a/vshard/error.lua b/vshard/error.lua
index cf2f9d2..f79107b 100644
--- a/vshard/error.lua
+++ b/vshard/error.lua
@@ -100,6 +100,11 @@ local error_message_template = {
[19] = {
name = 'REPLICASET_IS_LOCKED',
msg = 'Replicaset is locked'
+ },
+ [20] = {
+ name = 'OBJECT_IS_OUTDATED',
+ msg = 'Object is outdated after module reload/reconfigure. ' ..
+ 'Use new instance.'
}
}
diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
index 99f59aa..f6e971b 100644
--- a/vshard/replicaset.lua
+++ b/vshard/replicaset.lua
@@ -21,6 +21,7 @@
-- requests to the replica>,
-- net_sequential_fail = <count of sequential failed
-- requests to the replica>,
+-- outdated = nil/true,
-- }
-- },
-- master = <master server from the array above>,
@@ -34,6 +35,7 @@
-- etalon_bucket_count = <bucket count, that must be stored
-- on this replicaset to reach the
-- balance in a cluster>,
+-- outdated = nil/true,
-- }
--
-- replicasets = {
@@ -48,7 +50,8 @@ local lerror = require('vshard.error')
local fiber = require('fiber')
local luri = require('uri')
local ffi = require('ffi')
-local gsc = require('vshard.util').generate_self_checker
+local util = require('vshard.util')
+local gsc = util.generate_self_checker
--
-- on_connect() trigger for net.box
@@ -337,27 +340,39 @@ local function replicaset_tostring(replicaset)
master)
end
+local outdate_replicasets
--
--- Rebind connections of old replicas to new ones.
+-- Copy netbox connections from old replica objects to new ones
+-- and outdate old objects.
+-- @param replicasets New replicasets
+-- @param old_replicasets Replicasets and replicas to be outdated.
+-- @param outdate_delay Number of seconds; delay to outdate
+-- old objects.
--
-local function replicaset_rebind_connections(replicaset)
- for _, replica in pairs(replicaset.replicas) do
- local old_replica = replica.old_replica
- if old_replica then
- local conn = old_replica.conn
- replica.conn = conn
- replica.down_ts = old_replica.down_ts
- replica.net_timeout = old_replica.net_timeout
- replica.net_sequential_ok = old_replica.net_sequential_ok
- replica.net_sequential_fail = old_replica.net_sequential_fail
- if conn then
- conn.replica = replica
- conn.replicaset = replicaset
- old_replica.conn = nil
+local function rebind_replicasets(replicasets, old_replicasets, outdate_delay)
+ for replicaset_uuid, replicaset in pairs(replicasets) do
+ local old_replicaset = old_replicasets and
+ old_replicasets[replicaset_uuid]
+ for replica_uuid, replica in pairs(replicaset.replicas) do
+ local old_replica = old_replicaset and
+ old_replicaset.replicas[replica_uuid]
+ if old_replica then
+ local conn = old_replica.conn
+ replica.conn = conn
+ replica.down_ts = old_replica.down_ts
+ replica.net_timeout = old_replica.net_timeout
+ replica.net_sequential_ok = old_replica.net_sequential_ok
+                replica.net_sequential_fail = old_replica.net_sequential_fail
+ if conn then
+ conn.replica = replica
+ conn.replicaset = replicaset
+ end
end
- replica.old_replica = nil
end
end
+ if old_replicasets then
+        util.async_task(outdate_delay, outdate_replicasets, old_replicasets)
+ end
end
--
@@ -369,7 +384,6 @@ local replicaset_mt = {
connect_master = replicaset_connect_master;
connect_all = replicaset_connect_all;
connect_replica = replicaset_connect_to_replica;
- rebind_connections = replicaset_rebind_connections;
down_replica_priority = replicaset_down_replica_priority;
up_replica_priority = replicaset_up_replica_priority;
call = replicaset_master_call;
@@ -412,6 +426,49 @@ for name, func in pairs(replica_mt.__index) do
end
replica_mt.__index = index
+--
+-- Meta-methods of outdated objects.
+-- They define only attributes from corresponding metatables to
+-- make user able to access fields of old objects.
+--
+local function outdated_warning(...)
+ return nil, lerror.vshard(lerror.code.OBJECT_IS_OUTDATED)
+end
+
+local outdated_replicaset_mt = {
+ __index = {
+ outdated = true
+ }
+}
+for fname, func in pairs(replicaset_mt.__index) do
+ outdated_replicaset_mt.__index[fname] = outdated_warning
+end
+
+local outdated_replica_mt = {
+ __index = {
+ outdated = true
+ }
+}
+for fname, func in pairs(replica_mt.__index) do
+ outdated_replica_mt.__index[fname] = outdated_warning
+end
+
+--
+-- Outdate replicaset and replica objects:
+-- * Set outdated_metatables.
+-- * Remove connections.
+--
+outdate_replicasets = function(replicasets)
+ for _, replicaset in pairs(replicasets) do
+ setmetatable(replicaset, outdated_replicaset_mt)
+ for _, replica in pairs(replicaset.replicas) do
+ setmetatable(replica, outdated_replica_mt)
+ replica.conn = nil
+ end
+ end
+ log.info('Old replicaset and replica objects are outdated.')
+end
+
--
-- Calculate for each replicaset its etalon bucket count.
-- Iterative algorithm is used to learn the best balance in a
@@ -503,7 +560,7 @@ end
--
-- Update/build replicasets from configuration
--
-local function buildall(sharding_cfg, old_replicasets)
+local function buildall(sharding_cfg)
local new_replicasets = {}
local weights = sharding_cfg.weights
local zone = sharding_cfg.zone
@@ -515,8 +572,6 @@ local function buildall(sharding_cfg, old_replicasets)
end
local curr_ts = fiber.time()
for replicaset_uuid, replicaset in pairs(sharding_cfg.sharding) do
- local old_replicaset = old_replicasets and
- old_replicasets[replicaset_uuid]
local new_replicaset = setmetatable({
replicas = {},
uuid = replicaset_uuid,
@@ -526,8 +581,6 @@ local function buildall(sharding_cfg, old_replicasets)
}, replicaset_mt)
local priority_list = {}
for replica_uuid, replica in pairs(replicaset.replicas) do
- local old_replica = old_replicaset and
- old_replicaset.replicas[replica_uuid]
-- The old replica is saved in the new object to
-- rebind its connection at the end of a
-- router/storage reconfiguration.
@@ -535,7 +588,7 @@ local function buildall(sharding_cfg, old_replicasets)
             uri = replica.uri, name = replica.name, uuid = replica_uuid,
             zone = replica.zone, net_timeout = consts.CALL_TIMEOUT_MIN,
net_sequential_ok = 0, net_sequential_fail = 0,
- down_ts = curr_ts, old_replica = old_replica,
+ down_ts = curr_ts,
}, replica_mt)
new_replicaset.replicas[replica_uuid] = new_replica
if replica.master then
@@ -596,4 +649,5 @@ return {
buildall = buildall,
calculate_etalon_balance = cluster_calculate_etalon_balance,
wait_masters_connect = wait_masters_connect,
+ rebind_replicasets = rebind_replicasets,
}
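The observable effect of outdate_replicasets(), as exercised by the tests
above: a saved old object keeps its own data fields readable, but any method
call on it returns an error instead of doing work. A sketch:

    -- `old_rs` is a replicaset object saved before the reload.
    local res, err = old_rs:callro('echo', {'some_data'})
    -- res == nil, err.name == 'OBJECT_IS_OUTDATED', and
    -- old_rs.outdated == true via the replaced metatable.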
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index a143070..142ddb6 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -1,5 +1,17 @@
local log = require('log')
local lfiber = require('fiber')
+
+local MODULE_INTERNALS = '__module_vshard_router'
+-- Reload requirements, in case this module is reloaded manually.
+if rawget(_G, MODULE_INTERNALS) then
+ local vshard_modules = {
+ 'vshard.consts', 'vshard.error', 'vshard.cfg',
+ 'vshard.hash', 'vshard.replicaset', 'vshard.util',
+ }
+ for _, module in pairs(vshard_modules) do
+ package.loaded[module] = nil
+ end
+end
local consts = require('vshard.consts')
local lerror = require('vshard.error')
local lcfg = require('vshard.cfg')
@@ -7,15 +19,20 @@ local lhash = require('vshard.hash')
local lreplicaset = require('vshard.replicaset')
local util = require('vshard.util')
-local M = rawget(_G, '__module_vshard_router')
+local M = rawget(_G, MODULE_INTERNALS)
if not M then
M = {
+ ---------------- Common module attributes ----------------
+ -- The last passed configuration.
+ current_cfg = nil,
errinj = {
ERRINJ_CFG = false,
ERRINJ_FAILOVER_CHANGE_CFG = false,
ERRINJ_RELOAD = false,
ERRINJ_LONG_DISCOVERY = false,
},
+ -- Time to outdate old objects on reload.
+ connection_outdate_delay = nil,
-- Bucket map cache.
route_map = {},
-- All known replicasets used for bucket re-balancing
@@ -479,12 +496,13 @@ local function router_cfg(cfg)
else
log.info('Starting router reconfiguration')
end
- local new_replicasets = lreplicaset.buildall(cfg, M.replicasets)
+ local new_replicasets = lreplicaset.buildall(cfg)
local total_bucket_count = cfg.bucket_count
local collect_lua_garbage = cfg.collect_lua_garbage
- lcfg.remove_non_box_options(cfg)
+ local box_cfg = table.copy(cfg)
+ lcfg.remove_non_box_options(box_cfg)
log.info("Calling box.cfg()...")
- for k, v in pairs(cfg) do
+ for k, v in pairs(box_cfg) do
log.info({[k] = v})
end
-- It is considered that all possible errors during cfg
@@ -493,18 +511,18 @@ local function router_cfg(cfg)
if M.errinj.ERRINJ_CFG then
error('Error injection: cfg')
end
- box.cfg(cfg)
+ box.cfg(box_cfg)
log.info("Box has been configured")
+ M.connection_outdate_delay = cfg.connection_outdate_delay
M.total_bucket_count = total_bucket_count
M.collect_lua_garbage = collect_lua_garbage
- M.replicasets = new_replicasets
M.current_cfg = new_cfg
-- Move connections from an old configuration to a new one.
-- It must be done with no yields to prevent usage both of not
-- fully moved old replicasets, and not fully built new ones.
- for _, replicaset in pairs(new_replicasets) do
- replicaset:rebind_connections()
- end
+ lreplicaset.rebind_replicasets(new_replicasets, M.replicasets,
+ M.connection_outdate_delay)
+ M.replicasets = new_replicasets
-- Now the new replicasets are fully built. Can establish
-- connections and yield.
for _, replicaset in pairs(new_replicasets) do
@@ -793,15 +811,16 @@ end
-- About functions, saved in M, and reloading see comment in
-- storage/init.lua.
--
-M.discovery_f = discovery_f
-M.failover_f = failover_f
-
-if not rawget(_G, '__module_vshard_router') then
- rawset(_G, '__module_vshard_router', M)
+if not rawget(_G, MODULE_INTERNALS) then
+ rawset(_G, MODULE_INTERNALS, M)
else
+ router_cfg(M.current_cfg)
M.module_version = M.module_version + 1
end
+M.discovery_f = discovery_f
+M.failover_f = failover_f
+
return {
cfg = router_cfg;
info = router_info;
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 052e94f..07bd00c 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -2,20 +2,34 @@ local log = require('log')
local luri = require('uri')
local lfiber = require('fiber')
local netbox = require('net.box') -- for net.box:self()
+local trigger = require('internal.trigger')
+
+local MODULE_INTERNALS = '__module_vshard_storage'
+-- Reload requirements, in case this module is reloaded manually.
+if rawget(_G, MODULE_INTERNALS) then
+ local vshard_modules = {
+ 'vshard.consts', 'vshard.error', 'vshard.cfg',
+ 'vshard.replicaset', 'vshard.util',
+ }
+ for _, module in pairs(vshard_modules) do
+ package.loaded[module] = nil
+ end
+end
local consts = require('vshard.consts')
local lerror = require('vshard.error')
-local util = require('vshard.util')
local lcfg = require('vshard.cfg')
local lreplicaset = require('vshard.replicaset')
-local trigger = require('internal.trigger')
+local util = require('vshard.util')
-local M = rawget(_G, '__module_vshard_storage')
+local M = rawget(_G, MODULE_INTERNALS)
if not M then
--
-- The module is loaded for the first time.
--
M = {
---------------- Common module attributes ----------------
+ -- The last passed configuration.
+ current_cfg = nil,
--
-- All known replicasets used for bucket re-balancing.
-- See format in replicaset.lua.
@@ -1497,7 +1511,7 @@ local function storage_cfg(cfg, this_replica_uuid)
local this_replicaset
local this_replica
- local new_replicasets = lreplicaset.buildall(cfg, M.replicasets)
+ local new_replicasets = lreplicaset.buildall(cfg)
local min_master
for rs_uuid, rs in pairs(new_replicasets) do
for replica_uuid, replica in pairs(rs.replicas) do
@@ -1576,7 +1590,6 @@ local function storage_cfg(cfg, this_replica_uuid)
--
local old_sync_timeout = M.sync_timeout
M.sync_timeout = cfg.sync_timeout
- lcfg.remove_non_box_options(cfg)
if was_master and not is_master then
local_on_master_disable_prepare()
@@ -1585,7 +1598,9 @@ local function storage_cfg(cfg, this_replica_uuid)
local_on_master_enable_prepare()
end
- local ok, err = pcall(box.cfg, cfg)
+ local box_cfg = table.copy(cfg)
+ lcfg.remove_non_box_options(box_cfg)
+ local ok, err = pcall(box.cfg, box_cfg)
while M.errinj.ERRINJ_CFG_DELAY do
lfiber.sleep(0.01)
end
@@ -1604,10 +1619,8 @@ local function storage_cfg(cfg, this_replica_uuid)
local uri = luri.parse(this_replica.uri)
     box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password)
+ lreplicaset.rebind_replicasets(new_replicasets, M.replicasets)
M.replicasets = new_replicasets
- for _, replicaset in pairs(new_replicasets) do
- replicaset:rebind_connections()
- end
M.this_replicaset = this_replicaset
M.this_replica = this_replica
M.total_bucket_count = total_bucket_count
@@ -1846,6 +1859,14 @@ end
-- restarted (or is restarted from M.background_f, which is not
-- changed) and continues use old func1 and func2.
--
+
+if not rawget(_G, MODULE_INTERNALS) then
+ rawset(_G, MODULE_INTERNALS, M)
+else
+ storage_cfg(M.current_cfg, M.this_replica.uuid)
+ M.module_version = M.module_version + 1
+end
+
M.recovery_f = recovery_f
M.collect_garbage_f = collect_garbage_f
M.rebalancer_f = rebalancer_f
@@ -1861,12 +1882,6 @@ M.rebalancer_build_routes = rebalancer_build_routes
M.rebalancer_calculate_metrics = rebalancer_calculate_metrics
M.cached_find_sharded_spaces = find_sharded_spaces
-if not rawget(_G, '__module_vshard_storage') then
- rawset(_G, '__module_vshard_storage', M)
-else
- M.module_version = M.module_version + 1
-end
-
return {
sync = sync,
bucket_force_create = bucket_force_create,
diff --git a/vshard/util.lua b/vshard/util.lua
index ce79930..fb875ce 100644
--- a/vshard/util.lua
+++ b/vshard/util.lua
@@ -100,9 +100,29 @@ end
-- Update latest versions of function
M.reloadable_fiber_f = reloadable_fiber_f
+local function sync_task(delay, task, ...)
+ if delay then
+ fiber.sleep(delay)
+ end
+ task(...)
+end
+
+--
+-- Run a function without interrupting current fiber.
+-- @param delay Delay in seconds before the task should be
+-- executed.
+-- @param task Function to be executed.
+-- @param ... Arguments which would be passed to the `task`.
+--
+local function async_task(delay, task, ...)
+ assert(delay == nil or type(delay) == 'number')
+ fiber.create(sync_task, delay, task, ...)
+end
+
return {
tuple_extract_key = tuple_extract_key,
reloadable_fiber_f = reloadable_fiber_f,
generate_self_checker = generate_self_checker,
+ async_task = async_task,
internal = M,
}
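A usage note on `util.async_task`: the calling fiber is not blocked; the task
runs in a separate fiber after the optional delay. A minimal standalone
sketch:

    local util = require('vshard.util')
    local log = require('log')
    -- Log a message roughly half a second from now; the calling
    -- fiber continues immediately.
    util.async_task(0.5, log.info, 'deferred hello')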
* [tarantool-patches] Re: [PATCH 1/3] Add test on error during reconfigure
2018-07-19 20:33 ` Alex Khatskevich
@ 2018-07-20 11:34 ` Alex Khatskevich
2018-07-20 14:15 ` Vladislav Shpilevoy
0 siblings, 1 reply; 16+ messages in thread
From: Alex Khatskevich @ 2018-07-20 11:34 UTC (permalink / raw)
To: tarantool-patches
Here is a full diff:
commit fd68ec9454ec6b5552a37d15bb3adb33cbf1ea84
Author: AKhatskevich <avkhatskevich@tarantool.org>
Date: Sat Jun 9 19:50:52 2018 +0300
Add test on error during reconfigure
In case the reconfigure process fails, the node should continue
working properly.
diff --git a/test/lua_libs/util.lua b/test/lua_libs/util.lua
index f2d3b48..2d866df 100644
--- a/test/lua_libs/util.lua
+++ b/test/lua_libs/util.lua
@@ -69,9 +69,30 @@ local function wait_master(test_run, replicaset, master)
log.info('Slaves are connected to a master "%s"', master)
end
+-- Check that data has at least all etalon's fields and they are
+-- equal.
+-- @param etalon Table which fields should be found in `data`.
+-- @param data Table which is checked against `etalon`.
+-- @return Boolean indicator of equality.
+-- @return Table of names of fields which are different in `data`.
+local function has_same_fields(etalon, data)
+ assert(type(etalon) == 'table' and type(data) == 'table')
+ local diff = {}
+ for k, v in pairs(etalon) do
+ if v ~= data[k] then
+ table.insert(diff, k)
+ end
+ end
+ if #diff > 0 then
+ return false, diff
+ end
+ return true
+end
+
return {
check_error = check_error,
shuffle_masters = shuffle_masters,
collect_timeouts = collect_timeouts,
wait_master = wait_master,
+ has_same_fields = has_same_fields,
}
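For reference, how the new helper behaves (a sketch with made-up tables,
inside the test environment where test/lua_libs/util.lua is on the package
path):

    local util = require('util')
    -- All of etalon's fields are present and equal; extra fields
    -- in `data` are ignored:
    util.has_same_fields({a = 1}, {a = 1, b = 2})        -- true
    -- A differing field is reported by name:
    util.has_same_fields({a = 1, b = 2}, {a = 1, b = 3}) -- false, {'b'}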
diff --git a/test/router/router.result b/test/router/router.result
index 15f4fd0..4919962 100644
--- a/test/router/router.result
+++ b/test/router/router.result
@@ -1156,6 +1156,36 @@ util.check_error(vshard.router.cfg, non_dynamic_cfg)
---
- Non-dynamic option shard_index cannot be reconfigured
...
+-- Error during reconfigure process.
+vshard.router.route(1):callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+vshard.router.internal.errinj.ERRINJ_CFG = true
+---
+...
+old_internal = table.copy(vshard.router.internal)
+---
+...
+util.check_error(vshard.router.cfg, cfg)
+---
+- 'Error injection: cfg'
+...
+vshard.router.internal.errinj.ERRINJ_CFG = false
+---
+...
+util.has_same_fields(old_internal, vshard.router.internal)
+---
+- true
+...
+vshard.router.route(1):callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
_ = test_run:cmd("switch default")
---
...
diff --git a/test/router/router.test.lua b/test/router/router.test.lua
index 8006e5d..df2f381 100644
--- a/test/router/router.test.lua
+++ b/test/router/router.test.lua
@@ -444,6 +444,15 @@ non_dynamic_cfg = table.copy(cfg)
non_dynamic_cfg.shard_index = 'non_default_name'
util.check_error(vshard.router.cfg, non_dynamic_cfg)
+-- Error during reconfigure process.
+vshard.router.route(1):callro('echo', {'some_data'})
+vshard.router.internal.errinj.ERRINJ_CFG = true
+old_internal = table.copy(vshard.router.internal)
+util.check_error(vshard.router.cfg, cfg)
+vshard.router.internal.errinj.ERRINJ_CFG = false
+util.has_same_fields(old_internal, vshard.router.internal)
+vshard.router.route(1):callro('echo', {'some_data'})
+
_ = test_run:cmd("switch default")
test_run:drop_cluster(REPLICASET_2)
diff --git a/test/storage/storage.result b/test/storage/storage.result
index 4399ff0..ff07fe9 100644
--- a/test/storage/storage.result
+++ b/test/storage/storage.result
@@ -732,6 +732,45 @@ util.check_error(vshard.storage.cfg, non_dynamic_cfg, names.storage_1_a)
---
- Non-dynamic option bucket_count cannot be reconfigured
...
+-- Error during reconfigure process.
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
+vshard.storage.internal.errinj.ERRINJ_CFG = true
+---
+...
+old_internal = table.copy(vshard.storage.internal)
+---
+...
+_, err = pcall(vshard.storage.cfg, cfg, names.storage_1_a)
+---
+...
+err:match('Error injection:.*')
+---
+- 'Error injection: cfg'
+...
+vshard.storage.internal.errinj.ERRINJ_CFG = false
+---
+...
+util.has_same_fields(old_internal, vshard.storage.internal)
+---
+- true
+...
+_, rs = next(vshard.storage.internal.replicasets)
+---
+...
+rs:callro('echo', {'some_data'})
+---
+- some_data
+- null
+- null
+...
_ = test_run:cmd("switch default")
---
...
diff --git a/test/storage/storage.test.lua b/test/storage/storage.test.lua
index 72564e1..04bb608 100644
--- a/test/storage/storage.test.lua
+++ b/test/storage/storage.test.lua
@@ -182,6 +182,18 @@ non_dynamic_cfg = table.copy(cfg)
 non_dynamic_cfg.bucket_count = require('vshard.consts').DEFAULT_BUCKET_COUNT + 1
util.check_error(vshard.storage.cfg, non_dynamic_cfg, names.storage_1_a)
+-- Error during reconfigure process.
+_, rs = next(vshard.storage.internal.replicasets)
+rs:callro('echo', {'some_data'})
+vshard.storage.internal.errinj.ERRINJ_CFG = true
+old_internal = table.copy(vshard.storage.internal)
+_, err = pcall(vshard.storage.cfg, cfg, names.storage_1_a)
+err:match('Error injection:.*')
+vshard.storage.internal.errinj.ERRINJ_CFG = false
+util.has_same_fields(old_internal, vshard.storage.internal)
+_, rs = next(vshard.storage.internal.replicasets)
+rs:callro('echo', {'some_data'})
+
_ = test_run:cmd("switch default")
test_run:drop_cluster(REPLICASET_2)
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index 4531f3a..a143070 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -11,6 +11,7 @@ local M = rawget(_G, '__module_vshard_router')
if not M then
M = {
errinj = {
+ ERRINJ_CFG = false,
ERRINJ_FAILOVER_CHANGE_CFG = false,
ERRINJ_RELOAD = false,
ERRINJ_LONG_DISCOVERY = false,
@@ -486,6 +487,12 @@ local function router_cfg(cfg)
for k, v in pairs(cfg) do
log.info({[k] = v})
end
+ -- It is considered that all possible errors during cfg
+ -- process occur only before this place.
+ -- This check should be placed as late as possible.
+ if M.errinj.ERRINJ_CFG then
+ error('Error injection: cfg')
+ end
box.cfg(cfg)
log.info("Box has been configured")
M.total_bucket_count = total_bucket_count
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index ff204a4..052e94f 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -33,6 +33,7 @@ if not M then
-- Bucket count stored on all replicasets.
total_bucket_count = 0,
errinj = {
+ ERRINJ_CFG = false,
ERRINJ_BUCKET_FIND_GARBAGE_DELAY = false,
ERRINJ_RELOAD = false,
ERRINJ_CFG_DELAY = false,
@@ -1560,6 +1561,14 @@ local function storage_cfg(cfg, this_replica_uuid)
local shard_index = cfg.shard_index
     local collect_bucket_garbage_interval = cfg.collect_bucket_garbage_interval
local collect_lua_garbage = cfg.collect_lua_garbage
+
+ -- It is considered that all possible errors during cfg
+ -- process occur only before this place.
+ -- This check should be placed as late as possible.
+ if M.errinj.ERRINJ_CFG then
+ error('Error injection: cfg')
+ end
+
--
-- Sync timeout is a special case - it must be updated before
-- all other options to allow a user to demote a master with
* [tarantool-patches] Re: [PATCH 2/3] Complete module reload
2018-07-20 11:34 ` Alex Khatskevich
@ 2018-07-20 14:15 ` Vladislav Shpilevoy
0 siblings, 0 replies; 16+ messages in thread
From: Vladislav Shpilevoy @ 2018-07-20 14:15 UTC (permalink / raw)
To: tarantool-patches
Thanks for the fixes! See 2 comments below.
On 20/07/2018 14:34, Alex Khatskevich wrote:
> here is a full diff:
>
> commit cd3740a45322458cde10e67f8018bc4787f443aa
> Author: AKhatskevich <avkhatskevich@tarantool.org>
> Date: Sat Jun 9 17:23:40 2018 +0300
>
> Complete module reload
>
> In case one needs to upgrade vshard to a new version, this commit
> improves the reload mechanism to make that possible for a wider
> variety of changes (between two versions).
>
> Changes:
> * introduce cfg option `connection_outdate_delay`
> * improve reload mechanism
> * add `util.async_task` method, which runs a function after a
> delay
> * delete replicaset:rebind_connections method as it is replaced
> with `rebind_replicasets` which updates all replicasets at once
>
> Reload mechanism:
> * reload all vshard modules
> * create new `replicaset` and `replica` objects
> * reuse old netbox connections in new replica objects if
> possible
> * update router/storage.internal table
> * after a `connection_outdate_delay` disable old instances of
> `replicaset` and `replica` objects
>
> Reload works for modules:
> * vshard.router
> * vshard.storage
>
> Here is a module reload algorithm:
> * old vshard is working
> * delete old vshard src
> * install new vshard
> * call: package.loaded['vshard.router'] = nil
> * call: old_router = vshard.router -- Save working router copy.
> * call: vshard.router = require('vshard.router')
> * if require fails: continue using old_router
> * if require succeeds: use vshard.router
>
> In case reload process fails, old router/storage module, replicaset and
> replica objects continue working properly. If reload succeeds, all old
> objects would be deprecated.
>
> Extra changes:
> * introduce MODULE_INTERNALS which stores name of the module
> internal data in the global namespace
>
> Part of #112
>
> diff --git a/test/router/reload.result b/test/router/reload.result
> index 47f3c2e..71f82b2 100644
> --- a/test/router/reload.result
> +++ b/test/router/reload.result
> @@ -174,6 +174,132 @@ vshard.router.module_version()
> check_reloaded()
> ---
> ...
> +--
> +-- Outdate old replicaset and replica objets.
1. Typo: objets. Please, check the spelling in a text editor
before sending the patch.
> diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
> index 99f59aa..f6e971b 100644
> --- a/vshard/replicaset.lua
> +++ b/vshard/replicaset.lua
> @@ -21,6 +21,7 @@
> -- requests to the replica>,
> -- net_sequential_fail = <count of sequential failed
> -- requests to the replica>,
> +-- outdated = nil/true,
2. In scope of the point 11 of the previous review I asked you to
rename it to 'is_outdated', since it is a flag.
> -- }
> -- },
> -- master = <master server from the array above>,
* [tarantool-patches] Re: [PATCH 1/3] Add test on error during reconfigure
2018-07-20 11:34 ` Alex Khatskevich
@ 2018-07-20 14:15 ` Vladislav Shpilevoy
0 siblings, 0 replies; 16+ messages in thread
From: Vladislav Shpilevoy @ 2018-07-20 14:15 UTC (permalink / raw)
To: tarantool-patches, Alex Khatskevich
Hi! Thanks for the fixes! I have pushed my review output
in a separate commit on the branch. Please, look, run tests
and squash.
On 20/07/2018 14:34, Alex Khatskevich wrote:
> Here is a full diff:
>
> commit fd68ec9454ec6b5552a37d15bb3adb33cbf1ea84
> Author: AKhatskevich <avkhatskevich@tarantool.org>
> Date: Sat Jun 9 19:50:52 2018 +0300
>
> Add test on error during reconfigure
>
> In case the reconfigure process fails, the node should continue
> working properly.
>
* [tarantool-patches] Re: [PATCH 3/3] Introduce storage reload evolution
2018-07-20 11:32 ` Alex Khatskevich
@ 2018-07-20 14:15 ` Vladislav Shpilevoy
2018-07-20 23:58 ` Alex Khatskevich
0 siblings, 1 reply; 16+ messages in thread
From: Vladislav Shpilevoy @ 2018-07-20 14:15 UTC (permalink / raw)
To: Alex Khatskevich, tarantool-patches
Thanks for the fixes! See 4 comments below.
>>
>>> diff --git a/test/reload_evolution/storage_1_a.lua b/test/reload_evolution/storage_1_a.lua
>>> new file mode 100755
>>> index 0000000..3e03f8f
>>> --- /dev/null
>>> +++ b/test/reload_evolution/storage_1_a.lua
>>> @@ -0,0 +1,144 @@
>>> +#!/usr/bin/env tarantool
>>
>> 11. Just use a symbolic link to the existing storage_1_a. Same
>> for other storages.
>>
>>> diff --git a/test/reload_evolution/suite.ini b/test/reload_evolution/suite.ini
>>> new file mode 100644
>>> index 0000000..bb5435b
>>> --- /dev/null
>>> +++ b/test/reload_evolution/suite.ini
>>
>> 12. You do not need a new test suite for one storage test. Please,
>> put it into test/storage/
> 11-12:
> I have modified storage_1_a to change luapath.
> It is important to change path here, because the default Vshard initialization is performed here.
1. Then please explain why you need the following in storage_1_a.lua:
* two "require('fio')";
* '-- Check if we are running under test-run';
* 'names' table, 'replicasets' table;
* 'box.once("testapp:schema:1", function()';
* ...many other things.
If you do not need them, then please, cleanup your storage_1_a.lua
and extract wait_rebalancer_state into test utils since it is now
duplicated in storage/ and reload_evolution/ tests.
>>
>>> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
>>> index bf560e6..1740c98 100644
>>> --- a/vshard/storage/init.lua
>>> +++ b/vshard/storage/init.lua
>>> @@ -105,6 +110,11 @@ if not M then
>>> -- a destination replicaset must drop already received
>>> -- data.
>>> rebalancer_sending_bucket = 0,
>>> +
>>> + ------------------------- Reload -------------------------
>>> + -- Version of the loaded module. This number is used on
>>> + -- reload to determine which upgrade scripts to run.
>>> + reload_evolution_version = reload_evolution.version,
>>
>> 13. Use 'version' name.
> But it is not a version. Version is a git tag.
> We already have module_version and git tag. To make everything clear I would like to
> name the variable in that or similar way.
2. 'git tag' works only when you have git. But when you install vshard from a package,
the system can have no git. Moreover, it contradicts tarantool versioning, which
exposes the actual version via _schema and box.info.
So please, use real version for this member and name it 'version'. You can find
the code to extract, store and compare versions in box/lua/upgrade.lua.
>>
>>> }
>>> end
>>> diff --git a/vshard/storage/reload_evolution.lua b/vshard/storage/reload_evolution.lua
>>> new file mode 100644
>>> index 0000000..cfac888
>>> --- /dev/null
>>> +++ b/vshard/storage/reload_evolution.lua
>>> @@ -0,0 +1,58 @@
>>> +--
>>> +-- This module is used to upgrade the vshard.storage on the fly.
>>> +-- It updates internal Lua structures in case they are changed
>>> +-- in a commit.
>>> +--
>>> +local log = require('log')
>>> +
>>> +--
>>> +-- Array of upgrade functions.
>>> +-- magrations[version] = function which upgrades module version
>>
>> 14. Typo: magrations.
> fixed
>>
>>> +-- from `version` to `version + 1`.
>>
>> 15. Not +1. I think, we should use real version numbers:
>> 0.1.1, 0.1.2, etc similar to tarantool.
> Disagree. That would force us to
> 1. Tag a commit each time M is changed
3. It is not the same as git tag. We can add here a new version and tag it
later, like it is done in tarantool, when we use the new version everywhere
in the new code, but release it when ready.
For example, now vshard has version 1.2.3. We change M and set version in
evolution of M in the 1.2.4 constant. When 1.2.4 is ready, we do git tag
1.2.4.
> 2. Maintain identical git tags and reload_evolution_version names.
> In the introduced approach, all you need to do is append a callback to
> the `migrations` array.
4. You are trying to make your life easier, but do not think about
users. It is not useful when you are not able to get the actual version,
because you can not use git, or you see the strange versions like 1, 2, 3, 4 ...
Nobody uses natural numbers as versions. It is weird.
I think, we should consult Kostja N. for this though.
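For reference, a minimal sketch of such version handling (illustrative only;
the names here are made up, and box/lua/upgrade.lua holds the canonical
implementation):

    -- Parse an 'x.y.z' string into three numbers.
    local function parse_version(s)
        local major, minor, patch = s:match('^(%d+)%.(%d+)%.(%d+)$')
        assert(major, 'malformed version string')
        return tonumber(major), tonumber(minor), tonumber(patch)
    end

    -- Compare two version strings; returns -1, 0 or 1.
    local function version_cmp(a, b)
        local av = {parse_version(a)}
        local bv = {parse_version(b)}
        for i = 1, 3 do
            if av[i] ~= bv[i] then
                return av[i] < bv[i] and -1 or 1
            end
        end
        return 0
    end

    -- version_cmp('1.2.4', '1.2.10') == -1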
* [tarantool-patches] Re: [PATCH 3/3] Introduce storage reload evolution
2018-07-20 14:15 ` Vladislav Shpilevoy
@ 2018-07-20 23:58 ` Alex Khatskevich
0 siblings, 0 replies; 16+ messages in thread
From: Alex Khatskevich @ 2018-07-20 23:58 UTC (permalink / raw)
To: Vladislav Shpilevoy, tarantool-patches
>>> 11. Just use a symbolic link to the existing storage_1_a. Same
>>> for other storages.
>>>
>>> 12. You do not need a new test suite for one storage test. Please,
>>> put it into test/storage/
>> 11-12:
>> I have modified storage_1_a to change luapath.
>> It is important to change path here, because the default Vshard
>> initialization is performed here.
>
> 1. Then please explain why you need the following in storage_1_a.lua:
> * two "require('fio')";
> * '-- Check if we are running under test-run';
> * 'names' table, 'replicasets' table;
> * 'box.once("testapp:schema:1", function()';
> * ...many other things.
>
> If you do not need them, then please, cleanup your storage_1_a.lua
> and extract wait_rebalancer_state into test utils since it is now
> duplicated in storage/ and reload_evolution/ tests.
>
I have reused some code. Extra commit was added.
>>>
>>>> diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
>>>> index bf560e6..1740c98 100644
>>>> --- a/vshard/storage/init.lua
>>>> +++ b/vshard/storage/init.lua
>>>> @@ -105,6 +110,11 @@ if not M then
>>>> -- a destination replicaset must drop already received
>>>> -- data.
>>>> rebalancer_sending_bucket = 0,
>>>> +
>>>> + ------------------------- Reload -------------------------
>>>> + -- Version of the loaded module. This number is used on
>>>> + -- reload to determine which upgrade scripts to run.
>>>> + reload_evolution_version = reload_evolution.version,
>>>
>>> 13. Use 'version' name.
>> But it is not a version. Version is a git tag.
>> We already have module_version and git tag. To make everything clear
>> I would like to
>> name the variable in that or similar way.
>
> 2. 'git tag' works only when you have git. But when you install vshard
> from a package,
> the system can have no git. Moreover, it contradicts tarantool
> versioning, which
> exposes the actual version via _schema and box.info.
>
> So please, use real version for this member and name it 'version'. You
> can find
> the code to extract, store and compare versions in box/lua/upgrade.lua.
>
> 3. It is not the same as git tag. We can add here a new version and
> tag it
> later, like it is done in tarantool, when we use the new version
> everywhere
> in the new code, but release it when ready.
>
> For example, now vshard has version 1.2.3. We change M and set version in
> evolution of M in the 1.2.4 constant. When 1.2.4 is ready, we do git tag
> 1.2.4.
>
>
> 4. You are trying to make your life easier, but do not think about
> users. It is not useful when you are not able to get the actual version,
> because you can not use git, or you see the strange versions like 1,
> 2, 3, 4 ...
> Nobody uses natural numbers as versions. It is weird.
>
> I think, we should consult Kostja N. for this though.
After the discussion and implementation I will create another thread for
this patch.
end of thread, other threads:[~2018-07-20 23:58 UTC | newest]
Thread overview: 16+ messages
2018-07-18 17:47 [tarantool-patches] [PATCH 0/3] vshard reload mechanism AKhatskevich
2018-07-18 17:47 ` [tarantool-patches] [PATCH 1/3] Add test on error during reconfigure AKhatskevich
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
2018-07-19 20:33 ` Alex Khatskevich
2018-07-20 11:34 ` Alex Khatskevich
2018-07-20 14:15 ` Vladislav Shpilevoy
2018-07-18 17:47 ` [tarantool-patches] [PATCH 2/3] Complete module reload AKhatskevich
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
2018-07-19 20:32 ` Alex Khatskevich
2018-07-20 11:34 ` Alex Khatskevich
2018-07-20 14:15 ` Vladislav Shpilevoy
2018-07-18 17:47 ` [tarantool-patches] [PATCH 3/3] Introduce storage reload evolution AKhatskevich
2018-07-19 15:14 ` [tarantool-patches] " Vladislav Shpilevoy
2018-07-20 11:32 ` Alex Khatskevich
2018-07-20 14:15 ` Vladislav Shpilevoy
2018-07-20 23:58 ` Alex Khatskevich