From: Vladislav Shpilevoy
Subject: [tarantool-patches] [PATCH vshard 1/1] storage: do read_only + sync on master demotion before box.cfg
Date: Fri, 23 Mar 2018 01:11:45 +0300
To: tarantool-patches@freelists.org
Cc: georgy@tarantool.org, Vladislav Shpilevoy

Before a master is demoted and its box.cfg is updated, it must turn
read_only mode on and try to sync with slaves. Without sync and
read_only before box.cfg, new data can appear during the box
reconfiguration and never get synced.

Closes #84
---
Branch: https://github.com/tarantool/vshard/tree/gh-84-sync-before-cfg
Issue: https://github.com/tarantool/vshard/issues/84

 test/misc/reconfigure.result | 14 ++++
 test/misc/reconfigure.test.lua | 4 ++
 test/storage/demote_sync_errinj.result | 110 +++++++++++++++++++++++++++++++
 test/storage/demote_sync_errinj.test.lua | 39 +++++++++++
 test/storage/read_only_slave.result | 27 +++++---
 test/storage/read_only_slave.test.lua | 16 +++--
 test/unit/config.result | 23 +++++++
 test/unit/config.test.lua | 11 ++++
 vshard/cfg.lua | 7 +-
 vshard/consts.lua | 3 +-
 vshard/storage/init.lua | 76 +++++++++++++++++++--
 11 files changed, 303 insertions(+), 27 deletions(-)
 create mode 100644 test/storage/demote_sync_errinj.result
 create mode 100644 test/storage/demote_sync_errinj.test.lua

diff --git a/test/misc/reconfigure.result b/test/misc/reconfigure.result index 0a014a3..c7960b3 100644 --- a/test/misc/reconfigure.result +++ b/test/misc/reconfigure.result @@ -100,6 +100,13 @@ not vshard.storage.internal.collect_lua_garbage --- - true ... +vshard.storage.internal.sync_timeout +--- +- 1 +... +cfg.sync_timeout = 100 +--- +... cfg.collect_lua_garbage = true --- ... @@ -120,6 +127,10 @@ not vshard.storage.internal.collect_lua_garbage --- - true ... +vshard.storage.internal.sync_timeout +--- +- 1 +... vshard.storage.internal.rebalancer_max_receiving ~= 1000 --- - true ... @@ -128,6 +139,9 @@ vshard.storage.internal.collect_bucket_garbage_interval ~= 100 --- - true ... +cfg.sync_timeout = nil +--- +... cfg.collect_lua_garbage = nil --- ...
diff --git a/test/misc/reconfigure.test.lua b/test/misc/reconfigure.test.lua index 033214a..25dc2ca 100644 --- a/test/misc/reconfigure.test.lua +++ b/test/misc/reconfigure.test.lua @@ -41,14 +41,18 @@ util.check_error(vshard.storage.cfg, cfg, 'unknow uuid') -- changed.
-- not vshard.storage.internal.collect_lua_garbage +vshard.storage.internal.sync_timeout +cfg.sync_timeout = 100 cfg.collect_lua_garbage = true cfg.rebalancer_max_receiving = 1000 cfg.collect_bucket_garbage_interval = 100 cfg.invalid_option = 'kek' vshard.storage.cfg(cfg, names.storage_1_a) not vshard.storage.internal.collect_lua_garbage +vshard.storage.internal.sync_timeout vshard.storage.internal.rebalancer_max_receiving ~= 1000 vshard.storage.internal.collect_bucket_garbage_interval ~= 100 +cfg.sync_timeout = nil cfg.collect_lua_garbage = nil cfg.rebalancer_max_receiving = nil cfg.collect_bucket_garbage_interval = nil diff --git a/test/storage/demote_sync_errinj.result b/test/storage/demote_sync_errinj.result new file mode 100644 index 0000000..2d14504 --- /dev/null +++ b/test/storage/demote_sync_errinj.result @@ -0,0 +1,110 @@ +test_run = require('test_run').new() +--- +... +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } +--- +... +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } +--- +... +test_run:create_cluster(REPLICASET_1, 'storage') +--- +... +test_run:create_cluster(REPLICASET_2, 'storage') +--- +... +util = require('util') +--- +... +util.wait_master(test_run, REPLICASET_1, 'storage_1_a') +--- +... +util.wait_master(test_run, REPLICASET_2, 'storage_2_a') +--- +... +test_run:switch('storage_1_a') +--- +- true +... +fiber = require('fiber') +--- +... +s = box.schema.create_space('test') +--- +... +pk = s:create_index('pk') +--- +... +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = true +--- +... +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true +--- +... +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false +--- +... +f = fiber.create(function() vshard.storage.cfg(cfg, names.storage_1_a) end) +--- +... +f:status() +--- +- suspended +... +-- Can not write - read only mode is already on. +s:replace{1} +--- +- error: Can't modify data because this instance is in read-only mode. +... +test_run:switch('storage_1_b') +--- +- true +... +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true +--- +... +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false +--- +... +vshard.storage.cfg(cfg, names.storage_1_b) +--- +... +box.space.test:select{} +--- +- [] +... +test_run:switch('storage_1_a') +--- +- true +... +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = false +--- +... +while f:status() ~= 'dead' do fiber.sleep(0.1) end +--- +... +s:select{} +--- +- [] +... +test_run:switch('storage_1_b') +--- +- true +... +box.space.test:select{} +--- +- [] +... +box.space.test:drop() +--- +... +test_run:cmd("switch default") +--- +- true +... +test_run:drop_cluster(REPLICASET_2) +--- +... +test_run:drop_cluster(REPLICASET_1) +--- +... 
diff --git a/test/storage/demote_sync_errinj.test.lua b/test/storage/demote_sync_errinj.test.lua new file mode 100644 index 0000000..f12ab60 --- /dev/null +++ b/test/storage/demote_sync_errinj.test.lua @@ -0,0 +1,39 @@ +test_run = require('test_run').new() +REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } +REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } +test_run:create_cluster(REPLICASET_1, 'storage') +test_run:create_cluster(REPLICASET_2, 'storage') +util = require('util') +util.wait_master(test_run, REPLICASET_1, 'storage_1_a') +util.wait_master(test_run, REPLICASET_2, 'storage_2_a') + +test_run:switch('storage_1_a') +fiber = require('fiber') +s = box.schema.create_space('test') +pk = s:create_index('pk') +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = true +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false +f = fiber.create(function() vshard.storage.cfg(cfg, names.storage_1_a) end) +f:status() +-- Can not write - read only mode is already on. +s:replace{1} + +test_run:switch('storage_1_b') +cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true +cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false +vshard.storage.cfg(cfg, names.storage_1_b) +box.space.test:select{} + +test_run:switch('storage_1_a') +vshard.storage.internal.errinj.ERRINJ_CFG_DELAY = false +while f:status() ~= 'dead' do fiber.sleep(0.1) end +s:select{} + +test_run:switch('storage_1_b') +box.space.test:select{} +box.space.test:drop() + +test_run:cmd("switch default") +test_run:drop_cluster(REPLICASET_2) +test_run:drop_cluster(REPLICASET_1) diff --git a/test/storage/read_only_slave.result b/test/storage/read_only_slave.result index 9d5c85e..07d3653 100644 --- a/test/storage/read_only_slave.result +++ b/test/storage/read_only_slave.result @@ -36,10 +36,17 @@ s = box.schema.create_space('test') pk = s:create_index('pk') --- ... +ok = nil +--- +... +err = nil +--- +... function on_master_enable() s:replace{1} end --- ... -function on_master_disable() s:replace{2} end +-- Test, that in disable trigger already can not write. +function on_master_disable() ok, err = pcall(s.replace, s, {2}) end --- ... vshard.storage.on_master_enable(on_master_enable) @@ -85,8 +92,8 @@ vshard.storage.on_master_enable(on_master_enable) vshard.storage.on_master_disable(on_master_disable) --- ... --- Yes, there is no 3 or 4, because trigger is set after a replica --- becames slave. +-- Yes, there is no 3 or 4, because a trigger on disable always +-- works in readonly. s:select{} --- - - [1] @@ -128,28 +135,28 @@ box.cfg.read_only --- - true ... +ok, err +--- +- false +- Can't modify data because this instance is in read-only mode. +... fiber = require('fiber') --- ... -while s:count() ~= 3 do fiber.sleep(0.1) end +while s:count() ~= 2 do fiber.sleep(0.1) end --- ... s:select{} --- - - [1] - - [2] - [3] ... test_run:switch('storage_1_b') --- - true ... --- Yes, there is no {2}, because replication source is unset --- already. -s:select{} +s:drop() --- -- - [1] - - [3] ... 
test_run:cmd("switch default") --- diff --git a/test/storage/read_only_slave.test.lua b/test/storage/read_only_slave.test.lua index 3253ad7..d49e6f2 100644 --- a/test/storage/read_only_slave.test.lua +++ b/test/storage/read_only_slave.test.lua @@ -11,8 +11,11 @@ test_run:switch('storage_1_a') box.cfg.read_only s = box.schema.create_space('test') pk = s:create_index('pk') +ok = nil +err = nil function on_master_enable() s:replace{1} end -function on_master_disable() s:replace{2} end +-- Test, that in disable trigger already can not write. +function on_master_disable() ok, err = pcall(s.replace, s, {2}) end vshard.storage.on_master_enable(on_master_enable) vshard.storage.on_master_disable(on_master_disable) s:select{} @@ -27,8 +30,8 @@ function on_master_enable() s:replace{3} end function on_master_disable() if not box.cfg.read_only then s:replace{4} end end vshard.storage.on_master_enable(on_master_enable) vshard.storage.on_master_disable(on_master_disable) --- Yes, there is no 3 or 4, because trigger is set after a replica --- becames slave. +-- Yes, there is no 3 or 4, because a trigger on disable always +-- works in readonly. s:select{} -- Check that after master change the read_only is updated, and @@ -44,14 +47,13 @@ cfg.sharding[replicasets[1]].replicas[names.storage_1_b].master = true cfg.sharding[replicasets[1]].replicas[names.storage_1_a].master = false vshard.storage.cfg(cfg, names.storage_1_a) box.cfg.read_only +ok, err fiber = require('fiber') -while s:count() ~= 3 do fiber.sleep(0.1) end +while s:count() ~= 2 do fiber.sleep(0.1) end s:select{} test_run:switch('storage_1_b') --- Yes, there is no {2}, because replication source is unset --- already. -s:select{} +s:drop() test_run:cmd("switch default") test_run:drop_cluster(REPLICASET_2) diff --git a/test/unit/config.result b/test/unit/config.result index 2a5473c..8b098fb 100644 --- a/test/unit/config.result +++ b/test/unit/config.result @@ -419,3 +419,26 @@ cfg.collect_lua_garbage = false _ = lcfg.check(cfg) --- ... +-- +-- gh-84: sync before master demotion, and allow to configure +-- sync timeout. +-- +cfg.sync_timeout = -100 +--- +... +check(cfg) +--- +- Sync timeout must be non-negative number +... +cfg.sync_timeout = 0 +--- +... +_ = lcfg.check(cfg) +--- +... +cfg.sync_timeout = 10.5 +--- +... +_ = lcfg.check(cfg) +--- +... diff --git a/test/unit/config.test.lua b/test/unit/config.test.lua index 140c38c..0990d0e 100644 --- a/test/unit/config.test.lua +++ b/test/unit/config.test.lua @@ -166,3 +166,14 @@ cfg.collect_lua_garbage = true _ = lcfg.check(cfg) cfg.collect_lua_garbage = false _ = lcfg.check(cfg) + +-- +-- gh-84: sync before master demotion, and allow to configure +-- sync timeout. 
+-- +cfg.sync_timeout = -100 +check(cfg) +cfg.sync_timeout = 0 +_ = lcfg.check(cfg) +cfg.sync_timeout = 10.5 +_ = lcfg.check(cfg)
diff --git a/vshard/cfg.lua b/vshard/cfg.lua index c578af2..9860676 100644 --- a/vshard/cfg.lua +++ b/vshard/cfg.lua @@ -198,7 +198,11 @@ local cfg_template = { {'collect_lua_garbage', { type = 'boolean', name = 'Garbage Lua collect necessity', is_optional = true, default = false - }} + }}, + {'sync_timeout', { + type = 'non-negative number', name = 'Sync timeout', is_optional = true, + default = consts.DEFAULT_SYNC_TIMEOUT + }}, } -- @@ -227,6 +231,7 @@ local function remove_non_box_options(cfg) cfg.shard_index = nil cfg.collect_bucket_garbage_interval = nil cfg.collect_lua_garbage = nil + cfg.sync_timeout = nil end return {
diff --git a/vshard/consts.lua b/vshard/consts.lua index 47651d7..1a0e4ab 100644 --- a/vshard/consts.lua +++ b/vshard/consts.lua @@ -20,7 +20,6 @@ return { REPLICATION_THRESHOLD_FAIL = 10, DEFAULT_BUCKET_COUNT = 3000; - BUCKET_SYNC_TIMEOUT = 0.1; BUCKET_SENT_GARBAGE_DELAY = 0.5; DEFAULT_REBALANCER_DISBALANCE_THRESHOLD = 1; REBALANCER_IDLE_INTERVAL = 60 * 60; @@ -30,7 +29,7 @@ return { CALL_TIMEOUT_MAX = 64; FAILOVER_UP_TIMEOUT = 5; FAILOVER_DOWN_TIMEOUT = 1; - SYNC_TIMEOUT = 1; + DEFAULT_SYNC_TIMEOUT = 1; RECONNECT_TIMEOUT = 0.5; DEFAULT_COLLECT_BUCKET_GARBAGE_INTERVAL = 0.5; RECOVERY_INTERVAL = 5;
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index a0f216e..6d66b05 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -35,10 +35,12 @@ if not M then errinj = { ERRINJ_BUCKET_FIND_GARBAGE_DELAY = false, ERRINJ_RELOAD = false, + ERRINJ_CFG_DELAY = false, }, -- This counter is used to restart background fibers with -- new reloaded code. module_version = 0, + sync_timeout = consts.DEFAULT_SYNC_TIMEOUT, ------------------- Garbage collection ------------------- -- Fiber to remove garbage buckets data. @@ -305,7 +307,7 @@ local function sync(timeout) end log.debug("Synchronizing replicaset...") - timeout = timeout or consts.SYNC_TIMEOUT + timeout = timeout or M.sync_timeout local vclock = box.info.vclock local tstart = lfiber.time() repeat @@ -539,14 +541,31 @@ local function buckets_discovery() return ret end +-- +-- The only thing that must be done to abort a master demotion +-- is to reset read_only. +-- +local function local_on_master_disable_abort() + box.cfg{read_only = false} +end + +-- +-- Prepare for a master demotion. Before it, the master must +-- stop accepting writes and try to wait until all of its data +-- is replicated to each slave. +-- +local function local_on_master_disable_prepare() + log.info("Resigning from the replicaset master role...") + box.cfg({read_only = true}) + sync(M.sync_timeout) +end + -- -- This function executes when a master role is removed from local -- instance during configuration -- local function local_on_master_disable() M._on_master_disable:run() - box.cfg({read_only = true}) - log.verbose("Resigning from the replicaset master role...") -- Stop garbage collecting if M.collect_bucket_garbage_fiber ~= nil then M.collect_bucket_garbage_fiber:cancel() @@ -558,13 +577,29 @@ local function local_on_master_disable() M.recovery_fiber = nil log.info('Recovery stopped') end - -- Wait until replicas are synchronized before one another become a new master - sync(consts.SYNC_TIMEOUT) log.info("Resigned from the replicaset master role") end local collect_garbage_f +-- +-- The only thing that must be done to abort a master promotion +-- is to set read_only back to true.
+-- +local function local_on_master_enable_abort() + box.cfg({read_only = true}) +end + +-- +-- Promotion does not require a sync, because a replica can not +-- have data that is absent on the current master - the replica +-- is read only. But read_only can not be set to false here, +-- because until box.cfg is called, it can not be guaranteed +-- that the promotion will be successful. +-- +local function local_on_master_enable_prepare() + log.info("Taking on replicaset master role...") +end -- -- This function executes whan a master role is added to local -- instance during configuration @@ -572,7 +607,6 @@ local collect_garbage_f local function local_on_master_enable() box.cfg({read_only = false}) M._on_master_enable:run() - log.verbose("Taking on replicaset master role...") recovery_garbage_receiving_buckets() -- Start background process to collect garbage. M.collect_bucket_garbage_fiber = @@ -1348,9 +1382,37 @@ local function storage_cfg(cfg, this_replica_uuid) local shard_index = cfg.shard_index local collect_bucket_garbage_interval = cfg.collect_bucket_garbage_interval local collect_lua_garbage = cfg.collect_lua_garbage + -- + -- Sync timeout is a special case - it must be updated before + -- all other options to allow a user to demote a master with + -- a new sync timeout. + -- + local old_sync_timeout = M.sync_timeout + M.sync_timeout = cfg.sync_timeout lcfg.remove_non_box_options(cfg) - box.cfg(cfg) + if was_master and not is_master then + local_on_master_disable_prepare() + end + if not was_master and is_master then + local_on_master_enable_prepare() + end + + local ok, err = pcall(box.cfg, cfg) + while M.errinj.ERRINJ_CFG_DELAY do + lfiber.sleep(0.01) + end + if not ok then + M.sync_timeout = old_sync_timeout + if was_master and not is_master then + local_on_master_disable_abort() + end + if not was_master and is_master then + local_on_master_enable_abort() + end + error(err) + end + log.info("Box has been configured") local uri = luri.parse(this_replica.uri) box.once("vshard:storage:1", storage_schema_v1, uri.login, uri.password) -- 2.14.3 (Apple Git-98)
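
Reviewer note (not part of the patch): the reconfiguration ordering introduced above can be condensed to the sketch below. It only illustrates the flow of storage_cfg() together with the local_on_master_disable_prepare()/local_on_master_enable_prepare() helpers and their *_abort() counterparts from the diff; the function name apply_box_cfg is hypothetical, and error classes, fiber management and the ERRINJ_CFG_DELAY loop are omitted. M, sync() and log are the module-local names from vshard/storage/init.lua.

    -- Simplified sketch of the new ordering; assumes the
    -- vshard.storage.init context (M, sync(), log), not literal patch code.
    local function apply_box_cfg(box_cfg, was_master, is_master)
        if was_master and not is_master then
            -- Demotion: refuse new writes and push local data to the
            -- replicas BEFORE box.cfg{} rewires replication.
            box.cfg({read_only = true})
            sync(M.sync_timeout)
        elseif not was_master and is_master then
            -- Promotion: nothing to sync, the instance was read-only.
            log.info("Taking on replicaset master role...")
        end
        local ok, err = pcall(box.cfg, box_cfg)
        if not ok then
            -- box.cfg{} failed: roll the prepared state back.
            if was_master and not is_master then
                box.cfg({read_only = false})
            elseif not was_master and is_master then
                box.cfg({read_only = true})
            end
            error(err)
        end
    end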
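
Usage note (illustration, not from the patch): the new sync_timeout option bounds how long a demoted master waits in sync() before box.cfg is applied. The cfg table and the UUID variables below are placeholders; vshard.storage.cfg(cfg, instance_uuid) is the same call used throughout the tests above.

    -- Hypothetical demotion of the current master in favour of another
    -- replica; cfg, replicaset_uuid and *_master_uuid are placeholders.
    local vshard = require('vshard')

    cfg.sync_timeout = 30 -- wait up to 30 seconds for replicas to catch up
    cfg.sharding[replicaset_uuid].replicas[old_master_uuid].master = false
    cfg.sharding[replicaset_uuid].replicas[new_master_uuid].master = true

    -- On the old master: it goes read_only and syncs before box.cfg is applied.
    vshard.storage.cfg(cfg, old_master_uuid)
    -- On the new master: it leaves read_only mode and takes the master role.
    vshard.storage.cfg(cfg, new_master_uuid)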