Tarantool development patches archive
 help / color / mirror / Atom feed
From: sergeyb@tarantool.org
To: tarantool-patches@dev.tarantool.org, v.shpilevoy@tarantool.org,
	sergepetrenko@tarantool.org, gorcunov@gmail.com,
	lvasiliev@tarantool.org
Subject: [Tarantool-patches] [PATCH 2/4] replication: add advanced tests for sync replication
Date: Fri,  3 Jul 2020 00:13:35 +0300	[thread overview]
Message-ID: <012c8c196396cf963a0aa1f2d23814ff84b81cfb.1593723973.git.sergeyb@tarantool.org> (raw)
In-Reply-To: <cover.1593472477.git.v.shpilevoy@tarantool.org>

From: Sergey Bronnikov <sergeyb@tarantool.org>

Part of #5055
---
 test/replication/qsync_advanced.result   | 939 +++++++++++++++++++++++
 test/replication/qsync_advanced.test.lua | 337 ++++++++
 2 files changed, 1276 insertions(+)
 create mode 100644 test/replication/qsync_advanced.result
 create mode 100644 test/replication/qsync_advanced.test.lua

diff --git a/test/replication/qsync_advanced.result b/test/replication/qsync_advanced.result
new file mode 100644
index 000000000..fa94c8339
--- /dev/null
+++ b/test/replication/qsync_advanced.result
@@ -0,0 +1,939 @@
+-- test-run result file version 2
+env = require('test_run')
+ | ---
+ | ...
+test_run = env.new()
+ | ---
+ | ...
+engine = test_run:get_cfg('engine')
+ | ---
+ | ...
+fiber = require('fiber')
+ | ---
+ | ...
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+ | ---
+ | ...
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+ | ---
+ | ...
+
+NUM_INSTANCES = 2
+ | ---
+ | ...
+BROKEN_QUORUM = NUM_INSTANCES + 1
+ | ---
+ | ...
+
+test_run:cmd("setopt delimiter ';'")
+ | ---
+ | - true
+ | ...
+disable_sync_mode = function()
+    local s = box.space._space:get(box.space.sync.id)
+    local new_s = s:update({{'=', 6, {is_sync=false}}})
+    box.space._space:replace(new_s)
+end;
+ | ---
+ | ...
+test_run:cmd("setopt delimiter ''");
+ | ---
+ | - true
+ | ...
+
+box.schema.user.grant('guest', 'replication')
+ | ---
+ | ...
+
+-- Setup an async cluster with two instances.
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=True, wait_load=True')
+ | ---
+ | - true
+ | ...
+
+-- Successful write.
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.space.sync:insert{1} -- success
+ | ---
+ | - [1]
+ | ...
+test_run:cmd('switch replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- Unsuccessfull write.
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.space.sync:insert{1}
+ | ---
+ | - error: Quorum collection for a synchronous transaction is timed out
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- none
+ | ---
+ | - []
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- Updated replication_synchro_quorum doesn't affect existed tx.
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+OP_TIMEOUT = 5
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=OP_TIMEOUT}
+ | ---
+ | ...
+test_run:cmd("setopt delimiter ';'")
+ | ---
+ | - true
+ | ...
+_ = fiber.create(function()
+    box.space.sync:insert{1}
+end);
+ | ---
+ | ...
+test_run:cmd("setopt delimiter ''");
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES}
+ | ---
+ | ...
+fiber.sleep(OP_TIMEOUT) -- to make sure replication_synchro_timeout is exceeded
+ | ---
+ | ...
+box.space.sync:select{} -- none
+ | ---
+ | - []
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- none
+ | ---
+ | - []
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- [RFC, quorum commit] attempt to write multiple transactions, expected the
+-- same order as on client in case of achieved quorum.
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.space.sync:insert{1}
+ | ---
+ | - [1]
+ | ...
+box.space.sync:insert{2}
+ | ---
+ | - [2]
+ | ...
+box.space.sync:insert{3}
+ | ---
+ | - [3]
+ | ...
+box.space.sync:select{} -- 1, 2, 3
+ | ---
+ | - - [1]
+ |   - [2]
+ |   - [3]
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1, 2, 3
+ | ---
+ | - - [1]
+ |   - [2]
+ |   - [3]
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- Synchro timeout is not bigger than replication_synchro_timeout value.
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=orig_synchro_timeout}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+start = os.time()
+ | ---
+ | ...
+box.space.sync:insert{1}
+ | ---
+ | - error: Quorum collection for a synchronous transaction is timed out
+ | ...
+(os.time() - start) == box.cfg.replication_synchro_timeout -- true
+ | ---
+ | - true
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- replication_synchro_quorum
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+INT_MIN = -2147483648
+ | ---
+ | ...
+INT_MAX = 2147483648
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum=INT_MAX} -- error
+ | ---
+ | - error: 'Incorrect value for option ''replication_synchro_quorum'': the value must
+ |     be greater than zero and less than maximal number of replicas'
+ | ...
+box.cfg.replication_synchro_quorum -- old value
+ | ---
+ | - 3
+ | ...
+box.cfg{replication_synchro_quorum=INT_MIN} -- error
+ | ---
+ | - error: 'Incorrect value for option ''replication_synchro_quorum'': the value must
+ |     be greater than zero and less than maximal number of replicas'
+ | ...
+box.cfg.replication_synchro_quorum -- old value
+ | ---
+ | - 3
+ | ...
+
+-- replication_synchro_timeout
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+DOUBLE_MAX = 9007199254740992
+ | ---
+ | ...
+box.cfg{replication_synchro_timeout=DOUBLE_MAX}
+ | ---
+ | ...
+box.cfg.replication_synchro_timeout -- DOUBLE_MAX
+ | ---
+ | - 9007199254740992
+ | ...
+box.cfg{replication_synchro_timeout=DOUBLE_MAX+1}
+ | ---
+ | ...
+box.cfg.replication_synchro_timeout -- DOUBLE_MAX
+ | ---
+ | - 9007199254740992
+ | ...
+box.cfg{replication_synchro_timeout=-1} -- error
+ | ---
+ | - error: 'Incorrect value for option ''replication_synchro_timeout'': the value must
+ |     be greater than zero'
+ | ...
+box.cfg.replication_synchro_timeout -- old value
+ | ---
+ | - 9007199254740992
+ | ...
+box.cfg{replication_synchro_timeout=0} -- error
+ | ---
+ | - error: 'Incorrect value for option ''replication_synchro_timeout'': the value must
+ |     be greater than zero'
+ | ...
+box.cfg.replication_synchro_timeout -- old value
+ | ---
+ | - 9007199254740992
+ | ...
+
+-- TX is in synchronous replication.
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.begin() box.space.sync:insert({1}) box.commit()
+ | ---
+ | ...
+box.begin() box.space.sync:insert({2}) box.commit()
+ | ---
+ | ...
+-- Testcase cleanup.
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- [RFC, summary] switch sync replicas into async ones, expected success and
+-- data consistency on a leader and replicas.
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.space.sync:insert{1}
+ | ---
+ | - [1]
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+-- Disable synchronous mode.
+disable_sync_mode()
+ | ---
+ | ...
+-- Space is in async mode now.
+box.cfg{replication_synchro_quorum=NUM_INSTANCES}
+ | ---
+ | ...
+box.space.sync:insert{2} -- success
+ | ---
+ | - [2]
+ | ...
+box.space.sync:insert{3} -- success
+ | ---
+ | - [3]
+ | ...
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM}
+ | ---
+ | ...
+box.space.sync:insert{4} -- success
+ | ---
+ | - [4]
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES}
+ | ---
+ | ...
+box.space.sync:insert{5} -- success
+ | ---
+ | - [5]
+ | ...
+box.space.sync:select{} -- 1, 2, 3, 4, 5
+ | ---
+ | - - [1]
+ |   - [2]
+ |   - [3]
+ |   - [4]
+ |   - [5]
+ | ...
+test_run:cmd('switch replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1, 2, 3, 4, 5
+ | ---
+ | - - [1]
+ |   - [2]
+ |   - [3]
+ |   - [4]
+ |   - [5]
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- (FLAKY) [RFC, Synchronous replication enabling] "As soon as last operation of
+-- synchronous transaction appeared in leader's WAL, it will cause all
+-- following transactions - no matter if they are synchronous or not - wait for
+-- the quorum. In case quorum is not achieved the 'rollback' operation will
+-- cause rollback of all transactions after the synchronous one."
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.space.sync:insert{1}
+ | ---
+ | - [1]
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+-- OP_TIMEOUT should be enough to make sync operation, disable
+-- sync mode and make an async operation
+OP_TIMEOUT = 10
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=OP_TIMEOUT}
+ | ---
+ | ...
+test_run:cmd("setopt delimiter ';'")
+ | ---
+ | - true
+ | ...
+_ = fiber.create(function()
+    box.space.sync:insert{2}
+end);
+ | ---
+ | ...
+test_run:cmd("setopt delimiter ''");
+ | ---
+ | - true
+ | ...
+-- Disable synchronous mode.
+disable_sync_mode()
+ | ---
+ | - error: A rollback for a synchronous transaction is received
+ | ...
+-- Space is in async mode now.
+box.space.sync:insert{3} -- async operation must wait sync one
+ | ---
+ | - error: Quorum collection for a synchronous transaction is timed out
+ | ...
+fiber.sleep(OP_TIMEOUT + 1)
+ | ---
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:cmd('switch replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- Warn user when setting `replication_synchro_quorum` to a value
+-- greater than number of instances in a cluster, see gh-5122.
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM} -- warning
+ | ---
+ | ...
+
+-- [RFC, summary] switch from leader to replica and vice versa, expected
+-- success and data consistency on a leader and replicas (gh-5124).
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.space.sync:insert{1}
+ | ---
+ | - [1]
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+box.cfg{read_only=false} -- promote replica to master
+ | ---
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{read_only=true} -- demote master to replica
+ | ---
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{2}
+ | ---
+ | - [2]
+ | ...
+box.space.sync:select{} -- 1, 2
+ | ---
+ | - - [1]
+ |   - [2]
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1, 2
+ | ---
+ | - - [1]
+ | ...
+-- Revert cluster configuration.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{read_only=false}
+ | ---
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.cfg{read_only=true}
+ | ---
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- Check behaviour with failed write to WAL on master (ERRINJ_WAL_IO).
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.space.sync:insert{1}
+ | ---
+ | - [1]
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+box.error.injection.set('ERRINJ_WAL_IO', true)
+ | ---
+ | - ok
+ | ...
+box.space.sync:insert{2}
+ | ---
+ | - error: Failed to write to disk
+ | ...
+box.error.injection.set('ERRINJ_WAL_IO', false)
+ | ---
+ | - ok
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- [RFC, quorum commit] check behaviour with failure answer from a replica
+-- (ERRINJ_WAL_SYNC) during write, expected disconnect from the replication
+-- (gh-5123, set replication_synchro_quorum to 1).
+-- Testcase setup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Testcase body.
+box.space.sync:insert{1}
+ | ---
+ | - [1]
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.error.injection.set('ERRINJ_WAL_IO', true)
+ | ---
+ | - ok
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{2}
+ | ---
+ | - error: Quorum collection for a synchronous transaction is timed out
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.error.injection.set('ERRINJ_WAL_IO', false)
+ | ---
+ | - ok
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- Teardown.
+test_run:cmd('switch default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+test_run:cleanup_cluster()
+ | ---
+ | ...
+box.schema.user.revoke('guest', 'replication')
+ | ---
+ | ...
+box.cfg{                                                                        \
+    replication_synchro_quorum = orig_synchro_quorum,                           \
+    replication_synchro_timeout = orig_synchro_timeout,                         \
+}
+ | ---
+ | ...
+
+-- Setup an async cluster.
+box.schema.user.grant('guest', 'replication')
+ | ---
+ | ...
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=True, wait_load=True')
+ | ---
+ | - true
+ | ...
+
+-- [RFC, summary] switch async replica into sync one, expected
+-- success and data consistency on a leader and replica.
+-- Testcase setup.
+_ = box.schema.space.create('sync', {engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+box.space.sync:insert{1} -- success
+ | ---
+ | - [1]
+ | ...
+test_run:cmd('switch replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+-- Enable synchronous mode.
+s = box.space._space:get(box.space.sync.id)
+ | ---
+ | ...
+new_s = s:update({{'=', 6, {is_sync=true}}})
+ | ---
+ | ...
+box.space._space:replace(new_s)
+ | ---
+ | - [523, 1, 'sync', 'vinyl', 0, {'is_sync': true}, []]
+ | ...
+-- Space is in sync mode now.
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+box.space.sync:insert{2} -- success
+ | ---
+ | - [2]
+ | ...
+box.space.sync:insert{3} -- success
+ | ---
+ | - [3]
+ | ...
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+box.space.sync:insert{4} -- failure
+ | ---
+ | - error: Quorum collection for a synchronous transaction is timed out
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+box.space.sync:insert{5} -- success
+ | ---
+ | - [5]
+ | ...
+box.space.sync:select{} -- 1, 2, 3, 5
+ | ---
+ | - - [1]
+ |   - [2]
+ |   - [3]
+ |   - [5]
+ | ...
+test_run:cmd('switch replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{} -- 1, 2, 3, 5
+ | ---
+ | - - [1]
+ |   - [2]
+ |   - [3]
+ |   - [5]
+ | ...
+-- Testcase cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+
+-- Teardown.
+test_run:cmd('switch default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+test_run:cleanup_cluster()
+ | ---
+ | ...
+box.schema.user.revoke('guest', 'replication')
+ | ---
+ | ...
+box.cfg{                                                                        \
+    replication_synchro_quorum = orig_synchro_quorum,                           \
+    replication_synchro_timeout = orig_synchro_timeout,                         \
+}
+ | ---
+ | ...
diff --git a/test/replication/qsync_advanced.test.lua b/test/replication/qsync_advanced.test.lua
new file mode 100644
index 000000000..270fd494d
--- /dev/null
+++ b/test/replication/qsync_advanced.test.lua
@@ -0,0 +1,337 @@
+env = require('test_run')
+test_run = env.new()
+engine = test_run:get_cfg('engine')
+fiber = require('fiber')
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+
+NUM_INSTANCES = 2
+BROKEN_QUORUM = NUM_INSTANCES + 1
+
+test_run:cmd("setopt delimiter ';'")
+disable_sync_mode = function()
+    local s = box.space._space:get(box.space.sync.id)
+    local new_s = s:update({{'=', 6, {is_sync=false}}})
+    box.space._space:replace(new_s)
+end;
+test_run:cmd("setopt delimiter ''");
+
+box.schema.user.grant('guest', 'replication')
+
+-- Setup an async cluster with two instances.
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=True, wait_load=True')
+
+-- Successful write.
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.space.sync:insert{1} -- success
+test_run:cmd('switch replica')
+box.space.sync:select{} -- 1
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- Unsuccessfull write.
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.space.sync:insert{1}
+test_run:switch('replica')
+box.space.sync:select{} -- none
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- Updated replication_synchro_quorum doesn't affect existed tx.
+-- Testcase setup.
+test_run:switch('default')
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+OP_TIMEOUT = 5
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=OP_TIMEOUT}
+test_run:cmd("setopt delimiter ';'")
+_ = fiber.create(function()
+    box.space.sync:insert{1}
+end);
+test_run:cmd("setopt delimiter ''");
+box.cfg{replication_synchro_quorum=NUM_INSTANCES}
+fiber.sleep(OP_TIMEOUT) -- to make sure replication_synchro_timeout is exceeded
+box.space.sync:select{} -- none
+test_run:switch('replica')
+box.space.sync:select{} -- none
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- [RFC, quorum commit] attempt to write multiple transactions, expected the
+-- same order as on client in case of achieved quorum.
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.space.sync:insert{1}
+box.space.sync:insert{2}
+box.space.sync:insert{3}
+box.space.sync:select{} -- 1, 2, 3
+test_run:switch('replica')
+box.space.sync:select{} -- 1, 2, 3
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- Synchro timeout is not bigger than replication_synchro_timeout value.
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=orig_synchro_timeout}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+start = os.time()
+box.space.sync:insert{1}
+(os.time() - start) == box.cfg.replication_synchro_timeout -- true
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- replication_synchro_quorum
+test_run:switch('default')
+INT_MIN = -2147483648
+INT_MAX = 2147483648
+box.cfg{replication_synchro_quorum=INT_MAX} -- error
+box.cfg.replication_synchro_quorum -- old value
+box.cfg{replication_synchro_quorum=INT_MIN} -- error
+box.cfg.replication_synchro_quorum -- old value
+
+-- replication_synchro_timeout
+test_run:switch('default')
+DOUBLE_MAX = 9007199254740992
+box.cfg{replication_synchro_timeout=DOUBLE_MAX}
+box.cfg.replication_synchro_timeout -- DOUBLE_MAX
+box.cfg{replication_synchro_timeout=DOUBLE_MAX+1}
+box.cfg.replication_synchro_timeout -- DOUBLE_MAX
+box.cfg{replication_synchro_timeout=-1} -- error
+box.cfg.replication_synchro_timeout -- old value
+box.cfg{replication_synchro_timeout=0} -- error
+box.cfg.replication_synchro_timeout -- old value
+
+-- TX is in synchronous replication.
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.begin() box.space.sync:insert({1}) box.commit()
+box.begin() box.space.sync:insert({2}) box.commit()
+-- Testcase cleanup.
+box.space.sync:drop()
+
+-- [RFC, summary] switch sync replicas into async ones, expected success and
+-- data consistency on a leader and replicas.
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.space.sync:insert{1}
+box.space.sync:select{} -- 1
+test_run:switch('replica')
+box.space.sync:select{} -- 1
+test_run:switch('default')
+-- Disable synchronous mode.
+disable_sync_mode()
+-- Space is in async mode now.
+box.cfg{replication_synchro_quorum=NUM_INSTANCES}
+box.space.sync:insert{2} -- success
+box.space.sync:insert{3} -- success
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM}
+box.space.sync:insert{4} -- success
+box.cfg{replication_synchro_quorum=NUM_INSTANCES}
+box.space.sync:insert{5} -- success
+box.space.sync:select{} -- 1, 2, 3, 4, 5
+test_run:cmd('switch replica')
+box.space.sync:select{} -- 1, 2, 3, 4, 5
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- (FLAKY) [RFC, Synchronous replication enabling] "As soon as last operation of
+-- synchronous transaction appeared in leader's WAL, it will cause all
+-- following transactions - no matter if they are synchronous or not - wait for
+-- the quorum. In case quorum is not achieved the 'rollback' operation will
+-- cause rollback of all transactions after the synchronous one."
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.space.sync:insert{1}
+box.space.sync:select{} -- 1
+test_run:switch('replica')
+box.space.sync:select{} -- 1
+test_run:switch('default')
+-- OP_TIMEOUT should be enough to make sync operation, disable
+-- sync mode and make an async operation
+OP_TIMEOUT = 10
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=OP_TIMEOUT}
+test_run:cmd("setopt delimiter ';'")
+_ = fiber.create(function()
+    box.space.sync:insert{2}
+end);
+test_run:cmd("setopt delimiter ''");
+-- Disable synchronous mode.
+disable_sync_mode()
+-- Space is in async mode now.
+box.space.sync:insert{3} -- async operation must wait sync one
+fiber.sleep(OP_TIMEOUT + 1)
+box.space.sync:select{} -- 1
+test_run:cmd('switch replica')
+box.space.sync:select{} -- 1
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- Warn user when setting `replication_synchro_quorum` to a value
+-- greater than number of instances in a cluster, see gh-5122.
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM} -- warning
+
+-- [RFC, summary] switch from leader to replica and vice versa, expected
+-- success and data consistency on a leader and replicas (gh-5124).
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.space.sync:insert{1}
+box.space.sync:select{} -- 1
+test_run:switch('replica')
+box.space.sync:select{} -- 1
+box.cfg{read_only=false} -- promote replica to master
+test_run:switch('default')
+box.cfg{read_only=true} -- demote master to replica
+test_run:switch('replica')
+box.space.sync:insert{2}
+box.space.sync:select{} -- 1, 2
+test_run:switch('default')
+box.space.sync:select{} -- 1, 2
+-- Revert cluster configuration.
+test_run:switch('default')
+box.cfg{read_only=false}
+test_run:switch('replica')
+box.cfg{read_only=true}
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- Check behaviour with failed write to WAL on master (ERRINJ_WAL_IO).
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.space.sync:insert{1}
+box.space.sync:select{} -- 1
+box.error.injection.set('ERRINJ_WAL_IO', true)
+box.space.sync:insert{2}
+box.error.injection.set('ERRINJ_WAL_IO', false)
+box.space.sync:select{} -- 1
+test_run:switch('replica')
+box.space.sync:select{} -- 1
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- [RFC, quorum commit] check behaviour with failure answer from a replica
+-- (ERRINJ_WAL_SYNC) during write, expected disconnect from the replication
+-- (gh-5123, set replication_synchro_quorum to 1).
+-- Testcase setup.
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+-- Testcase body.
+box.space.sync:insert{1}
+box.space.sync:select{} -- 1
+test_run:switch('replica')
+box.error.injection.set('ERRINJ_WAL_IO', true)
+test_run:switch('default')
+box.space.sync:insert{2}
+test_run:switch('replica')
+box.error.injection.set('ERRINJ_WAL_IO', false)
+box.space.sync:select{} -- 1
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- Teardown.
+test_run:cmd('switch default')
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+test_run:cleanup_cluster()
+box.schema.user.revoke('guest', 'replication')
+box.cfg{                                                                        \
+    replication_synchro_quorum = orig_synchro_quorum,                           \
+    replication_synchro_timeout = orig_synchro_timeout,                         \
+}
+
+-- Setup an async cluster.
+box.schema.user.grant('guest', 'replication')
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=True, wait_load=True')
+
+-- [RFC, summary] switch async replica into sync one, expected
+-- success and data consistency on a leader and replica.
+-- Testcase setup.
+_ = box.schema.space.create('sync', {engine=engine})
+_ = box.space.sync:create_index('pk')
+box.space.sync:insert{1} -- success
+test_run:cmd('switch replica')
+box.space.sync:select{} -- 1
+test_run:switch('default')
+-- Enable synchronous mode.
+s = box.space._space:get(box.space.sync.id)
+new_s = s:update({{'=', 6, {is_sync=true}}})
+box.space._space:replace(new_s)
+-- Space is in sync mode now.
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.space.sync:insert{2} -- success
+box.space.sync:insert{3} -- success
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=0.1}
+box.space.sync:insert{4} -- failure
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.space.sync:insert{5} -- success
+box.space.sync:select{} -- 1, 2, 3, 5
+test_run:cmd('switch replica')
+box.space.sync:select{} -- 1, 2, 3, 5
+-- Testcase cleanup.
+test_run:switch('default')
+box.space.sync:drop()
+
+-- Teardown.
+test_run:cmd('switch default')
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+test_run:cleanup_cluster()
+box.schema.user.revoke('guest', 'replication')
+box.cfg{                                                                        \
+    replication_synchro_quorum = orig_synchro_quorum,                           \
+    replication_synchro_timeout = orig_synchro_timeout,                         \
+}
-- 
2.26.2

  parent reply	other threads:[~2020-07-02 21:15 UTC|newest]

Thread overview: 68+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <cover.1593723973.git.sergeyb@tarantool.org>
2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 00/19] Sync replication Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 01/19] replication: introduce space.is_sync option Vladislav Shpilevoy
2020-06-30 23:00     ` Vladislav Shpilevoy
2020-07-01 15:55       ` Sergey Ostanevich
2020-07-01 23:46         ` Vladislav Shpilevoy
2020-07-02  8:25       ` Serge Petrenko
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 10/19] txn_limbo: add ROLLBACK processing Vladislav Shpilevoy
2020-07-05 15:29     ` Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 11/19] box: rework local_recovery to use async txn_commit Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 12/19] replication: support ROLLBACK and CONFIRM during recovery Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 13/19] replication: add test for synchro CONFIRM/ROLLBACK Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 14/19] applier: remove writer_cond Vladislav Shpilevoy
2020-07-02  9:13     ` Serge Petrenko
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 15/19] applier: send heartbeat not only on commit, but on any write Vladislav Shpilevoy
2020-07-01 23:55     ` Vladislav Shpilevoy
2020-07-03 12:23     ` Serge Petrenko
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 16/19] txn_limbo: add diag_set in txn_limbo_wait_confirm Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 17/19] replication: delay initial join until confirmation Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 18/19] replication: only send confirmed data during final join Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 19/19] replication: block async transactions when not empty limbo Vladislav Shpilevoy
2020-07-01 17:12     ` Sergey Ostanevich
2020-07-01 23:47       ` Vladislav Shpilevoy
2020-07-03 12:28     ` Serge Petrenko
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 02/19] replication: introduce replication_synchro_* cfg options Vladislav Shpilevoy
2020-07-01 16:05     ` Sergey Ostanevich
2020-07-01 23:46       ` Vladislav Shpilevoy
2020-07-02  8:29     ` Serge Petrenko
2020-07-02 23:36       ` Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 03/19] txn: add TXN_WAIT_ACK flag Vladislav Shpilevoy
2020-07-01 17:14     ` Sergey Ostanevich
2020-07-01 23:46     ` Vladislav Shpilevoy
2020-07-02  8:30     ` Serge Petrenko
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 04/19] replication: make sync transactions wait quorum Vladislav Shpilevoy
2020-06-30 23:00     ` Vladislav Shpilevoy
2020-07-02  8:48     ` Serge Petrenko
2020-07-03 21:16       ` Vladislav Shpilevoy
2020-07-05 16:05     ` Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 05/19] xrow: introduce CONFIRM and ROLLBACK entries Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 06/19] txn: introduce various reasons for txn rollback Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 07/19] replication: write and read CONFIRM entries Vladislav Shpilevoy
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 08/19] replication: add support of qsync to the snapshot machinery Vladislav Shpilevoy
2020-07-02  8:52     ` Serge Petrenko
2020-07-08 11:43     ` Leonid Vasiliev
2020-06-29 23:15   ` [Tarantool-patches] [PATCH v2 09/19] txn_limbo: add timeout when waiting for acks Vladislav Shpilevoy
2020-06-29 23:22   ` [Tarantool-patches] [PATCH v2 00/19] Sync replication Vladislav Shpilevoy
2020-06-30 23:00   ` [Tarantool-patches] [PATCH v2 20/19] replication: add test for quorum 1 Vladislav Shpilevoy
2020-07-03 12:32     ` Serge Petrenko
2020-07-02 21:13   ` [Tarantool-patches] [PATCH 1/4] replication: regression test on gh-5119 [not fixed] sergeyb
2020-07-02 21:13   ` sergeyb [this message]
2020-07-02 22:46     ` [Tarantool-patches] [PATCH 2/4] replication: add advanced tests for sync replication Sergey Bronnikov
2020-07-02 23:20     ` Vladislav Shpilevoy
2020-07-06 12:30       ` Sergey Bronnikov
2020-07-06 23:31     ` Vladislav Shpilevoy
2020-07-07 12:12       ` Sergey Bronnikov
2020-07-07 20:57         ` Vladislav Shpilevoy
2020-07-08 12:07           ` Sergey Bronnikov
2020-07-08 22:13             ` Vladislav Shpilevoy
2020-07-09  9:39               ` Sergey Bronnikov
2020-07-02 21:13   ` [Tarantool-patches] [PATCH 3/4] replication: add tests for sync replication with anon replica sergeyb
2020-07-06 23:31     ` Vladislav Shpilevoy
2020-07-02 21:13   ` [Tarantool-patches] [PATCH 4/4] replication: add tests for sync replication with snapshots sergeyb
2020-07-02 22:46     ` Sergey Bronnikov
2020-07-02 23:20     ` Vladislav Shpilevoy
2020-07-06 23:31     ` Vladislav Shpilevoy
2020-07-07 16:00       ` Sergey Bronnikov
2020-07-06 23:31   ` [Tarantool-patches] [PATCH] Add new error injection constant ERRINJ_SYNC_TIMEOUT Vladislav Shpilevoy
2020-07-10  0:50   ` [Tarantool-patches] [PATCH v2 00/19] Sync replication Vladislav Shpilevoy
2020-07-10  7:40   ` Kirill Yukhin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=012c8c196396cf963a0aa1f2d23814ff84b81cfb.1593723973.git.sergeyb@tarantool.org \
    --to=sergeyb@tarantool.org \
    --cc=gorcunov@gmail.com \
    --cc=lvasiliev@tarantool.org \
    --cc=sergepetrenko@tarantool.org \
    --cc=tarantool-patches@dev.tarantool.org \
    --cc=v.shpilevoy@tarantool.org \
    --subject='Re: [Tarantool-patches] [PATCH 2/4] replication: add advanced tests for sync replication' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox