From: sergeyb@tarantool.org To: tarantool-patches@dev.tarantool.org, v.shpilevoy@tarantool.org, sergepetrenko@tarantool.org, gorcunov@gmail.com, lvasiliev@tarantool.org Subject: [Tarantool-patches] [PATCH 2/4] replication: add advanced tests for sync replication Date: Fri, 3 Jul 2020 00:13:35 +0300 [thread overview] Message-ID: <012c8c196396cf963a0aa1f2d23814ff84b81cfb.1593723973.git.sergeyb@tarantool.org> (raw) In-Reply-To: <cover.1593472477.git.v.shpilevoy@tarantool.org> From: Sergey Bronnikov <sergeyb@tarantool.org> Part of #5055 --- test/replication/qsync_advanced.result | 939 +++++++++++++++++++++++ test/replication/qsync_advanced.test.lua | 337 ++++++++ 2 files changed, 1276 insertions(+) create mode 100644 test/replication/qsync_advanced.result create mode 100644 test/replication/qsync_advanced.test.lua diff --git a/test/replication/qsync_advanced.result b/test/replication/qsync_advanced.result new file mode 100644 index 000000000..fa94c8339 --- /dev/null +++ b/test/replication/qsync_advanced.result @@ -0,0 +1,939 @@ +-- test-run result file version 2 +env = require('test_run') + | --- + | ... +test_run = env.new() + | --- + | ... +engine = test_run:get_cfg('engine') + | --- + | ... +fiber = require('fiber') + | --- + | ... + +orig_synchro_quorum = box.cfg.replication_synchro_quorum + | --- + | ... +orig_synchro_timeout = box.cfg.replication_synchro_timeout + | --- + | ... + +NUM_INSTANCES = 2 + | --- + | ... +BROKEN_QUORUM = NUM_INSTANCES + 1 + | --- + | ... + +test_run:cmd("setopt delimiter ';'") + | --- + | - true + | ... +disable_sync_mode = function() + local s = box.space._space:get(box.space.sync.id) + local new_s = s:update({{'=', 6, {is_sync=false}}}) + box.space._space:replace(new_s) +end; + | --- + | ... +test_run:cmd("setopt delimiter ''"); + | --- + | - true + | ... + +box.schema.user.grant('guest', 'replication') + | --- + | ... + +-- Setup an async cluster with two instances. 
+test_run:cmd('create server replica with rpl_master=default,\ + script="replication/replica.lua"') + | --- + | - true + | ... +test_run:cmd('start server replica with wait=True, wait_load=True') + | --- + | - true + | ... + +-- Successful write. +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.space.sync:insert{1} -- success + | --- + | - [1] + | ... +test_run:cmd('switch replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- Unsuccessfull write. +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=0.1} + | --- + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.space.sync:insert{1} + | --- + | - error: Quorum collection for a synchronous transaction is timed out + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.space.sync:select{} -- none + | --- + | - [] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- Updated replication_synchro_quorum doesn't affect existed tx. +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +OP_TIMEOUT = 5 + | --- + | ... 
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=OP_TIMEOUT} + | --- + | ... +test_run:cmd("setopt delimiter ';'") + | --- + | - true + | ... +_ = fiber.create(function() + box.space.sync:insert{1} +end); + | --- + | ... +test_run:cmd("setopt delimiter ''"); + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES} + | --- + | ... +fiber.sleep(OP_TIMEOUT) -- to make sure replication_synchro_timeout is exceeded + | --- + | ... +box.space.sync:select{} -- none + | --- + | - [] + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.space.sync:select{} -- none + | --- + | - [] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- [RFC, quorum commit] attempt to write multiple transactions, expected the +-- same order as on client in case of achieved quorum. +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.space.sync:insert{1} + | --- + | - [1] + | ... +box.space.sync:insert{2} + | --- + | - [2] + | ... +box.space.sync:insert{3} + | --- + | - [3] + | ... +box.space.sync:select{} -- 1, 2, 3 + | --- + | - - [1] + | - [2] + | - [3] + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1, 2, 3 + | --- + | - - [1] + | - [2] + | - [3] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- Synchro timeout is not bigger than replication_synchro_timeout value. +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=orig_synchro_timeout} + | --- + | ... 
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +start = os.time() + | --- + | ... +box.space.sync:insert{1} + | --- + | - error: Quorum collection for a synchronous transaction is timed out + | ... +(os.time() - start) == box.cfg.replication_synchro_timeout -- true + | --- + | - true + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- replication_synchro_quorum +test_run:switch('default') + | --- + | - true + | ... +INT_MIN = -2147483648 + | --- + | ... +INT_MAX = 2147483648 + | --- + | ... +box.cfg{replication_synchro_quorum=INT_MAX} -- error + | --- + | - error: 'Incorrect value for option ''replication_synchro_quorum'': the value must + | be greater than zero and less than maximal number of replicas' + | ... +box.cfg.replication_synchro_quorum -- old value + | --- + | - 3 + | ... +box.cfg{replication_synchro_quorum=INT_MIN} -- error + | --- + | - error: 'Incorrect value for option ''replication_synchro_quorum'': the value must + | be greater than zero and less than maximal number of replicas' + | ... +box.cfg.replication_synchro_quorum -- old value + | --- + | - 3 + | ... + +-- replication_synchro_timeout +test_run:switch('default') + | --- + | - true + | ... +DOUBLE_MAX = 9007199254740992 + | --- + | ... +box.cfg{replication_synchro_timeout=DOUBLE_MAX} + | --- + | ... +box.cfg.replication_synchro_timeout -- DOUBLE_MAX + | --- + | - 9007199254740992 + | ... +box.cfg{replication_synchro_timeout=DOUBLE_MAX+1} + | --- + | ... +box.cfg.replication_synchro_timeout -- DOUBLE_MAX + | --- + | - 9007199254740992 + | ... +box.cfg{replication_synchro_timeout=-1} -- error + | --- + | - error: 'Incorrect value for option ''replication_synchro_timeout'': the value must + | be greater than zero' + | ... +box.cfg.replication_synchro_timeout -- old value + | --- + | - 9007199254740992 + | ... 
+box.cfg{replication_synchro_timeout=0} -- error + | --- + | - error: 'Incorrect value for option ''replication_synchro_timeout'': the value must + | be greater than zero' + | ... +box.cfg.replication_synchro_timeout -- old value + | --- + | - 9007199254740992 + | ... + +-- TX is in synchronous replication. +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.begin() box.space.sync:insert({1}) box.commit() + | --- + | ... +box.begin() box.space.sync:insert({2}) box.commit() + | --- + | ... +-- Testcase cleanup. +box.space.sync:drop() + | --- + | ... + +-- [RFC, summary] switch sync replicas into async ones, expected success and +-- data consistency on a leader and replicas. +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.space.sync:insert{1} + | --- + | - [1] + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:switch('default') + | --- + | - true + | ... +-- Disable synchronous mode. +disable_sync_mode() + | --- + | ... +-- Space is in async mode now. +box.cfg{replication_synchro_quorum=NUM_INSTANCES} + | --- + | ... +box.space.sync:insert{2} -- success + | --- + | - [2] + | ... +box.space.sync:insert{3} -- success + | --- + | - [3] + | ... +box.cfg{replication_synchro_quorum=BROKEN_QUORUM} + | --- + | ... 
+box.space.sync:insert{4} -- success + | --- + | - [4] + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES} + | --- + | ... +box.space.sync:insert{5} -- success + | --- + | - [5] + | ... +box.space.sync:select{} -- 1, 2, 3, 4, 5 + | --- + | - - [1] + | - [2] + | - [3] + | - [4] + | - [5] + | ... +test_run:cmd('switch replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1, 2, 3, 4, 5 + | --- + | - - [1] + | - [2] + | - [3] + | - [4] + | - [5] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- (FLAKY) [RFC, Synchronous replication enabling] "As soon as last operation of +-- synchronous transaction appeared in leader's WAL, it will cause all +-- following transactions - no matter if they are synchronous or not - wait for +-- the quorum. In case quorum is not achieved the 'rollback' operation will +-- cause rollback of all transactions after the synchronous one." +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.space.sync:insert{1} + | --- + | - [1] + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:switch('default') + | --- + | - true + | ... +-- OP_TIMEOUT should be enough to make sync operation, disable +-- sync mode and make an async operation +OP_TIMEOUT = 10 + | --- + | ... +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=OP_TIMEOUT} + | --- + | ... +test_run:cmd("setopt delimiter ';'") + | --- + | - true + | ... +_ = fiber.create(function() + box.space.sync:insert{2} +end); + | --- + | ... 
+test_run:cmd("setopt delimiter ''"); + | --- + | - true + | ... +-- Disable synchronous mode. +disable_sync_mode() + | --- + | - error: A rollback for a synchronous transaction is received + | ... +-- Space is in async mode now. +box.space.sync:insert{3} -- async operation must wait sync one + | --- + | - error: Quorum collection for a synchronous transaction is timed out + | ... +fiber.sleep(OP_TIMEOUT + 1) + | --- + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:cmd('switch replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- Warn user when setting `replication_synchro_quorum` to a value +-- greater than number of instances in a cluster, see gh-5122. +box.cfg{replication_synchro_quorum=BROKEN_QUORUM} -- warning + | --- + | ... + +-- [RFC, summary] switch from leader to replica and vice versa, expected +-- success and data consistency on a leader and replicas (gh-5124). +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.space.sync:insert{1} + | --- + | - [1] + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +box.cfg{read_only=false} -- promote replica to master + | --- + | ... +test_run:switch('default') + | --- + | - true + | ... +box.cfg{read_only=true} -- demote master to replica + | --- + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.space.sync:insert{2} + | --- + | - [2] + | ... 
+box.space.sync:select{} -- 1, 2 + | --- + | - - [1] + | - [2] + | ... +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:select{} -- 1, 2 + | --- + | - - [1] + | ... +-- Revert cluster configuration. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{read_only=false} + | --- + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.cfg{read_only=true} + | --- + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- Check behaviour with failed write to WAL on master (ERRINJ_WAL_IO). +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.space.sync:insert{1} + | --- + | - [1] + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +box.error.injection.set('ERRINJ_WAL_IO', true) + | --- + | - ok + | ... +box.space.sync:insert{2} + | --- + | - error: Failed to write to disk + | ... +box.error.injection.set('ERRINJ_WAL_IO', false) + | --- + | - ok + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- [RFC, quorum commit] check behaviour with failure answer from a replica +-- (ERRINJ_WAL_SYNC) during write, expected disconnect from the replication +-- (gh-5123, set replication_synchro_quorum to 1). +-- Testcase setup. +test_run:switch('default') + | --- + | - true + | ... +box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.1} + | --- + | ... 
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +-- Testcase body. +box.space.sync:insert{1} + | --- + | - [1] + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.error.injection.set('ERRINJ_WAL_IO', true) + | --- + | - ok + | ... +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:insert{2} + | --- + | - error: Quorum collection for a synchronous transaction is timed out + | ... +test_run:switch('replica') + | --- + | - true + | ... +box.error.injection.set('ERRINJ_WAL_IO', false) + | --- + | - ok + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- Teardown. +test_run:cmd('switch default') + | --- + | - true + | ... +test_run:cmd('stop server replica') + | --- + | - true + | ... +test_run:cmd('delete server replica') + | --- + | - true + | ... +test_run:cleanup_cluster() + | --- + | ... +box.schema.user.revoke('guest', 'replication') + | --- + | ... +box.cfg{ \ + replication_synchro_quorum = orig_synchro_quorum, \ + replication_synchro_timeout = orig_synchro_timeout, \ +} + | --- + | ... + +-- Setup an async cluster. +box.schema.user.grant('guest', 'replication') + | --- + | ... +test_run:cmd('create server replica with rpl_master=default,\ + script="replication/replica.lua"') + | --- + | - true + | ... +test_run:cmd('start server replica with wait=True, wait_load=True') + | --- + | - true + | ... + +-- [RFC, summary] switch async replica into sync one, expected +-- success and data consistency on a leader and replica. +-- Testcase setup. +_ = box.schema.space.create('sync', {engine=engine}) + | --- + | ... +_ = box.space.sync:create_index('pk') + | --- + | ... +box.space.sync:insert{1} -- success + | --- + | - [1] + | ... 
+test_run:cmd('switch replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1 + | --- + | - - [1] + | ... +test_run:switch('default') + | --- + | - true + | ... +-- Enable synchronous mode. +s = box.space._space:get(box.space.sync.id) + | --- + | ... +new_s = s:update({{'=', 6, {is_sync=true}}}) + | --- + | ... +box.space._space:replace(new_s) + | --- + | - [523, 1, 'sync', 'vinyl', 0, {'is_sync': true}, []] + | ... +-- Space is in sync mode now. +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +box.space.sync:insert{2} -- success + | --- + | - [2] + | ... +box.space.sync:insert{3} -- success + | --- + | - [3] + | ... +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=0.1} + | --- + | ... +box.space.sync:insert{4} -- failure + | --- + | - error: Quorum collection for a synchronous transaction is timed out + | ... +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} + | --- + | ... +box.space.sync:insert{5} -- success + | --- + | - [5] + | ... +box.space.sync:select{} -- 1, 2, 3, 5 + | --- + | - - [1] + | - [2] + | - [3] + | - [5] + | ... +test_run:cmd('switch replica') + | --- + | - true + | ... +box.space.sync:select{} -- 1, 2, 3, 5 + | --- + | - - [1] + | - [2] + | - [3] + | - [5] + | ... +-- Testcase cleanup. +test_run:switch('default') + | --- + | - true + | ... +box.space.sync:drop() + | --- + | ... + +-- Teardown. +test_run:cmd('switch default') + | --- + | - true + | ... +test_run:cmd('stop server replica') + | --- + | - true + | ... +test_run:cmd('delete server replica') + | --- + | - true + | ... +test_run:cleanup_cluster() + | --- + | ... +box.schema.user.revoke('guest', 'replication') + | --- + | ... +box.cfg{ \ + replication_synchro_quorum = orig_synchro_quorum, \ + replication_synchro_timeout = orig_synchro_timeout, \ +} + | --- + | ... 
diff --git a/test/replication/qsync_advanced.test.lua b/test/replication/qsync_advanced.test.lua new file mode 100644 index 000000000..270fd494d --- /dev/null +++ b/test/replication/qsync_advanced.test.lua @@ -0,0 +1,337 @@ +env = require('test_run') +test_run = env.new() +engine = test_run:get_cfg('engine') +fiber = require('fiber') + +orig_synchro_quorum = box.cfg.replication_synchro_quorum +orig_synchro_timeout = box.cfg.replication_synchro_timeout + +NUM_INSTANCES = 2 +BROKEN_QUORUM = NUM_INSTANCES + 1 + +test_run:cmd("setopt delimiter ';'") +disable_sync_mode = function() + local s = box.space._space:get(box.space.sync.id) + local new_s = s:update({{'=', 6, {is_sync=false}}}) + box.space._space:replace(new_s) +end; +test_run:cmd("setopt delimiter ''"); + +box.schema.user.grant('guest', 'replication') + +-- Setup an async cluster with two instances. +test_run:cmd('create server replica with rpl_master=default,\ + script="replication/replica.lua"') +test_run:cmd('start server replica with wait=True, wait_load=True') + +-- Successful write. +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.space.sync:insert{1} -- success +test_run:cmd('switch replica') +box.space.sync:select{} -- 1 +-- Testcase cleanup. +test_run:switch('default') +box.space.sync:drop() + +-- Unsuccessfull write. +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.space.sync:insert{1} +test_run:switch('replica') +box.space.sync:select{} -- none +-- Testcase cleanup. 
+test_run:switch('default') +box.space.sync:drop() + +-- Updated replication_synchro_quorum doesn't affect existed tx. +-- Testcase setup. +test_run:switch('default') +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +OP_TIMEOUT = 5 +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=OP_TIMEOUT} +test_run:cmd("setopt delimiter ';'") +_ = fiber.create(function() + box.space.sync:insert{1} +end); +test_run:cmd("setopt delimiter ''"); +box.cfg{replication_synchro_quorum=NUM_INSTANCES} +fiber.sleep(OP_TIMEOUT) -- to make sure replication_synchro_timeout is exceeded +box.space.sync:select{} -- none +test_run:switch('replica') +box.space.sync:select{} -- none +-- Testcase cleanup. +test_run:switch('default') +box.space.sync:drop() + +-- [RFC, quorum commit] attempt to write multiple transactions, expected the +-- same order as on client in case of achieved quorum. +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.space.sync:insert{1} +box.space.sync:insert{2} +box.space.sync:insert{3} +box.space.sync:select{} -- 1, 2, 3 +test_run:switch('replica') +box.space.sync:select{} -- 1, 2, 3 +-- Testcase cleanup. +test_run:switch('default') +box.space.sync:drop() + +-- Synchro timeout is not bigger than replication_synchro_timeout value. +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=orig_synchro_timeout} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +start = os.time() +box.space.sync:insert{1} +(os.time() - start) == box.cfg.replication_synchro_timeout -- true +-- Testcase cleanup. 
+test_run:switch('default') +box.space.sync:drop() + +-- replication_synchro_quorum +test_run:switch('default') +INT_MIN = -2147483648 +INT_MAX = 2147483648 +box.cfg{replication_synchro_quorum=INT_MAX} -- error +box.cfg.replication_synchro_quorum -- old value +box.cfg{replication_synchro_quorum=INT_MIN} -- error +box.cfg.replication_synchro_quorum -- old value + +-- replication_synchro_timeout +test_run:switch('default') +DOUBLE_MAX = 9007199254740992 +box.cfg{replication_synchro_timeout=DOUBLE_MAX} +box.cfg.replication_synchro_timeout -- DOUBLE_MAX +box.cfg{replication_synchro_timeout=DOUBLE_MAX+1} +box.cfg.replication_synchro_timeout -- DOUBLE_MAX +box.cfg{replication_synchro_timeout=-1} -- error +box.cfg.replication_synchro_timeout -- old value +box.cfg{replication_synchro_timeout=0} -- error +box.cfg.replication_synchro_timeout -- old value + +-- TX is in synchronous replication. +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.begin() box.space.sync:insert({1}) box.commit() +box.begin() box.space.sync:insert({2}) box.commit() +-- Testcase cleanup. +box.space.sync:drop() + +-- [RFC, summary] switch sync replicas into async ones, expected success and +-- data consistency on a leader and replicas. +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.space.sync:insert{1} +box.space.sync:select{} -- 1 +test_run:switch('replica') +box.space.sync:select{} -- 1 +test_run:switch('default') +-- Disable synchronous mode. +disable_sync_mode() +-- Space is in async mode now. 
+box.cfg{replication_synchro_quorum=NUM_INSTANCES} +box.space.sync:insert{2} -- success +box.space.sync:insert{3} -- success +box.cfg{replication_synchro_quorum=BROKEN_QUORUM} +box.space.sync:insert{4} -- success +box.cfg{replication_synchro_quorum=NUM_INSTANCES} +box.space.sync:insert{5} -- success +box.space.sync:select{} -- 1, 2, 3, 4, 5 +test_run:cmd('switch replica') +box.space.sync:select{} -- 1, 2, 3, 4, 5 +-- Testcase cleanup. +test_run:switch('default') +box.space.sync:drop() + +-- (FLAKY) [RFC, Synchronous replication enabling] "As soon as last operation of +-- synchronous transaction appeared in leader's WAL, it will cause all +-- following transactions - no matter if they are synchronous or not - wait for +-- the quorum. In case quorum is not achieved the 'rollback' operation will +-- cause rollback of all transactions after the synchronous one." +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.space.sync:insert{1} +box.space.sync:select{} -- 1 +test_run:switch('replica') +box.space.sync:select{} -- 1 +test_run:switch('default') +-- OP_TIMEOUT should be enough to make sync operation, disable +-- sync mode and make an async operation +OP_TIMEOUT = 10 +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=OP_TIMEOUT} +test_run:cmd("setopt delimiter ';'") +_ = fiber.create(function() + box.space.sync:insert{2} +end); +test_run:cmd("setopt delimiter ''"); +-- Disable synchronous mode. +disable_sync_mode() +-- Space is in async mode now. +box.space.sync:insert{3} -- async operation must wait sync one +fiber.sleep(OP_TIMEOUT + 1) +box.space.sync:select{} -- 1 +test_run:cmd('switch replica') +box.space.sync:select{} -- 1 +-- Testcase cleanup. 
+test_run:switch('default') +box.space.sync:drop() + +-- Warn user when setting `replication_synchro_quorum` to a value +-- greater than number of instances in a cluster, see gh-5122. +box.cfg{replication_synchro_quorum=BROKEN_QUORUM} -- warning + +-- [RFC, summary] switch from leader to replica and vice versa, expected +-- success and data consistency on a leader and replicas (gh-5124). +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.space.sync:insert{1} +box.space.sync:select{} -- 1 +test_run:switch('replica') +box.space.sync:select{} -- 1 +box.cfg{read_only=false} -- promote replica to master +test_run:switch('default') +box.cfg{read_only=true} -- demote master to replica +test_run:switch('replica') +box.space.sync:insert{2} +box.space.sync:select{} -- 1, 2 +test_run:switch('default') +box.space.sync:select{} -- 1, 2 +-- Revert cluster configuration. +test_run:switch('default') +box.cfg{read_only=false} +test_run:switch('replica') +box.cfg{read_only=true} +-- Testcase cleanup. +test_run:switch('default') +box.space.sync:drop() + +-- Check behaviour with failed write to WAL on master (ERRINJ_WAL_IO). +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.space.sync:insert{1} +box.space.sync:select{} -- 1 +box.error.injection.set('ERRINJ_WAL_IO', true) +box.space.sync:insert{2} +box.error.injection.set('ERRINJ_WAL_IO', false) +box.space.sync:select{} -- 1 +test_run:switch('replica') +box.space.sync:select{} -- 1 +-- Testcase cleanup. 
+test_run:switch('default') +box.space.sync:drop() + +-- [RFC, quorum commit] check behaviour with failure answer from a replica +-- (ERRINJ_WAL_SYNC) during write, expected disconnect from the replication +-- (gh-5123, set replication_synchro_quorum to 1). +-- Testcase setup. +test_run:switch('default') +box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.1} +_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +_ = box.space.sync:create_index('pk') +-- Testcase body. +box.space.sync:insert{1} +box.space.sync:select{} -- 1 +test_run:switch('replica') +box.error.injection.set('ERRINJ_WAL_IO', true) +test_run:switch('default') +box.space.sync:insert{2} +test_run:switch('replica') +box.error.injection.set('ERRINJ_WAL_IO', false) +box.space.sync:select{} -- 1 +-- Testcase cleanup. +test_run:switch('default') +box.space.sync:drop() + +-- Teardown. +test_run:cmd('switch default') +test_run:cmd('stop server replica') +test_run:cmd('delete server replica') +test_run:cleanup_cluster() +box.schema.user.revoke('guest', 'replication') +box.cfg{ \ + replication_synchro_quorum = orig_synchro_quorum, \ + replication_synchro_timeout = orig_synchro_timeout, \ +} + +-- Setup an async cluster. +box.schema.user.grant('guest', 'replication') +test_run:cmd('create server replica with rpl_master=default,\ + script="replication/replica.lua"') +test_run:cmd('start server replica with wait=True, wait_load=True') + +-- [RFC, summary] switch async replica into sync one, expected +-- success and data consistency on a leader and replica. +-- Testcase setup. +_ = box.schema.space.create('sync', {engine=engine}) +_ = box.space.sync:create_index('pk') +box.space.sync:insert{1} -- success +test_run:cmd('switch replica') +box.space.sync:select{} -- 1 +test_run:switch('default') +-- Enable synchronous mode. +s = box.space._space:get(box.space.sync.id) +new_s = s:update({{'=', 6, {is_sync=true}}}) +box.space._space:replace(new_s) +-- Space is in sync mode now. 
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.space.sync:insert{2} -- success +box.space.sync:insert{3} -- success +box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=0.1} +box.space.sync:insert{4} -- failure +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.space.sync:insert{5} -- success +box.space.sync:select{} -- 1, 2, 3, 5 +test_run:cmd('switch replica') +box.space.sync:select{} -- 1, 2, 3, 5 +-- Testcase cleanup. +test_run:switch('default') +box.space.sync:drop() + +-- Teardown. +test_run:cmd('switch default') +test_run:cmd('stop server replica') +test_run:cmd('delete server replica') +test_run:cleanup_cluster() +box.schema.user.revoke('guest', 'replication') +box.cfg{ \ + replication_synchro_quorum = orig_synchro_quorum, \ + replication_synchro_timeout = orig_synchro_timeout, \ +} -- 2.26.2
next prev parent reply other threads:[~2020-07-02 21:15 UTC|newest] Thread overview: 68+ messages / expand[flat|nested] mbox.gz Atom feed top [not found] <cover.1593723973.git.sergeyb@tarantool.org> 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 00/19] Sync replication Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 01/19] replication: introduce space.is_sync option Vladislav Shpilevoy 2020-06-30 23:00 ` Vladislav Shpilevoy 2020-07-01 15:55 ` Sergey Ostanevich 2020-07-01 23:46 ` Vladislav Shpilevoy 2020-07-02 8:25 ` Serge Petrenko 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 10/19] txn_limbo: add ROLLBACK processing Vladislav Shpilevoy 2020-07-05 15:29 ` Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 11/19] box: rework local_recovery to use async txn_commit Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 12/19] replication: support ROLLBACK and CONFIRM during recovery Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 13/19] replication: add test for synchro CONFIRM/ROLLBACK Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 14/19] applier: remove writer_cond Vladislav Shpilevoy 2020-07-02 9:13 ` Serge Petrenko 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 15/19] applier: send heartbeat not only on commit, but on any write Vladislav Shpilevoy 2020-07-01 23:55 ` Vladislav Shpilevoy 2020-07-03 12:23 ` Serge Petrenko 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 16/19] txn_limbo: add diag_set in txn_limbo_wait_confirm Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 17/19] replication: delay initial join until confirmation Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 18/19] replication: only send confirmed data during final join Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 19/19] replication: block async transactions when not empty limbo Vladislav Shpilevoy 2020-07-01 17:12 ` Sergey 
Ostanevich 2020-07-01 23:47 ` Vladislav Shpilevoy 2020-07-03 12:28 ` Serge Petrenko 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 02/19] replication: introduce replication_synchro_* cfg options Vladislav Shpilevoy 2020-07-01 16:05 ` Sergey Ostanevich 2020-07-01 23:46 ` Vladislav Shpilevoy 2020-07-02 8:29 ` Serge Petrenko 2020-07-02 23:36 ` Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 03/19] txn: add TXN_WAIT_ACK flag Vladislav Shpilevoy 2020-07-01 17:14 ` Sergey Ostanevich 2020-07-01 23:46 ` Vladislav Shpilevoy 2020-07-02 8:30 ` Serge Petrenko 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 04/19] replication: make sync transactions wait quorum Vladislav Shpilevoy 2020-06-30 23:00 ` Vladislav Shpilevoy 2020-07-02 8:48 ` Serge Petrenko 2020-07-03 21:16 ` Vladislav Shpilevoy 2020-07-05 16:05 ` Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 05/19] xrow: introduce CONFIRM and ROLLBACK entries Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 06/19] txn: introduce various reasons for txn rollback Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 07/19] replication: write and read CONFIRM entries Vladislav Shpilevoy 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 08/19] replication: add support of qsync to the snapshot machinery Vladislav Shpilevoy 2020-07-02 8:52 ` Serge Petrenko 2020-07-08 11:43 ` Leonid Vasiliev 2020-06-29 23:15 ` [Tarantool-patches] [PATCH v2 09/19] txn_limbo: add timeout when waiting for acks Vladislav Shpilevoy 2020-06-29 23:22 ` [Tarantool-patches] [PATCH v2 00/19] Sync replication Vladislav Shpilevoy 2020-06-30 23:00 ` [Tarantool-patches] [PATCH v2 20/19] replication: add test for quorum 1 Vladislav Shpilevoy 2020-07-03 12:32 ` Serge Petrenko 2020-07-02 21:13 ` [Tarantool-patches] [PATCH 1/4] replication: regression test on gh-5119 [not fixed] sergeyb 2020-07-02 21:13 ` sergeyb [this message] 2020-07-02 22:46 ` [Tarantool-patches] [PATCH 2/4] replication: add 
advanced tests for sync replication Sergey Bronnikov 2020-07-02 23:20 ` Vladislav Shpilevoy 2020-07-06 12:30 ` Sergey Bronnikov 2020-07-06 23:31 ` Vladislav Shpilevoy 2020-07-07 12:12 ` Sergey Bronnikov 2020-07-07 20:57 ` Vladislav Shpilevoy 2020-07-08 12:07 ` Sergey Bronnikov 2020-07-08 22:13 ` Vladislav Shpilevoy 2020-07-09 9:39 ` Sergey Bronnikov 2020-07-02 21:13 ` [Tarantool-patches] [PATCH 3/4] replication: add tests for sync replication with anon replica sergeyb 2020-07-06 23:31 ` Vladislav Shpilevoy 2020-07-02 21:13 ` [Tarantool-patches] [PATCH 4/4] replication: add tests for sync replication with snapshots sergeyb 2020-07-02 22:46 ` Sergey Bronnikov 2020-07-02 23:20 ` Vladislav Shpilevoy 2020-07-06 23:31 ` Vladislav Shpilevoy 2020-07-07 16:00 ` Sergey Bronnikov 2020-07-06 23:31 ` [Tarantool-patches] [PATCH] Add new error injection constant ERRINJ_SYNC_TIMEOUT Vladislav Shpilevoy 2020-07-10 0:50 ` [Tarantool-patches] [PATCH v2 00/19] Sync replication Vladislav Shpilevoy 2020-07-10 7:40 ` Kirill Yukhin
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=012c8c196396cf963a0aa1f2d23814ff84b81cfb.1593723973.git.sergeyb@tarantool.org \ --to=sergeyb@tarantool.org \ --cc=gorcunov@gmail.com \ --cc=lvasiliev@tarantool.org \ --cc=sergepetrenko@tarantool.org \ --cc=tarantool-patches@dev.tarantool.org \ --cc=v.shpilevoy@tarantool.org \ --subject='Re: [Tarantool-patches] [PATCH 2/4] replication: add advanced tests for sync replication' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox.