From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id DBFC22A6E5 for ; Thu, 21 Mar 2019 02:03:12 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id SuWBJ18yxBMi for ; Thu, 21 Mar 2019 02:03:12 -0400 (EDT) Received: from smtp41.i.mail.ru (smtp41.i.mail.ru [94.100.177.101]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 3769B2A6E4 for ; Thu, 21 Mar 2019 02:03:10 -0400 (EDT) From: "Alexander V. Tikhonov" Subject: [tarantool-patches] [PATCH v2 1/2] test: tunned timeouts and added statuses checks Date: Thu, 21 Mar 2019 09:03:05 +0300 Message-Id: <6726cdd338a2c24005e72ceec9a6badc072bcdbd.1553148177.git.avtikhon@tarantool.org> Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-Help: List-Unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-Subscribe: List-Owner: List-post: List-Archive: To: Alexander Turenko Cc: "Alexander V. Tikhonov" , tarantool-patches@freelists.org Tuned timeouts to a common value. Wrapped status checks in wait_cond loops with diagnostic output, renamed the test replicas, used separate replicas for each subtest, and replaced hardcoded replica creation/deletion with a standalone routine. 
--- test/replication/autobootstrap.lua | 2 +- test/replication/autobootstrap_guest.lua | 2 +- test/replication/autobootstrap_guest.result | 5 +- test/replication/autobootstrap_guest.test.lua | 3 +- test/replication/before_replace.lua | 37 ++++ test/replication/before_replace.result | 55 ++---- test/replication/before_replace.test.lua | 45 ++--- test/replication/before_replace1.lua | 1 + test/replication/before_replace2.lua | 1 + test/replication/before_replace3.lua | 1 + test/replication/catch.result | 32 ++- test/replication/catch.test.lua | 21 +- test/replication/ddl.lua | 6 +- test/replication/errinj.result | 98 +++++----- test/replication/errinj.test.lua | 86 ++++---- test/replication/force_recovery.result | 28 ++- test/replication/force_recovery.test.lua | 19 +- test/replication/gc.result | 92 ++++----- test/replication/gc.test.lua | 80 ++++---- test/replication/gc_no_space.result | 50 ++--- test/replication/gc_no_space.test.lua | 36 ++-- test/replication/hot_standby.result | 44 ++--- test/replication/hot_standby.test.lua | 32 ++- test/replication/join_vclock.result | 22 ++- test/replication/join_vclock.test.lua | 16 +- test/replication/local_spaces.result | 29 +-- test/replication/local_spaces.test.lua | 20 +- test/replication/lua/fast_replica.lua | 125 +++++++++--- test/replication/master.lua | 2 +- test/replication/master_quorum.lua | 2 +- test/replication/misc.lua | 37 ++++ test/replication/misc.result | 183 +++++++----------- test/replication/misc.test.lua | 137 ++++++------- test/replication/misc1.lua | 1 + test/replication/misc2.lua | 1 + test/replication/misc3.lua | 1 + test/replication/on_replace.lua | 2 +- test/replication/on_replace.result | 27 +-- test/replication/on_replace.test.lua | 19 +- test/replication/prune.result | 38 ++-- test/replication/prune.test.lua | 35 ++-- test/replication/quorum.lua | 2 +- test/replication/quorum.result | 93 ++++----- test/replication/quorum.test.lua | 58 +++--- test/replication/rebootstrap.lua | 2 +- 
test/replication/rebootstrap.result | 8 +- test/replication/rebootstrap.test.lua | 6 +- test/replication/recover_missing_xlog.lua | 37 ++++ test/replication/recover_missing_xlog.result | 16 +- .../replication/recover_missing_xlog.test.lua | 16 +- test/replication/recover_missing_xlog1.lua | 1 + test/replication/recover_missing_xlog2.lua | 1 + test/replication/recover_missing_xlog3.lua | 1 + test/replication/replica.lua | 4 +- test/replication/replica_auth.lua | 2 +- test/replication/replica_quorum.lua | 2 +- test/replication/replica_rejoin.result | 73 ++++--- test/replication/replica_rejoin.test.lua | 66 +++---- test/replication/replica_timeout.lua | 2 +- test/replication/replica_uuid_ro1.lua | 1 - test/replication/replica_uuid_ro2.lua | 1 - test/replication/replica_uuid_ro3.lua | 1 - ...a_uuid_ro.lua => replicaset_ro_mostly.lua} | 6 +- test/replication/replicaset_ro_mostly.result | 6 +- .../replication/replicaset_ro_mostly.test.lua | 6 +- test/replication/replicaset_ro_mostly1.lua | 1 + test/replication/replicaset_ro_mostly2.lua | 1 + test/replication/replicaset_ro_mostly3.lua | 1 + test/replication/show_error_on_disconnect.lua | 38 ++++ .../show_error_on_disconnect.result | 26 +-- .../show_error_on_disconnect.test.lua | 20 +- .../replication/show_error_on_disconnect1.lua | 1 + .../replication/show_error_on_disconnect2.lua | 1 + .../replication/show_error_on_disconnect3.lua | 1 + test/replication/skip_conflict_row.result | 89 ++++----- test/replication/skip_conflict_row.test.lua | 60 +++--- test/replication/status.result | 58 ++---- test/replication/status.test.lua | 39 ++-- test/replication/suite.ini | 1 + test/replication/sync.result | 101 ++++------ test/replication/sync.test.lua | 64 +++--- test/replication/wal_off.result | 19 +- test/replication/wal_off.test.lua | 13 +- test/replication/wal_rw_stress.result | 30 ++- test/replication/wal_rw_stress.test.lua | 21 +- 85 files changed, 1225 insertions(+), 1244 deletions(-) create mode 100644 
test/replication/before_replace.lua create mode 120000 test/replication/before_replace1.lua create mode 120000 test/replication/before_replace2.lua create mode 120000 test/replication/before_replace3.lua create mode 100644 test/replication/misc.lua create mode 120000 test/replication/misc1.lua create mode 120000 test/replication/misc2.lua create mode 120000 test/replication/misc3.lua create mode 100644 test/replication/recover_missing_xlog.lua create mode 120000 test/replication/recover_missing_xlog1.lua create mode 120000 test/replication/recover_missing_xlog2.lua create mode 120000 test/replication/recover_missing_xlog3.lua delete mode 120000 test/replication/replica_uuid_ro1.lua delete mode 120000 test/replication/replica_uuid_ro2.lua delete mode 120000 test/replication/replica_uuid_ro3.lua rename test/replication/{replica_uuid_ro.lua => replicaset_ro_mostly.lua} (83%) create mode 120000 test/replication/replicaset_ro_mostly1.lua create mode 120000 test/replication/replicaset_ro_mostly2.lua create mode 120000 test/replication/replicaset_ro_mostly3.lua create mode 100644 test/replication/show_error_on_disconnect.lua create mode 120000 test/replication/show_error_on_disconnect1.lua create mode 120000 test/replication/show_error_on_disconnect2.lua create mode 120000 test/replication/show_error_on_disconnect3.lua diff --git a/test/replication/autobootstrap.lua b/test/replication/autobootstrap.lua index 856b36e66..ef8b149e9 100644 --- a/test/replication/autobootstrap.lua +++ b/test/replication/autobootstrap.lua @@ -6,7 +6,7 @@ local USER = 'cluster' local PASSWORD = 'somepassword' local SOCKET_DIR = require('fio').cwd() local TIMEOUT = tonumber(arg[1]) -local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 30.0 +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 local function instance_uri(instance_id) --return 'localhost:'..(3310 + instance_id) diff --git a/test/replication/autobootstrap_guest.lua b/test/replication/autobootstrap_guest.lua index 
d7176ae5b..78cfd381b 100644 --- a/test/replication/autobootstrap_guest.lua +++ b/test/replication/autobootstrap_guest.lua @@ -6,7 +6,7 @@ local INSTANCE_ID = string.match(arg[0], "%d") local SOCKET_DIR = require('fio').cwd() local TIMEOUT = tonumber(arg[1]) -local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 30.0 +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 local function instance_uri(instance_id) --return 'localhost:'..(3310 + instance_id) diff --git a/test/replication/autobootstrap_guest.result b/test/replication/autobootstrap_guest.result index 1efef310c..dc1098765 100644 --- a/test/replication/autobootstrap_guest.result +++ b/test/replication/autobootstrap_guest.result @@ -1,12 +1,9 @@ -env = require('test_run') +test_run = require('test_run').new() --- ... vclock_diff = require('fast_replica').vclock_diff --- ... -test_run = env.new() ---- -... SERVERS = { 'autobootstrap_guest1', 'autobootstrap_guest2', 'autobootstrap_guest3' } --- ... diff --git a/test/replication/autobootstrap_guest.test.lua b/test/replication/autobootstrap_guest.test.lua index 3aad8a4da..23329811c 100644 --- a/test/replication/autobootstrap_guest.test.lua +++ b/test/replication/autobootstrap_guest.test.lua @@ -1,6 +1,5 @@ -env = require('test_run') +test_run = require('test_run').new() vclock_diff = require('fast_replica').vclock_diff -test_run = env.new() SERVERS = { 'autobootstrap_guest1', 'autobootstrap_guest2', 'autobootstrap_guest3' } diff --git a/test/replication/before_replace.lua b/test/replication/before_replace.lua new file mode 100644 index 000000000..093676548 --- /dev/null +++ b/test/replication/before_replace.lua @@ -0,0 +1,37 @@ +#!/usr/bin/env tarantool + +-- get instance name from filename (before_replace1.lua => before_replace1) +local INSTANCE_ID = string.match(arg[0], "%d") +local USER = 'cluster' +local PASSWORD = 'somepassword' +local SOCKET_DIR = require('fio').cwd() +local TIMEOUT = tonumber(arg[1]) +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 
+ +local function instance_uri(instance_id) + --return 'localhost:'..(3310 + instance_id) + return SOCKET_DIR..'/before_replace'..instance_id..'.sock'; +end + +-- start console first +require('console').listen(os.getenv('ADMIN')) + +box.cfg({ + listen = instance_uri(INSTANCE_ID); +-- log_level = 7; + replication = { + USER..':'..PASSWORD..'@'..instance_uri(1); + USER..':'..PASSWORD..'@'..instance_uri(2); + USER..':'..PASSWORD..'@'..instance_uri(3); + }; + replication_timeout = TIMEOUT; + replication_connect_timeout = CON_TIMEOUT; +}) + +box.once("bootstrap", function() + local test_run = require('test_run').new() + box.schema.user.create(USER, { password = PASSWORD }) + box.schema.user.grant(USER, 'replication') + box.schema.space.create('test', {engine = test_run:get_cfg('engine')}) + box.space.test:create_index('primary') +end) diff --git a/test/replication/before_replace.result b/test/replication/before_replace.result index ced40547e..1b75085ab 100644 --- a/test/replication/before_replace.result +++ b/test/replication/before_replace.result @@ -1,16 +1,13 @@ -- -- Using space:before_replace to resolve replication conflicts. -- -env = require('test_run') ---- -... -test_run = env.new() +test_run = require('test_run').new() --- ... engine = test_run:get_cfg('engine') --- ... -SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +SERVERS = { 'before_replace1', 'before_replace2', 'before_replace3' } --- ... -- Deploy a cluster. @@ -26,7 +23,7 @@ test_run:cmd("setopt delimiter ';'") --- - true ... -test_run:cmd("switch autobootstrap1"); +test_run:cmd("switch before_replace1"); --- - true ... @@ -37,7 +34,7 @@ _ = box.space.test:before_replace(function(old, new) end); --- ... -test_run:cmd("switch autobootstrap2"); +test_run:cmd("switch before_replace2"); --- - true ... @@ -48,7 +45,7 @@ _ = box.space.test:before_replace(function(old, new) end); --- ... -test_run:cmd("switch autobootstrap3"); +test_run:cmd("switch before_replace3"); --- - true ... 
@@ -75,7 +72,7 @@ test_run:cmd("setopt delimiter ''"); ... -- Stall replication and generate incompatible data -- on the replicas. -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch before_replace1") --- - true ... @@ -86,7 +83,7 @@ box.error.injection.set('ERRINJ_RELAY_TIMEOUT', 0.01) for i = 1, 10 do box.space.test:replace{i, i % 3 == 1 and i * 10 or i} end --- ... -test_run:cmd("switch autobootstrap2") +test_run:cmd("switch before_replace2") --- - true ... @@ -97,7 +94,7 @@ box.error.injection.set('ERRINJ_RELAY_TIMEOUT', 0.01) for i = 1, 10 do box.space.test:replace{i, i % 3 == 2 and i * 10 or i} end --- ... -test_run:cmd("switch autobootstrap3") +test_run:cmd("switch before_replace3") --- - true ... @@ -121,7 +118,7 @@ vclock2 = test_run:wait_cluster_vclock(SERVERS, vclock) ... -- Check that all replicas converged to the same data -- and the state persists after restart. -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch before_replace1") --- - true ... @@ -138,7 +135,7 @@ box.space.test:select() - [9, 90] - [10, 100] ... -test_run:cmd('restart server autobootstrap1 with args="0.1 0.5"') +test_run:cmd('restart server before_replace1 with args="0.1 0.5"') box.space.test:select() --- - - [1, 10] @@ -152,7 +149,7 @@ box.space.test:select() - [9, 90] - [10, 100] ... -test_run:cmd("switch autobootstrap2") +test_run:cmd("switch before_replace2") --- - true ... @@ -169,7 +166,7 @@ box.space.test:select() - [9, 90] - [10, 100] ... -test_run:cmd('restart server autobootstrap2 with args="0.1 0.5"') +test_run:cmd('restart server before_replace2 with args="0.1 0.5"') box.space.test:select() --- - - [1, 10] @@ -183,7 +180,7 @@ box.space.test:select() - [9, 90] - [10, 100] ... -test_run:cmd("switch autobootstrap3") +test_run:cmd("switch before_replace3") --- - true ... @@ -204,7 +201,7 @@ push_err --- - Session 'applier' does not support push() ... 
-test_run:cmd('restart server autobootstrap3 with args="0.1 0.5"') +test_run:cmd('restart server before_replace3 with args="0.1 0.5"') box.space.test:select() --- - - [1, 10] @@ -243,15 +240,13 @@ _ = box.space.test:create_index('primary') box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set = require('fast_replica') --- -- true ... -test_run:cmd("start server replica") +replica_set.join(test_run, 'before_replace_gh3722') --- -- true ... -test_run:cmd("switch replica") +test_run:cmd("switch before_replace_gh3722") --- - true ... @@ -266,12 +261,12 @@ box.space.test:replace{1, 1} --- - [1, 1] ... -_ = test_run:wait_vclock('replica', test_run:get_vclock('default')) +_ = test_run:wait_vclock('before_replace_gh3722', test_run:get_vclock('default')) --- ... -- Check that replace{1, 2} coming from the master was suppressed -- by the before_replace trigger on the replica. -test_run:cmd("switch replica") +test_run:cmd("switch before_replace_gh3722") --- - true ... @@ -282,7 +277,7 @@ box.space.test:select() -- [1, 2] -- Check that master's component of replica's vclock was bumped -- so that the replica doesn't apply replace{1, 2} after restart -- while syncing with the master. -test_run:cmd("restart server replica") +test_run:cmd("restart server before_replace_gh3722") box.space.test:select() -- [1, 2] --- - - [1, 2] @@ -291,15 +286,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'before_replace_gh3722') --- - true ... 
diff --git a/test/replication/before_replace.test.lua b/test/replication/before_replace.test.lua index bcc6dc00d..cce2bb72e 100644 --- a/test/replication/before_replace.test.lua +++ b/test/replication/before_replace.test.lua @@ -1,11 +1,10 @@ -- -- Using space:before_replace to resolve replication conflicts. -- -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() engine = test_run:get_cfg('engine') -SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +SERVERS = { 'before_replace1', 'before_replace2', 'before_replace3' } -- Deploy a cluster. test_run:create_cluster(SERVERS, "replication", {args="0.1"}) @@ -14,19 +13,19 @@ test_run:wait_fullmesh(SERVERS) -- Setup space:before_replace trigger on all replicas. -- The trigger favors tuples with a greater value. test_run:cmd("setopt delimiter ';'") -test_run:cmd("switch autobootstrap1"); +test_run:cmd("switch before_replace1"); _ = box.space.test:before_replace(function(old, new) if old ~= nil and new ~= nil then return new[2] > old[2] and new or old end end); -test_run:cmd("switch autobootstrap2"); +test_run:cmd("switch before_replace2"); _ = box.space.test:before_replace(function(old, new) if old ~= nil and new ~= nil then return new[2] > old[2] and new or old end end); -test_run:cmd("switch autobootstrap3"); +test_run:cmd("switch before_replace3"); -- -- gh-2677 - test that an applier can not push() messages. Applier -- session is available in Lua, so the test is here instead of @@ -46,13 +45,13 @@ test_run:cmd("setopt delimiter ''"); -- Stall replication and generate incompatible data -- on the replicas. 
-test_run:cmd("switch autobootstrap1") +test_run:cmd("switch before_replace1") box.error.injection.set('ERRINJ_RELAY_TIMEOUT', 0.01) for i = 1, 10 do box.space.test:replace{i, i % 3 == 1 and i * 10 or i} end -test_run:cmd("switch autobootstrap2") +test_run:cmd("switch before_replace2") box.error.injection.set('ERRINJ_RELAY_TIMEOUT', 0.01) for i = 1, 10 do box.space.test:replace{i, i % 3 == 2 and i * 10 or i} end -test_run:cmd("switch autobootstrap3") +test_run:cmd("switch before_replace3") box.error.injection.set('ERRINJ_RELAY_TIMEOUT', 0.01) for i = 1, 10 do box.space.test:replace{i, i % 3 == 0 and i * 10 or i} end @@ -63,18 +62,18 @@ vclock2 = test_run:wait_cluster_vclock(SERVERS, vclock) -- Check that all replicas converged to the same data -- and the state persists after restart. -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch before_replace1") box.space.test:select() -test_run:cmd('restart server autobootstrap1 with args="0.1 0.5"') +test_run:cmd('restart server before_replace1 with args="0.1 0.5"') box.space.test:select() -test_run:cmd("switch autobootstrap2") +test_run:cmd("switch before_replace2") box.space.test:select() -test_run:cmd('restart server autobootstrap2 with args="0.1 0.5"') +test_run:cmd('restart server before_replace2 with args="0.1 0.5"') box.space.test:select() -test_run:cmd("switch autobootstrap3") +test_run:cmd("switch before_replace3") box.space.test:select() push_err -test_run:cmd('restart server autobootstrap3 with args="0.1 0.5"') +test_run:cmd('restart server before_replace3 with args="0.1 0.5"') box.space.test:select() -- Cleanup. 
@@ -92,32 +91,30 @@ _ = box.space.test:create_index('primary') box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") +replica_set = require('fast_replica') +replica_set.join(test_run, 'before_replace_gh3722') -test_run:cmd("switch replica") +test_run:cmd("switch before_replace_gh3722") _ = box.space.test:before_replace(function(old, new) return new:update{{'+', 2, 1}} end) test_run:cmd("switch default") box.space.test:replace{1, 1} -_ = test_run:wait_vclock('replica', test_run:get_vclock('default')) +_ = test_run:wait_vclock('before_replace_gh3722', test_run:get_vclock('default')) -- Check that replace{1, 2} coming from the master was suppressed -- by the before_replace trigger on the replica. -test_run:cmd("switch replica") +test_run:cmd("switch before_replace_gh3722") box.space.test:select() -- [1, 2] -- Check that master's component of replica's vclock was bumped -- so that the replica doesn't apply replace{1, 2} after restart -- while syncing with the master. 
-test_run:cmd("restart server replica") +test_run:cmd("restart server before_replace_gh3722") box.space.test:select() -- [1, 2] test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'before_replace_gh3722') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/before_replace1.lua b/test/replication/before_replace1.lua new file mode 120000 index 000000000..bef35a453 --- /dev/null +++ b/test/replication/before_replace1.lua @@ -0,0 +1 @@ +before_replace.lua \ No newline at end of file diff --git a/test/replication/before_replace2.lua b/test/replication/before_replace2.lua new file mode 120000 index 000000000..bef35a453 --- /dev/null +++ b/test/replication/before_replace2.lua @@ -0,0 +1 @@ +before_replace.lua \ No newline at end of file diff --git a/test/replication/before_replace3.lua b/test/replication/before_replace3.lua new file mode 120000 index 000000000..bef35a453 --- /dev/null +++ b/test/replication/before_replace3.lua @@ -0,0 +1 @@ +before_replace.lua \ No newline at end of file diff --git a/test/replication/catch.result b/test/replication/catch.result index e1b2995ec..84ca92621 100644 --- a/test/replication/catch.result +++ b/test/replication/catch.result @@ -16,15 +16,16 @@ errinj = box.error.injection box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica_timeout.lua'") +replica_set = require('fast_replica') --- -- true ... -test_run:cmd("start server replica with args='0.1'") +replica_set.join(test_run, 'catch', 1, 'replica_timeout') +--- +... +box.cfg({replication_timeout = 0.1}) --- -- true ... -test_run:cmd("switch replica") +test_run:cmd("switch catch") --- - true ... 
@@ -39,7 +40,7 @@ s = box.schema.space.create('test', {engine = engine}); index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') }) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch catch") --- - true ... @@ -53,7 +54,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server catch") --- - true ... @@ -66,11 +67,11 @@ errinj.set('ERRINJ_RELAY_SEND_DELAY', true) for i = 1, 100 do s:insert{i, 'this is test message12345'} end --- ... -test_run:cmd("start server replica with args='0.01'") +test_run:cmd("start server catch with args='0.01'") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch catch") --- - true ... @@ -99,7 +100,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("set variable r_uri to 'replica.listen'") +test_run:cmd("set variable r_uri to 'catch.listen'") --- - true ... @@ -116,17 +117,8 @@ errinj.set('ERRINJ_RELAY_SEND_DELAY', false) - ok ... -- Cleanup. -test_run:cmd("stop server replica") +test_run:drop_cluster({'catch'}) --- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") ---- -- true ... 
test_run:cleanup_cluster() --- diff --git a/test/replication/catch.test.lua b/test/replication/catch.test.lua index 7a531df39..4b3d170b9 100644 --- a/test/replication/catch.test.lua +++ b/test/replication/catch.test.lua @@ -6,28 +6,29 @@ net_box = require('net.box') errinj = box.error.injection box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica_timeout.lua'") -test_run:cmd("start server replica with args='0.1'") -test_run:cmd("switch replica") +replica_set = require('fast_replica') +replica_set.join(test_run, 'catch', 1, 'replica_timeout') +box.cfg({replication_timeout = 0.1}) +test_run:cmd("switch catch") test_run:cmd("switch default") s = box.schema.space.create('test', {engine = engine}); -- Vinyl does not support hash index. index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') }) -test_run:cmd("switch replica") +test_run:cmd("switch catch") fiber = require('fiber') while box.space.test == nil do fiber.sleep(0.01) end test_run:cmd("switch default") -test_run:cmd("stop server replica") +test_run:cmd("stop server catch") -- Insert values on the master while replica is stopped and can't -- fetch them. errinj.set('ERRINJ_RELAY_SEND_DELAY', true) for i = 1, 100 do s:insert{i, 'this is test message12345'} end -test_run:cmd("start server replica with args='0.01'") -test_run:cmd("switch replica") +test_run:cmd("start server catch with args='0.01'") +test_run:cmd("switch catch") -- Check that replica doesn't enter read-write mode before -- catching up with the master: to check that we stop sending @@ -47,7 +48,7 @@ box.space.test:replace{1} -- Case #2: replace tuple on replica by net.box. 
test_run:cmd("switch default") -test_run:cmd("set variable r_uri to 'replica.listen'") +test_run:cmd("set variable r_uri to 'catch.listen'") c = net_box.connect(r_uri) d = c.space.test:replace{1} @@ -55,9 +56,7 @@ d = c.space.test:replace{1} errinj.set('ERRINJ_RELAY_SEND_DELAY', false) -- Cleanup. -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +test_run:drop_cluster({'catch'}) test_run:cleanup_cluster() box.space.test:drop() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/ddl.lua b/test/replication/ddl.lua index 72cf1db69..ebaee2217 100644 --- a/test/replication/ddl.lua +++ b/test/replication/ddl.lua @@ -1,17 +1,17 @@ #!/usr/bin/env tarantool --- get instance name from filename (autobootstrap1.lua => autobootstrap1) +-- get instance name from filename (ddl1.lua => ddl1) local INSTANCE_ID = string.match(arg[0], "%d") local USER = 'cluster' local PASSWORD = 'somepassword' local SOCKET_DIR = require('fio').cwd() local TIMEOUT = tonumber(arg[1]) -local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 30.0 +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 local function instance_uri(instance_id) --return 'localhost:'..(3310 + instance_id) - return SOCKET_DIR..'/autobootstrap'..instance_id..'.sock'; + return SOCKET_DIR..'/ddl'..instance_id..'.sock'; end -- start console first diff --git a/test/replication/errinj.result b/test/replication/errinj.result index 2e7d367c7..77f824fae 100644 --- a/test/replication/errinj.result +++ b/test/replication/errinj.result @@ -1,7 +1,4 @@ -env = require('test_run') ---- -... -test_run = env.new() +test_run = require('test_run').new() --- ... engine = test_run:get_cfg('engine') @@ -19,15 +16,13 @@ s = box.schema.space.create('test', {engine = engine}); index = s:create_index('primary') --- ... 
-test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set = require('fast_replica') --- -- true ... -test_run:cmd("start server replica") +replica_set.join(test_run, 'errinj') --- -- true ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -43,7 +38,7 @@ test_run:cmd("setopt delimiter ';'") ... -- vinyl does not support index.len() so we use index.count() instead function wait_repl(cnt) - for i = 1, 20 do + for i = 1, 200 do if s.index[0]:count() >= cnt then return true end @@ -106,7 +101,7 @@ test_f(11, true) test_f(21, true) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -140,7 +135,7 @@ test_f(31, true) test_f(41, true) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -168,7 +163,7 @@ box.snapshot() test_f(51, true) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -195,7 +190,7 @@ box.snapshot() -- 3. Restore replication. -- 4. Generate some records on the master. -- 5. Make sure they'll make it to the slave. -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -211,7 +206,7 @@ s:replace{9000, "won't make it"} --- - [9000, 'won''t make it'] ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -239,7 +234,7 @@ test_run:cmd("switch default") test_f(61, true) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -251,11 +246,11 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server errinj") --- - true ... -test_run:cmd("cleanup server replica") +test_run:cmd("cleanup server errinj") --- - true ... @@ -264,11 +259,11 @@ test_run:cmd("cleanup server replica") box.cfg{replication_timeout = 0.0001} --- ... -test_run:cmd("start server replica") +test_run:cmd("start server errinj") --- - true ... 
-test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -291,7 +286,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 5) box.cfg{replication_timeout = 0.05} --- ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -303,11 +298,11 @@ box.info.replication[1].upstream.status --- - follow ... -box.info.replication[1].upstream.lag > 0 +test_run:wait_cond(function() return box.info.replication[1].upstream.lag > 0 end) or box.info.replication[1].upstream.lag --- - true ... -box.info.replication[1].upstream.lag < 1 +test_run:wait_cond(function() return box.info.replication[1].upstream.lag < 1 end) or box.info.replication[1].upstream.lag --- - true ... @@ -326,7 +321,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) box.cfg{replication_timeout = 5} --- ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -346,11 +341,11 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server errinj") --- - true ... -test_run:cmd("cleanup server replica") +test_run:cmd("cleanup server errinj") --- - true ... @@ -361,11 +356,11 @@ errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0.01) --- - ok ... -test_run:cmd("start server replica") +test_run:cmd("start server errinj") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch errinj") --- - true ... @@ -383,7 +378,7 @@ while box.info.replication[1].upstream ~= nil do fiber.sleep(0.0001) end --- ... -- reconnect -box.cfg{replication = old_repl} +box.cfg{replication = old_repl, replication_timeout = 10, replication_connect_timeout = 20} --- ... while box.info.replication[1].upstream.status ~= 'disconnected' do fiber.sleep(0.0001) end @@ -396,11 +391,11 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server errinj") --- - true ... -test_run:cmd("cleanup server replica") +test_run:cmd("cleanup server errinj") --- - true ... 
@@ -411,15 +406,16 @@ errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0) box.cfg{replication_timeout = 0.01} --- ... -test_run:cmd("create server replica_timeout with rpl_master=default, script='replication/replica_timeout.lua'") +replica_set = require('fast_replica') --- -- true ... -test_run:cmd("start server replica_timeout with args='0.01 0.5'") +replica_set.join(test_run, 'errinj_timeout', 1, 'replica_timeout') --- -- true ... -test_run:cmd("switch replica_timeout") +box.cfg{replication_timeout = 0.01, replication_connect_timeout = 0.5} +--- +... +test_run:cmd("switch errinj_timeout") --- - true ... @@ -441,7 +437,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 5) --- - ok ... -test_run:cmd("switch replica_timeout") +test_run:cmd("switch errinj_timeout") --- - true ... @@ -457,11 +453,11 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica_timeout") +test_run:cmd("stop server errinj_timeout") --- - true ... -test_run:cmd("cleanup server replica_timeout") +test_run:cmd("cleanup server errinj_timeout") --- - true ... @@ -471,11 +467,11 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) ... -- Check replica's ACKs don't prevent the master from sending -- heartbeat messages (gh-3160). -test_run:cmd("start server replica_timeout with args='0.009 0.5'") +test_run:cmd("start server errinj_timeout with args='0.009 0.5'") --- - true ... -test_run:cmd("switch replica_timeout") +test_run:cmd("switch errinj_timeout") --- - true ... @@ -485,26 +481,26 @@ fiber = require('fiber') while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end --- ... -box.info.replication[1].upstream.status -- follow +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- -- follow +- true ... for i = 0, 15 do fiber.sleep(0.01) if box.info.replication[1].upstream.status ~= 'follow' then break end end --- ... 
-box.info.replication[1].upstream.status -- follow +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- -- follow +- true ... test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica_timeout") +test_run:cmd("stop server errinj_timeout") --- - true ... -test_run:cmd("cleanup server replica_timeout") +test_run:cmd("cleanup server errinj_timeout") --- - true ... @@ -519,11 +515,11 @@ for i = 0, 9999 do box.space.test:replace({i, 4, 5, 'test'}) end -- during the join stage, i.e. a replica with a minuscule -- timeout successfully bootstraps and breaks connection only -- after subscribe. -test_run:cmd("start server replica_timeout with args='0.00001 0.5'") +test_run:cmd("start server errinj_timeout with args='0.00001 0.5'") --- - true ... -test_run:cmd("switch replica_timeout") +test_run:cmd("switch errinj_timeout") --- - true ... @@ -549,11 +545,11 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica_timeout") +test_run:cmd("stop server errinj_timeout") --- - true ... -test_run:cmd("cleanup server replica_timeout") +test_run:cmd("cleanup server errinj_timeout") --- - true ... 
diff --git a/test/replication/errinj.test.lua b/test/replication/errinj.test.lua index 32e0be912..8e577bf7e 100644 --- a/test/replication/errinj.test.lua +++ b/test/replication/errinj.test.lua @@ -1,5 +1,4 @@ -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() engine = test_run:get_cfg('engine') errinj = box.error.injection @@ -8,9 +7,9 @@ box.schema.user.grant('guest', 'replication') s = box.schema.space.create('test', {engine = engine}); index = s:create_index('primary') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") -test_run:cmd("switch replica") +replica_set = require('fast_replica') +replica_set.join(test_run, 'errinj') +test_run:cmd("switch errinj") fiber = require('fiber') @@ -18,7 +17,7 @@ s = box.space.test test_run:cmd("setopt delimiter ';'") -- vinyl does not support index.len() so we use index.count() instead function wait_repl(cnt) - for i = 1, 20 do + for i = 1, 200 do if s.index[0]:count() >= cnt then return true end @@ -55,7 +54,7 @@ errinj.set("ERRINJ_WAL_WRITE_PARTIAL", -1) test_f(11, true) test_f(21, true) -test_run:cmd("switch replica") +test_run:cmd("switch errinj") wait_repl(30) test_run:cmd("switch default") @@ -69,7 +68,7 @@ errinj.set("ERRINJ_WAL_WRITE_DISK", false) test_f(31, true) test_f(41, true) -test_run:cmd("switch replica") +test_run:cmd("switch errinj") wait_repl(50) test_run:cmd("switch default") @@ -79,7 +78,7 @@ box.space.test.index[0]:count() errinj.set("ERRINJ_WAL_WRITE_EOF", true) box.snapshot() test_f(51, true) -test_run:cmd("switch replica") +test_run:cmd("switch errinj") wait_repl(60) test_run:cmd("switch default") errinj.set("ERRINJ_WAL_WRITE_EOF", false) @@ -92,11 +91,11 @@ box.snapshot() -- 3. Restore replication. -- 4. Generate some records on the master. -- 5. Make sure they'll make it to the slave. 
-test_run:cmd("switch replica") +test_run:cmd("switch errinj") box.error.injection.set("ERRINJ_WAL_WRITE", true) test_run:cmd("switch default") s:replace{9000, "won't make it"} -test_run:cmd("switch replica") +test_run:cmd("switch errinj") while box.info.replication[1].upstream.status == 'follow' do fiber.sleep(0.0001) end box.error.injection.set("ERRINJ_WAL_WRITE", false) s:replace{9001, "bump lsn"} @@ -104,19 +103,19 @@ box.cfg{replication={}} box.cfg{replication = os.getenv('MASTER')} test_run:cmd("switch default") test_f(61, true) -test_run:cmd("switch replica") +test_run:cmd("switch errinj") wait_repl(70) test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") +test_run:cmd("stop server errinj") +test_run:cmd("cleanup server errinj") -- Set minuscule timeout to make replication stop -- immediately after join. box.cfg{replication_timeout = 0.0001} -test_run:cmd("start server replica") -test_run:cmd("switch replica") +test_run:cmd("start server errinj") +test_run:cmd("switch errinj") fiber = require'fiber' while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end @@ -125,12 +124,12 @@ test_run:cmd("switch default") -- to trigger acks on the replica. 
errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 5) box.cfg{replication_timeout = 0.05} -test_run:cmd("switch replica") +test_run:cmd("switch errinj") -- wait for reconnect while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end box.info.replication[1].upstream.status -box.info.replication[1].upstream.lag > 0 -box.info.replication[1].upstream.lag < 1 +test_run:wait_cond(function() return box.info.replication[1].upstream.lag > 0 end) or box.info.replication[1].upstream.lag +test_run:wait_cond(function() return box.info.replication[1].upstream.lag < 1 end) or box.info.replication[1].upstream.lag -- wait for ack timeout while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end @@ -138,7 +137,7 @@ test_run:cmd("switch default") errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) box.cfg{replication_timeout = 5} -test_run:cmd("switch replica") +test_run:cmd("switch errinj") -- wait for reconnect while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end -- wait for ack timeout again, should be ok @@ -146,34 +145,35 @@ fiber.sleep(0.01) {box.info.replication[1].upstream.status, box.info.replication[1].upstream.message} test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") +test_run:cmd("stop server errinj") +test_run:cmd("cleanup server errinj") errinj = box.error.injection errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0.01) -test_run:cmd("start server replica") -test_run:cmd("switch replica") +test_run:cmd("start server errinj") +test_run:cmd("switch errinj") fiber = require('fiber') old_repl = box.cfg.replication -- shutdown applier box.cfg{replication = {}, replication_timeout = 0.1} while box.info.replication[1].upstream ~= nil do fiber.sleep(0.0001) end -- reconnect -box.cfg{replication = old_repl} +box.cfg{replication = old_repl, replication_timeout = 10, replication_connect_timeout = 20} while box.info.replication[1].upstream.status ~= 
'disconnected' do fiber.sleep(0.0001) end while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") +test_run:cmd("stop server errinj") +test_run:cmd("cleanup server errinj") errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0) box.cfg{replication_timeout = 0.01} -test_run:cmd("create server replica_timeout with rpl_master=default, script='replication/replica_timeout.lua'") -test_run:cmd("start server replica_timeout with args='0.01 0.5'") -test_run:cmd("switch replica_timeout") +replica_set = require('fast_replica') +replica_set.join(test_run, 'errinj_timeout', 1, 'replica_timeout') +box.cfg{replication_timeout = 0.01, replication_connect_timeout = 0.5} +test_run:cmd("switch errinj_timeout") fiber = require('fiber') while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end @@ -182,7 +182,7 @@ box.info.replication[1].upstream.status test_run:cmd("switch default") errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 5) -test_run:cmd("switch replica_timeout") +test_run:cmd("switch errinj_timeout") -- Check replica's disconnection on timeout (gh-3025). -- If master stops send heartbeat messages to replica, -- due to infinite read timeout connection never breaks, @@ -191,25 +191,25 @@ test_run:cmd("switch replica_timeout") while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end test_run:cmd("switch default") -test_run:cmd("stop server replica_timeout") -test_run:cmd("cleanup server replica_timeout") +test_run:cmd("stop server errinj_timeout") +test_run:cmd("cleanup server errinj_timeout") errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) -- Check replica's ACKs don't prevent the master from sending -- heartbeat messages (gh-3160). 
-test_run:cmd("start server replica_timeout with args='0.009 0.5'") -test_run:cmd("switch replica_timeout") +test_run:cmd("start server errinj_timeout with args='0.009 0.5'") +test_run:cmd("switch errinj_timeout") fiber = require('fiber') while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end -box.info.replication[1].upstream.status -- follow +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status for i = 0, 15 do fiber.sleep(0.01) if box.info.replication[1].upstream.status ~= 'follow' then break end end -box.info.replication[1].upstream.status -- follow +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status test_run:cmd("switch default") -test_run:cmd("stop server replica_timeout") -test_run:cmd("cleanup server replica_timeout") +test_run:cmd("stop server errinj_timeout") +test_run:cmd("cleanup server errinj_timeout") box.snapshot() for i = 0, 9999 do box.space.test:replace({i, 4, 5, 'test'}) end @@ -218,8 +218,8 @@ for i = 0, 9999 do box.space.test:replace({i, 4, 5, 'test'}) end -- during the join stage, i.e. a replica with a minuscule -- timeout successfully bootstraps and breaks connection only -- after subscribe. 
-test_run:cmd("start server replica_timeout with args='0.00001 0.5'") -test_run:cmd("switch replica_timeout") +test_run:cmd("start server errinj_timeout with args='0.00001 0.5'") +test_run:cmd("switch errinj_timeout") fiber = require('fiber') while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end @@ -227,5 +227,5 @@ test_run:cmd("stop server default") test_run:cmd("deploy server default") test_run:cmd("start server default") test_run:cmd("switch default") -test_run:cmd("stop server replica_timeout") -test_run:cmd("cleanup server replica_timeout") +test_run:cmd("stop server errinj_timeout") +test_run:cmd("cleanup server errinj_timeout") diff --git a/test/replication/force_recovery.result b/test/replication/force_recovery.result index f50452858..1c800213e 100644 --- a/test/replication/force_recovery.result +++ b/test/replication/force_recovery.result @@ -1,6 +1,9 @@ test_run = require('test_run').new() --- ... +replica_set = require('fast_replica') +--- +... fio = require('fio') --- ... @@ -17,20 +20,19 @@ box.schema.user.grant('guest', 'replication') --- ... -- Deploy a replica. -test_run:cmd("create server test with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'force_recovery') --- -- true ... -test_run:cmd("start server test") +test_run:cmd("start server force_recovery") --- - true ... -- Stop the replica and wait for the relay thread to exit. -test_run:cmd("stop server test") +test_run:cmd("stop server force_recovery") --- - true ... -test_run:wait_cond(function() return box.info.replication[2].downstream.status == 'stopped' end, 10) +test_run:wait_cond(function() return box.info.replication[2].downstream.status == 'stopped' end) or box.info.replication[2].downstream.status --- - true ... @@ -63,11 +65,11 @@ fio.unlink(xlog) box.cfg{force_recovery = true} --- ... -test_run:cmd("start server test") +test_run:cmd("start server force_recovery") --- - true ... 
-test_run:cmd("switch test") +test_run:cmd("switch force_recovery") --- - true ... @@ -75,7 +77,7 @@ box.space.test:select() --- - [] ... -box.info.replication[1].upstream.status == 'stopped' or box.info +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'stopped' end) or box.info --- - true ... @@ -87,15 +89,7 @@ box.cfg{force_recovery = false} --- ... -- Cleanup. -test_run:cmd("stop server test") ---- -- true -... -test_run:cmd("cleanup server test") ---- -- true -... -test_run:cmd("delete server test") +replica_set.drop(test_run, 'force_recovery') --- - true ... diff --git a/test/replication/force_recovery.test.lua b/test/replication/force_recovery.test.lua index 54307814b..ba86671f8 100644 --- a/test/replication/force_recovery.test.lua +++ b/test/replication/force_recovery.test.lua @@ -1,4 +1,5 @@ test_run = require('test_run').new() +replica_set = require('fast_replica') fio = require('fio') -- @@ -9,12 +10,12 @@ _ = box.space.test:create_index('primary') box.schema.user.grant('guest', 'replication') -- Deploy a replica. -test_run:cmd("create server test with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server test") +replica_set.create(test_run, 'force_recovery') +test_run:cmd("start server force_recovery") -- Stop the replica and wait for the relay thread to exit. -test_run:cmd("stop server test") -test_run:wait_cond(function() return box.info.replication[2].downstream.status == 'stopped' end, 10) +test_run:cmd("stop server force_recovery") +test_run:wait_cond(function() return box.info.replication[2].downstream.status == 'stopped' end) or box.info.replication[2].downstream.status -- Delete an xlog file that is needed by the replica. box.snapshot() @@ -27,17 +28,15 @@ fio.unlink(xlog) -- Check that even though box.cfg.force_recovery is set, -- replication will still fail due to LSN gap. 
box.cfg{force_recovery = true} -test_run:cmd("start server test") -test_run:cmd("switch test") +test_run:cmd("start server force_recovery") +test_run:cmd("switch force_recovery") box.space.test:select() -box.info.replication[1].upstream.status == 'stopped' or box.info +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'stopped' end) or box.info test_run:cmd("switch default") box.cfg{force_recovery = false} -- Cleanup. -test_run:cmd("stop server test") -test_run:cmd("cleanup server test") -test_run:cmd("delete server test") +replica_set.drop(test_run, 'force_recovery') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') box.space.test:drop() diff --git a/test/replication/gc.result b/test/replication/gc.result index 5b7057adf..c50ee1344 100644 --- a/test/replication/gc.result +++ b/test/replication/gc.result @@ -16,9 +16,8 @@ fio = require('fio') test_run:cleanup_cluster() --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'gc') --- -- true ... -- Make each snapshot trigger garbage collection. default_checkpoint_count = box.cfg.checkpoint_count @@ -34,14 +33,14 @@ test_run:cmd("setopt delimiter ';'") function wait_gc(n) return test_run:wait_cond(function() return #box.info.gc().checkpoints == n - end, 10) + end) or box.info.gc() end; --- ... -function wait_xlog(n, timeout) +function wait_xlog(n) return test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == n - end, 10) + end) or fio.glob('./master/*.xlog') end; --- ... @@ -105,7 +104,7 @@ test_run:cmd("setopt delimiter ''"); --- ... -- Start the replica. -test_run:cmd("start server replica") +test_run:cmd("start server gc") --- - true ... @@ -113,11 +112,11 @@ test_run:cmd("start server replica") -- would have normally removed the snapshot the replica was -- bootstrapped from, the replica should still receive all -- data from the master. Check it. 
-test_run:cmd("switch replica") +test_run:cmd("switch gc") --- - true ... -test_run:wait_cond(function() return box.space.test:count() == 200 end, 10) +test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count() --- - true ... @@ -131,11 +130,11 @@ test_run:cmd("switch default") ... -- Check that garbage collection removed the snapshot once -- the replica released the corresponding checkpoint. -wait_gc(1) or box.info.gc() +wait_gc(1) --- - true ... -wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until +wait_xlog(1) --- - true ... @@ -168,11 +167,11 @@ box.snapshot() --- - ok ... -wait_gc(1) or box.info.gc() +wait_gc(1) --- - true ... -wait_xlog(2) or fio.listdir('./master') +wait_xlog(2) --- - true ... @@ -183,11 +182,11 @@ box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false) - ok ... -- Check that the replica received all data from the master. -test_run:cmd("switch replica") +test_run:cmd("switch gc") --- - true ... -test_run:wait_cond(function() return box.space.test:count() == 300 end, 10) +test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count() --- - true ... @@ -201,11 +200,11 @@ test_run:cmd("switch default") ... -- Now garbage collection should resume and delete files left -- from the old checkpoint. -wait_gc(1) or box.info.gc() +wait_gc(1) --- - true ... -wait_xlog(0) or fio.listdir('./master') +wait_xlog(0) --- - true ... @@ -214,7 +213,7 @@ wait_xlog(0) or fio.listdir('./master') -- replica until it receives a confirmation that the data has -- been applied (gh-2825). -- -test_run:cmd("switch replica") +test_run:cmd("switch gc") --- - true ... @@ -244,15 +243,15 @@ fiber.sleep(0.1) -- wait for master to relay data -- Garbage collection must not delete the old xlog file -- because it is still needed by the replica, but remove -- the old snapshot. -wait_gc(1) or box.info.gc() +wait_gc(1) --- - true ... 
-wait_xlog(2) or fio.listdir('./master') +wait_xlog(2) --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch gc") --- - true ... @@ -265,13 +264,13 @@ box.cfg{replication = {}} --- ... -- Restart the replica to reestablish replication. -test_run:cmd("restart server replica") +test_run:cmd("restart server gc") -- Wait for the replica to catch up. -test_run:cmd("switch replica") +test_run:cmd("switch gc") --- - true ... -test_run:wait_cond(function() return box.space.test:count() == 310 end, 10) +test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count() --- - true ... @@ -284,20 +283,16 @@ test_run:cmd("switch default") - true ... -- Now it's safe to drop the old xlog. -wait_gc(1) or box.info.gc() +wait_gc(1) --- - true ... -wait_xlog(1) or fio.listdir('./master') +wait_xlog(1) --- - true ... -- Stop the replica. -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") +replica_set.hibernate(test_run, 'gc') --- - true ... @@ -320,11 +315,11 @@ box.snapshot() --- - ok ... -wait_gc(1) or box.info.gc() +wait_gc(1) --- - true ... -wait_xlog(2) or fio.listdir('./master') +wait_xlog(2) --- - true ... @@ -333,11 +328,11 @@ wait_xlog(2) or fio.listdir('./master') test_run:cleanup_cluster() --- ... -wait_gc(1) or box.info.gc() +wait_gc(1) --- - true ... -wait_xlog(1) or fio.listdir('./master') +wait_xlog(1) --- - true ... @@ -354,22 +349,25 @@ box.snapshot() --- - ok ... -replica_set.join(test_run, 3) +test_name = 'gc' +--- +... +replica_set.join(test_run, test_name, 3) --- ... -replica_set.stop_all(test_run) +replica_set.stop_all(test_run, test_name) --- ... for i = 11, 50 do s:replace{i} if i % 10 == 0 then box.snapshot() end end --- ... -replica_set.start_all(test_run) +replica_set.start_all(test_run, test_name) --- ... -replica_set.wait_all(test_run) +replica_set.wait_all(test_run, test_name) --- ... 
-replica_set.drop_all(test_run) +replica_set.prune_all(test_run, test_name) --- ... -- @@ -381,11 +379,11 @@ fio = require('fio') --- ... -- Start a replica and set it up as a master for this instance. -test_run:cmd("start server replica") +test_run:cmd("start server gc") --- - true ... -replica_port = test_run:eval('replica', 'return box.cfg.listen')[1] +replica_port = test_run:eval('gc', 'return box.cfg.listen')[1] --- ... replica_port ~= nil @@ -396,15 +394,7 @@ box.cfg{replication = replica_port} --- ... -- Stop the replica and write a few WALs. -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'gc') --- - true ... @@ -442,7 +432,7 @@ box.snapshot() --- - ok ... -wait_xlog(0, 10) or fio.listdir('./master') +wait_xlog(0) or fio.listdir('./master') --- - true ... diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua index a92a6ed7e..ccb83670a 100644 --- a/test/replication/gc.test.lua +++ b/test/replication/gc.test.lua @@ -5,7 +5,7 @@ fiber = require('fiber') fio = require('fio') test_run:cleanup_cluster() -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'gc') -- Make each snapshot trigger garbage collection. default_checkpoint_count = box.cfg.checkpoint_count @@ -15,12 +15,12 @@ test_run:cmd("setopt delimiter ';'") function wait_gc(n) return test_run:wait_cond(function() return #box.info.gc().checkpoints == n - end, 10) + end) or box.info.gc() end; -function wait_xlog(n, timeout) +function wait_xlog(n) return test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == n - end, 10) + end) or fio.glob('./master/*.xlog') end; test_run:cmd("setopt delimiter ''"); @@ -56,21 +56,21 @@ end) test_run:cmd("setopt delimiter ''"); -- Start the replica. 
-test_run:cmd("start server replica") +test_run:cmd("start server gc") -- Despite the fact that we invoked garbage collection that -- would have normally removed the snapshot the replica was -- bootstrapped from, the replica should still receive all -- data from the master. Check it. -test_run:cmd("switch replica") -test_run:wait_cond(function() return box.space.test:count() == 200 end, 10) +test_run:cmd("switch gc") +test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count() box.space.test:count() test_run:cmd("switch default") -- Check that garbage collection removed the snapshot once -- the replica released the corresponding checkpoint. -wait_gc(1) or box.info.gc() -wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until +wait_gc(1) +wait_xlog(1) -- we test garbage collection. box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true) @@ -86,29 +86,29 @@ box.snapshot() -- Invoke garbage collection. Check that it doesn't remove -- xlogs needed by the replica. box.snapshot() -wait_gc(1) or box.info.gc() -wait_xlog(2) or fio.listdir('./master') +wait_gc(1) +wait_xlog(2) -- Resume replication so that the replica catches -- up quickly. box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false) -- Check that the replica received all data from the master. -test_run:cmd("switch replica") -test_run:wait_cond(function() return box.space.test:count() == 300 end, 10) +test_run:cmd("switch gc") +test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count() box.space.test:count() test_run:cmd("switch default") -- Now garbage collection should resume and delete files left -- from the old checkpoint. -wait_gc(1) or box.info.gc() -wait_xlog(0) or fio.listdir('./master') +wait_gc(1) +wait_xlog(0) -- -- Check that the master doesn't delete xlog files sent to the -- replica until it receives a confirmation that the data has -- been applied (gh-2825). 
-- -test_run:cmd("switch replica") +test_run:cmd("switch gc") -- Prevent the replica from applying any rows. box.error.injection.set("ERRINJ_WAL_DELAY", true) test_run:cmd("switch default") @@ -120,25 +120,24 @@ fiber.sleep(0.1) -- wait for master to relay data -- Garbage collection must not delete the old xlog file -- because it is still needed by the replica, but remove -- the old snapshot. -wait_gc(1) or box.info.gc() -wait_xlog(2) or fio.listdir('./master') -test_run:cmd("switch replica") +wait_gc(1) +wait_xlog(2) +test_run:cmd("switch gc") -- Unblock the replica and break replication. box.error.injection.set("ERRINJ_WAL_DELAY", false) box.cfg{replication = {}} -- Restart the replica to reestablish replication. -test_run:cmd("restart server replica") +test_run:cmd("restart server gc") -- Wait for the replica to catch up. -test_run:cmd("switch replica") -test_run:wait_cond(function() return box.space.test:count() == 310 end, 10) +test_run:cmd("switch gc") +test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count() box.space.test:count() test_run:cmd("switch default") -- Now it's safe to drop the old xlog. -wait_gc(1) or box.info.gc() -wait_xlog(1) or fio.listdir('./master') +wait_gc(1) +wait_xlog(1) -- Stop the replica. -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") +replica_set.hibernate(test_run, 'gc') -- Invoke garbage collection. Check that it removes the old -- checkpoint, but keeps the xlog last used by the replica. @@ -149,14 +148,14 @@ _ = s:auto_increment{} box.snapshot() _ = s:auto_increment{} box.snapshot() -wait_gc(1) or box.info.gc() -wait_xlog(2) or fio.listdir('./master') +wait_gc(1) +wait_xlog(2) -- The xlog should only be deleted after the replica -- is unregistered. test_run:cleanup_cluster() -wait_gc(1) or box.info.gc() -wait_xlog(1) or fio.listdir('./master') +wait_gc(1) +wait_xlog(1) -- -- Test that concurrent invocation of the garbage collector works fine. 
-- @@ -164,14 +163,15 @@ s:truncate() for i = 1, 10 do s:replace{i} end box.snapshot() -replica_set.join(test_run, 3) -replica_set.stop_all(test_run) +test_name = 'gc' +replica_set.join(test_run, test_name, 3) +replica_set.stop_all(test_run, test_name) for i = 11, 50 do s:replace{i} if i % 10 == 0 then box.snapshot() end end -replica_set.start_all(test_run) -replica_set.wait_all(test_run) -replica_set.drop_all(test_run) +replica_set.start_all(test_run, test_name) +replica_set.wait_all(test_run, test_name) +replica_set.prune_all(test_run, test_name) -- -- Check that once a replica is removed from the cluster table, @@ -181,15 +181,13 @@ replica_set.drop_all(test_run) fio = require('fio') -- Start a replica and set it up as a master for this instance. -test_run:cmd("start server replica") -replica_port = test_run:eval('replica', 'return box.cfg.listen')[1] +test_run:cmd("start server gc") +replica_port = test_run:eval('gc', 'return box.cfg.listen')[1] replica_port ~= nil box.cfg{replication = replica_port} -- Stop the replica and write a few WALs. -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'gc') _ = s:auto_increment{} box.snapshot() _ = s:auto_increment{} @@ -202,7 +200,7 @@ wait_xlog(3) or fio.listdir('./master') -- all xlog files are removed. test_run:cleanup_cluster() box.snapshot() -wait_xlog(0, 10) or fio.listdir('./master') +wait_xlog(0) or fio.listdir('./master') -- Restore the config. box.cfg{replication = {}} diff --git a/test/replication/gc_no_space.result b/test/replication/gc_no_space.result index b2d3e2075..b50ec439c 100644 --- a/test/replication/gc_no_space.result +++ b/test/replication/gc_no_space.result @@ -20,22 +20,24 @@ test_run:cmd("setopt delimiter ';'") --- - true ... 
-function check_file_count(dir, glob, count) - local files = fio.glob(fio.pathjoin(dir, glob)) - if #files == count then - return true - end - return false, files +function wait_file_count(dir, glob, count) + return test_run:wait_cond(function() + local files = fio.glob(fio.pathjoin(dir, glob)) + if #files == count then + return true + end + return false, files + end) end; --- ... function check_wal_count(count) - return check_file_count(box.cfg.wal_dir, '*.xlog', count) + return wait_file_count(box.cfg.wal_dir, '*.xlog', count) end; --- ... function check_snap_count(count) - return check_file_count(box.cfg.memtx_dir, '*.snap', count) + return wait_file_count(box.cfg.memtx_dir, '*.snap', count) end; --- ... @@ -65,19 +67,13 @@ box.snapshot() -- -- Create a few dead replicas to pin WAL files. -- -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set = require('fast_replica') --- -- true ... -test_run:cmd("start server replica") +replica_set.join(test_run, 'gc_no_space') --- -- true ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") +replica_set.hibernate(test_run, 'gc_no_space') --- - true ... @@ -89,15 +85,11 @@ box.snapshot() --- - ok ... -test_run:cmd("start server replica") ---- -- true -... -test_run:cmd("stop server replica") +replica_set.start(test_run, 'gc_no_space') --- - true ... -test_run:cmd("cleanup server replica") +replica_set.hibernate(test_run, 'gc_no_space') --- - true ... @@ -109,19 +101,11 @@ box.snapshot() --- - ok ... -test_run:cmd("start server replica") ---- -- true -... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") +replica_set.start(test_run, 'gc_no_space') --- - true ... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'gc_no_space') --- - true ... 
diff --git a/test/replication/gc_no_space.test.lua b/test/replication/gc_no_space.test.lua index 6940996fe..df33d501d 100644 --- a/test/replication/gc_no_space.test.lua +++ b/test/replication/gc_no_space.test.lua @@ -11,18 +11,20 @@ fio = require('fio') errinj = box.error.injection test_run:cmd("setopt delimiter ';'") -function check_file_count(dir, glob, count) - local files = fio.glob(fio.pathjoin(dir, glob)) - if #files == count then - return true - end - return false, files +function wait_file_count(dir, glob, count) + return test_run:wait_cond(function() + local files = fio.glob(fio.pathjoin(dir, glob)) + if #files == count then + return true + end + return false, files + end) end; function check_wal_count(count) - return check_file_count(box.cfg.wal_dir, '*.xlog', count) + return wait_file_count(box.cfg.wal_dir, '*.xlog', count) end; function check_snap_count(count) - return check_file_count(box.cfg.memtx_dir, '*.snap', count) + return wait_file_count(box.cfg.memtx_dir, '*.snap', count) end; test_run:cmd("setopt delimiter ''"); @@ -37,25 +39,21 @@ box.snapshot() -- -- Create a few dead replicas to pin WAL files. 
-- -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") +replica_set = require('fast_replica') +replica_set.join(test_run, 'gc_no_space') +replica_set.hibernate(test_run, 'gc_no_space') s:auto_increment{} box.snapshot() -test_run:cmd("start server replica") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") +replica_set.start(test_run, 'gc_no_space') +replica_set.hibernate(test_run, 'gc_no_space') s:auto_increment{} box.snapshot() -test_run:cmd("start server replica") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.start(test_run, 'gc_no_space') +replica_set.drop(test_run, 'gc_no_space') -- -- Make a few checkpoints and check that old WAL files are not diff --git a/test/replication/hot_standby.result b/test/replication/hot_standby.result index b140887df..e003055af 100644 --- a/test/replication/hot_standby.result +++ b/test/replication/hot_standby.result @@ -1,10 +1,10 @@ -env = require('test_run') +test_run = require('test_run').new() --- ... -test_run = env.new() +engine = test_run:get_cfg('engine') --- ... -engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') --- ... box.schema.user.grant('guest', 'replication') @@ -16,19 +16,17 @@ box.schema.func.create('_set_pri_lsn') box.schema.user.grant('guest', 'execute', 'function', '_set_pri_lsn') --- ... -test_run:cmd("create server hot_standby with script='replication/hot_standby.lua', rpl_master=default") +replica_set.create(test_run, 'hot_standby', 'hot_standby') --- -- true ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'hot_standby_replica') --- -- true ... test_run:cmd("start server hot_standby") --- - true ... 
-test_run:cmd("start server replica") +test_run:cmd("start server hot_standby_replica") --- - true ... @@ -36,7 +34,7 @@ test_run:cmd("setopt delimiter ';'") --- - true ... -test_run:cmd("set connection default, hot_standby, replica") +test_run:cmd("set connection default, hot_standby, hot_standby_replica") fiber = require('fiber'); --- ... @@ -95,7 +93,7 @@ test_run:cmd("setopt delimiter ''"); --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch hot_standby_replica") --- - true ... @@ -131,7 +129,7 @@ index = space:create_index('primary', {type = 'tree'}) --- ... -- set begin lsn on master, replica and hot_standby. -test_run:cmd("set variable replica_port to 'replica.listen'") +test_run:cmd("set variable replica_port to 'hot_standby_replica.listen'") --- - true ... @@ -203,7 +201,7 @@ test_run:cmd("switch hot_standby") _wait_lsn(10) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch hot_standby_replica") --- - true ... @@ -234,7 +232,7 @@ test_run:cmd("switch hot_standby") while box.info.status ~= 'running' do fiber.sleep(0.001) end --- ... -test_run:cmd("switch replica") +test_run:cmd("switch hot_standby_replica") --- - true ... @@ -290,7 +288,7 @@ _select(11, 20) - [19, 'the tuple 19'] - [20, 'the tuple 20'] ... -test_run:cmd("switch replica") +test_run:cmd("switch hot_standby_replica") --- - true ... @@ -310,15 +308,10 @@ _select(11, 20) - [19, 'the tuple 19'] - [20, 'the tuple 20'] ... -test_run:cmd("stop server hot_standby") ---- -- true -... -test_run:cmd("cleanup server hot_standby") +replica_set = require('fast_replica') --- -- true ... -test_run:cmd("delete server hot_standby") +replica_set.drop(test_run, 'hot_standby') --- - true ... @@ -334,15 +327,10 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") +replica_set = require('fast_replica') --- -- true ... 
-test_run:cmd("delete server replica") +replica_set.drop(test_run, 'hot_standby_replica') --- - true ... diff --git a/test/replication/hot_standby.test.lua b/test/replication/hot_standby.test.lua index f43982f15..c3afe6d37 100644 --- a/test/replication/hot_standby.test.lua +++ b/test/replication/hot_standby.test.lua @@ -1,17 +1,17 @@ -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') box.schema.user.grant('guest', 'replication') box.schema.func.create('_set_pri_lsn') box.schema.user.grant('guest', 'execute', 'function', '_set_pri_lsn') -test_run:cmd("create server hot_standby with script='replication/hot_standby.lua', rpl_master=default") -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'hot_standby', 'hot_standby') +replica_set.create(test_run, 'hot_standby_replica') test_run:cmd("start server hot_standby") -test_run:cmd("start server replica") +test_run:cmd("start server hot_standby_replica") test_run:cmd("setopt delimiter ';'") -test_run:cmd("set connection default, hot_standby, replica") +test_run:cmd("set connection default, hot_standby, hot_standby_replica") fiber = require('fiber'); while box.info.id == 0 do fiber.sleep(0.01) end; while box.space['_priv']:len() < 1 do fiber.sleep(0.001) end; @@ -60,7 +60,7 @@ do end; test_run:cmd("setopt delimiter ''"); -test_run:cmd("switch replica") +test_run:cmd("switch hot_standby_replica") fiber = require('fiber') test_run:cmd("switch hot_standby") fiber = require('fiber') @@ -73,7 +73,7 @@ space = box.schema.space.create('tweedledum', {engine = engine}) index = space:create_index('primary', {type = 'tree'}) -- set begin lsn on master, replica and hot_standby. 
-test_run:cmd("set variable replica_port to 'replica.listen'") +test_run:cmd("set variable replica_port to 'hot_standby_replica.listen'") REPLICA = require('uri').parse(tostring(replica_port)) REPLICA ~= nil a = (require 'net.box').connect(REPLICA.host, REPLICA.service) @@ -93,14 +93,14 @@ _select(1, 10) test_run:cmd("switch hot_standby") _wait_lsn(10) -test_run:cmd("switch replica") +test_run:cmd("switch hot_standby_replica") _wait_lsn(10) _select(1, 10) test_run:cmd("stop server default") test_run:cmd("switch hot_standby") while box.info.status ~= 'running' do fiber.sleep(0.001) end -test_run:cmd("switch replica") +test_run:cmd("switch hot_standby_replica") -- hot_standby.listen is garbage, since hot_standby.lua -- uses MASTER environment variable for its listen @@ -115,17 +115,15 @@ test_run:cmd("switch hot_standby") _insert(11, 20) _select(11, 20) -test_run:cmd("switch replica") +test_run:cmd("switch hot_standby_replica") _wait_lsn(10) _select(11, 20) -test_run:cmd("stop server hot_standby") -test_run:cmd("cleanup server hot_standby") -test_run:cmd("delete server hot_standby") +replica_set = require('fast_replica') +replica_set.drop(test_run, 'hot_standby') test_run:cmd("deploy server default") test_run:cmd("start server default") test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set = require('fast_replica') +replica_set.drop(test_run, 'hot_standby_replica') test_run:cleanup_cluster() diff --git a/test/replication/join_vclock.result b/test/replication/join_vclock.result index a9781073d..2387be750 100644 --- a/test/replication/join_vclock.result +++ b/test/replication/join_vclock.result @@ -1,13 +1,13 @@ -fiber = require('fiber') +test_name = 'join_vclock' --- ... -env = require('test_run') +test_run = require('test_run').new() --- ... -replica_set = require('fast_replica') +fiber = require('fiber') --- ... 
-test_run = env.new() +replica_set = require('fast_replica') --- ... engine = test_run:get_cfg('engine') @@ -41,10 +41,10 @@ function repl_f() local i = 0 while not done do s:replace({i, i}) fiber.sleep(0. _ = fiber.create(repl_f) --- ... -replica_set.join(test_run, 1) +replica_set.join(test_run, test_name) --- ... -test_run:cmd("switch replica1") +test_run:cmd("switch "..test_name) --- - true ... @@ -63,14 +63,15 @@ errinj.set("ERRINJ_RELAY_FINAL_SLEEP", false) --- - ok ... -test_run:cmd("switch replica1") +test_run:cmd("switch "..test_name) --- - true ... -cnt = box.space.test.index[0]:count() +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- +- true ... -box.space.test.index.primary:max()[1] == cnt - 1 +test_run:wait_cond(function() return box.space.test.index.primary:max()[1] == box.space.test.index[0]:count() - 1 end) or box.space.test.index[0]:count() --- - true ... @@ -78,8 +79,9 @@ test_run:cmd("switch default") --- - true ... -replica_set.drop_all(test_run) +replica_set.prune(test_run, test_name) --- +- true ... 
box.space.test:drop() --- diff --git a/test/replication/join_vclock.test.lua b/test/replication/join_vclock.test.lua index 0b60dffc2..d6550ddd2 100644 --- a/test/replication/join_vclock.test.lua +++ b/test/replication/join_vclock.test.lua @@ -1,7 +1,7 @@ +test_name = 'join_vclock' +test_run = require('test_run').new() fiber = require('fiber') -env = require('test_run') replica_set = require('fast_replica') -test_run = env.new() engine = test_run:get_cfg('engine') errinj = box.error.injection @@ -17,19 +17,19 @@ done = false function repl_f() local i = 0 while not done do s:replace({i, i}) fiber.sleep(0.001) i = i + 1 end ch:put(true) end _ = fiber.create(repl_f) -replica_set.join(test_run, 1) -test_run:cmd("switch replica1") +replica_set.join(test_run, test_name) +test_run:cmd("switch "..test_name) test_run:cmd("switch default") done = true ch:get() errinj.set("ERRINJ_RELAY_FINAL_SLEEP", false) -test_run:cmd("switch replica1") -cnt = box.space.test.index[0]:count() -box.space.test.index.primary:max()[1] == cnt - 1 +test_run:cmd("switch "..test_name) +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.space.test.index.primary:max()[1] == box.space.test.index[0]:count() - 1 end) or box.space.test.index[0]:count() test_run:cmd("switch default") -replica_set.drop_all(test_run) +replica_set.prune(test_run, test_name) box.space.test:drop() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/local_spaces.result b/test/replication/local_spaces.result index ed1b76da8..9bd8854c3 100644 --- a/test/replication/local_spaces.result +++ b/test/replication/local_spaces.result @@ -1,10 +1,10 @@ -env = require('test_run') +test_run = require('test_run').new() --- ... -test_run = env.new() +engine = test_run:get_cfg('engine') --- ... -engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') --- ... 
-- @@ -96,15 +96,14 @@ _ = s3:insert{2} box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'local_spaces') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server local_spaces") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch local_spaces") --- - true ... @@ -162,10 +161,10 @@ _ = s3:insert{3} vclock = test_run:get_vclock('default') --- ... -_ = test_run:wait_vclock('replica', vclock) +_ = test_run:wait_vclock('local_spaces', vclock) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch local_spaces") --- - true ... @@ -187,7 +186,7 @@ box.space.test3:select() - [2, 2, 2] - [3, 3, 3] ... -test_run:cmd("restart server replica") +test_run:cmd("restart server local_spaces") box.space.test1:select() --- - - [1] @@ -208,15 +207,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'local_spaces') --- - true ... 
diff --git a/test/replication/local_spaces.test.lua b/test/replication/local_spaces.test.lua index bb7294538..551eb2aed 100644 --- a/test/replication/local_spaces.test.lua +++ b/test/replication/local_spaces.test.lua @@ -1,6 +1,6 @@ -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') -- -- gh-3443: Check that changes done to spaces marked as local @@ -42,10 +42,10 @@ _ = s2:insert{2} _ = s3:insert{2} box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") +replica_set.create(test_run, 'local_spaces') +test_run:cmd("start server local_spaces") -test_run:cmd("switch replica") +test_run:cmd("switch local_spaces") box.space.test1.is_local box.space.test2.is_local box.space.test3.is_local @@ -62,21 +62,19 @@ _ = s1:insert{3} _ = s2:insert{3} _ = s3:insert{3} vclock = test_run:get_vclock('default') -_ = test_run:wait_vclock('replica', vclock) +_ = test_run:wait_vclock('local_spaces', vclock) -test_run:cmd("switch replica") +test_run:cmd("switch local_spaces") box.space.test1:select() box.space.test2:select() box.space.test3:select() -test_run:cmd("restart server replica") +test_run:cmd("restart server local_spaces") box.space.test1:select() box.space.test2:select() box.space.test3:select() test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'local_spaces') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/lua/fast_replica.lua b/test/replication/lua/fast_replica.lua index 8c772c41f..b98598de5 100644 --- a/test/replication/lua/fast_replica.lua +++ b/test/replication/lua/fast_replica.lua @@ -1,18 +1,23 @@ -function join(inspector, n) - local path = 
os.getenv('TARANTOOL_SRC_DIR') +function create(inspector, name, replica) + replica = replica or 'replica' + os.execute('echo "Creating replica '..name..'"') + os.execute('mkdir -p tmp') + os.execute('cp '..os.getenv('TARANTOOL_SRC_DIR')..'/test/replication/'..replica..'.lua ./tmp/'..name..'.lua') + os.execute('chmod +x ./tmp/'..name..'.lua') + inspector:cmd("create server "..name.." with rpl_master=default, script='"..box.cfg.wal_dir.."/../tmp/"..name..".lua'") +end + +function join(inspector, name, n, replica) + n = n or 1 for i=1,n do local rid = tostring(i) - os.execute('mkdir -p tmp') - os.execute('cp '..path..'/test/replication/replica.lua ./tmp/replica'..rid..'.lua') - os.execute('chmod +x ./tmp/replica'..rid..'.lua') - local out_dir = box.cfg.wal_dir - inspector:cmd("create server replica"..rid.." with rpl_master=default, script='"..out_dir.."/../tmp/replica"..rid..".lua'") - inspector:cmd("start server replica"..rid) + if n == 1 then rid = '' end + create(inspector, name..rid, replica) + start(inspector, name..rid) end end - function call_all(callback) local all = box.space._cluster:select{} for _, tuple in pairs(all) do @@ -24,45 +29,91 @@ function call_all(callback) end function unregister(inspector, id) - box.space._cluster:delete{id} + id = id or 2 + if box.space._cluster:delete{id} then + return true + end + return false +end + +function id_to_str(id) + local strnum + if id == nil then + strnum = '' + else + strnum = tostring(id - 1) + end + return strnum +end + +-- replica commands + +function start(inspector, name, id) + return inspector:cmd('start server '..name..id_to_str(id)) +end + +function stop(inspector, name, id) + return inspector:cmd('stop server '..name..id_to_str(id)) +end + +function cleanup(inspector, name, id) + return inspector:cmd('cleanup server '..name..id_to_str(id)) +end + +function wait(inspector, name, id) + return inspector:wait_lsn(name..id_to_str(id), 'default') end -function start(inspector, id) - inspector:cmd('start 
server replica'..tostring(id - 1)) +function delete(inspector, name, id) + return inspector:cmd('delete server '..name..id_to_str(id)) end -function stop(inspector, id) - inspector:cmd('stop server replica'..tostring(id - 1)) +-- replica modes + +function hibernate(inspector, name, id) + return stop(inspector, name, id) and cleanup(inspector, name, id) end -function wait(inspector, id) - inspector:wait_lsn('replica'..tostring(id - 1), 'default') +function drop(inspector, name, id) + return hibernate(inspector, name, id) and delete(inspector, name, id) end -function delete(inspector, id) - inspector:cmd('stop server replica'..tostring(id - 1)) - inspector:cmd('delete server replica'..tostring(id - 1)) +function prune(inspector, name, id) + return unregister(inspector, id) and drop(inspector, name, id) +end + +-- multi calls + +function start_all(inspector, name) + call_all(function (id) start(inspector, name, id) end) end -function drop(inspector, id) - unregister(inspector, id) - delete(inspector, id) +function stop_all(inspector, name) + call_all(function (id) stop(inspector, name, id) end) end -function start_all(inspector) - call_all(function (id) start(inspector, id) end) +function cleanup_all(inspector, name) + call_all(function (id) cleanup(inspector, name, id) end) end -function stop_all(inspector) - call_all(function (id) stop(inspector, id) end) +function wait_all(inspector, name) + call_all(function (id) wait(inspector, name, id) end) end -function wait_all(inspector) - call_all(function (id) wait(inspector, id) end) +function delete_all(inspector, name) + call_all(function (id) delete(inspector, name, id) end) end -function drop_all(inspector) - call_all(function (id) drop(inspector, id) end) +function hibernate_all(inspector, name) + call_all(function (id) hibernate(inspector, name, id) end) +end + +function drop_all(inspector, name) + call_all(function (id) drop(inspector, name, id) end) +end + +function prune_all(inspector, name) + call_all(function 
(id) prune(inspector, name, id) end) end function vclock_diff(left, right) @@ -79,12 +130,24 @@ function vclock_diff(left, right) end return { + create = create; join = join; + start = start; start_all = start_all; + stop = stop; stop_all = stop_all; + cleanup = cleanup; + cleanup_all = cleanup_all; + wait = wait; wait_all = wait_all; + delete = delete; + delete_all = delete_all; + hibernate = hibernate; + hibernate_all = hibernate_all; + drop = drop; drop_all = drop_all; + prune = prune; + prune_all = prune_all; vclock_diff = vclock_diff; unregister = unregister; - delete = delete; } diff --git a/test/replication/master.lua b/test/replication/master.lua index 9b96b7891..340b9cc81 100644 --- a/test/replication/master.lua +++ b/test/replication/master.lua @@ -3,7 +3,7 @@ os = require('os') box.cfg({ listen = os.getenv("LISTEN"), memtx_memory = 107374182, - replication_connect_timeout = 0.5, + replication_connect_timeout = 100, replication_timeout = 0.1 }) diff --git a/test/replication/master_quorum.lua b/test/replication/master_quorum.lua index 05272ac5e..e00f1f932 100644 --- a/test/replication/master_quorum.lua +++ b/test/replication/master_quorum.lua @@ -6,7 +6,7 @@ local INSTANCE_ID = string.match(arg[0], "%d") local SOCKET_DIR = require('fio').cwd() local TIMEOUT = tonumber(arg[1]) -local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 30.0 +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 local function instance_uri(instance_id) --return 'localhost:'..(3310 + instance_id) diff --git a/test/replication/misc.lua b/test/replication/misc.lua new file mode 100644 index 000000000..65cf8c8fe --- /dev/null +++ b/test/replication/misc.lua @@ -0,0 +1,37 @@ +#!/usr/bin/env tarantool + +-- get instance name from filename (misc1.lua => misc1) +local INSTANCE_ID = string.match(arg[0], "%d") +local USER = 'cluster' +local PASSWORD = 'somepassword' +local SOCKET_DIR = require('fio').cwd() +local TIMEOUT = tonumber(arg[1]) +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 
60.0 + +local function instance_uri(instance_id) + --return 'localhost:'..(3310 + instance_id) + return SOCKET_DIR..'/misc'..instance_id..'.sock'; +end + +-- start console first +require('console').listen(os.getenv('ADMIN')) + +box.cfg({ + listen = instance_uri(INSTANCE_ID); +-- log_level = 7; + replication = { + USER..':'..PASSWORD..'@'..instance_uri(1); + USER..':'..PASSWORD..'@'..instance_uri(2); + USER..':'..PASSWORD..'@'..instance_uri(3); + }; + replication_timeout = TIMEOUT; + replication_connect_timeout = CON_TIMEOUT; +}) + +box.once("bootstrap", function() + local test_run = require('test_run').new() + box.schema.user.create(USER, { password = PASSWORD }) + box.schema.user.grant(USER, 'replication') + box.schema.space.create('test', {engine = test_run:get_cfg('engine')}) + box.space.test:create_index('primary') +end) diff --git a/test/replication/misc.result b/test/replication/misc.result index ab827c501..45da3894d 100644 --- a/test/replication/misc.result +++ b/test/replication/misc.result @@ -4,6 +4,9 @@ uuid = require('uuid') test_run = require('test_run').new() --- ... +replica_set = require('fast_replica') +--- +... box.schema.user.grant('guest', 'replication') --- ... @@ -54,41 +57,32 @@ box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replica_uuid = uuid.new() --- ... -test_run:cmd('create server test with rpl_master=default, script="replication/replica_uuid.lua"') ---- -- true -... -test_run:cmd(string.format('start server test with args="%s"', replica_uuid)) +replica_set.create(test_run, 'misc_gh3111', 'replica_uuid') --- -- true ... -test_run:cmd('stop server test') +test_run:cmd(string.format('start server misc_gh3111 with args="%s"', replica_uuid)) --- - true ... -test_run:cmd('cleanup server test') +replica_set.hibernate(test_run, 'misc_gh3111') --- - true ... box.cfg{read_only = true} --- ... 
-test_run:cmd(string.format('start server test with args="%s"', replica_uuid)) +test_run:cmd(string.format('start server misc_gh3111 with args="%s"', replica_uuid)) --- - true ... -test_run:cmd('stop server test') ---- -- true -... -test_run:cmd('cleanup server test') +replica_set.hibernate(test_run, 'misc_gh3111') --- - true ... box.cfg{read_only = false} --- ... -test_run:cmd('delete server test') +test_run:cmd('delete server misc_gh3111') --- - true ... @@ -96,7 +90,7 @@ test_run:cleanup_cluster() --- ... -- gh-3160 - Send heartbeats if there are changes from a remote master only -SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +SERVERS = { 'misc1', 'misc2', 'misc3' } --- ... -- Deploy a cluster. @@ -106,27 +100,27 @@ test_run:create_cluster(SERVERS, "replication", {args="0.1"}) test_run:wait_fullmesh(SERVERS) --- ... -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch misc1") --- - true ... test_run = require('test_run').new() --- ... -box.cfg{replication_timeout = 0.01, replication_connect_timeout=0.01} +box.cfg{replication_timeout = 0.03, replication_connect_timeout=0.03} --- ... -test_run:cmd("switch autobootstrap2") +test_run:cmd("switch misc2") --- - true ... test_run = require('test_run').new() --- ... -box.cfg{replication_timeout = 0.01, replication_connect_timeout=0.01} +box.cfg{replication_timeout = 0.03, replication_connect_timeout=0.03} --- ... -test_run:cmd("switch autobootstrap3") +test_run:cmd("switch misc3") --- - true ... @@ -136,7 +130,7 @@ test_run = require('test_run').new() fiber=require('fiber') --- ... -box.cfg{replication_timeout = 0.01, replication_connect_timeout=0.01} +box.cfg{replication_timeout = 0.03, replication_connect_timeout=0.03} --- ... 
_ = box.schema.space.create('test_timeout'):create_index('pk') @@ -149,7 +143,7 @@ test_run:cmd("setopt delimiter ';'") function wait_follow(replicaA, replicaB) return test_run:wait_cond(function() return replicaA.status ~= 'follow' or replicaB.status ~= 'follow' - end, 0.01) + end, 0.1) end ; --- ... @@ -158,7 +152,7 @@ function test_timeout() local replicaB = box.info.replication[3].upstream or box.info.replication[2].upstream local follows = test_run:wait_cond(function() return replicaA.status == 'follow' or replicaB.status == 'follow' - end, 0.1) + end, 1) if not follows then error('replicas not in follow status') end for i = 0, 99 do box.space.test_timeout:replace({1}) @@ -180,7 +174,7 @@ test_timeout() ... -- gh-3247 - Sequence-generated value is not replicated in case -- the request was sent via iproto. -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch misc1") --- - true ... @@ -217,13 +211,13 @@ box.space.space1:select{} --- - - [2, 1, 'data'] ... -vclock = test_run:get_vclock("autobootstrap1") +vclock = test_run:get_vclock("misc1") --- ... -_ = test_run:wait_vclock("autobootstrap2", vclock) +_ = test_run:wait_vclock("misc2", vclock) --- ... -test_run:cmd("switch autobootstrap2") +test_run:cmd("switch misc2") --- - true ... @@ -231,7 +225,7 @@ box.space.space1:select{} --- - - [2, 1, 'data'] ... -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch misc1") --- - true ... @@ -268,15 +262,14 @@ lim.rlim_cur = 64 rlimit.setrlimit(rlimit.RLIMIT_NOFILE, lim) --- ... -test_run:cmd('create server sock with rpl_master=default, script="replication/replica.lua"') +replica_set.create(test_run, 'misc_gh3642') --- -- true ... -test_run:cmd('start server sock') +test_run:cmd('start server misc_gh3642') --- - true ... -test_run:cmd('switch sock') +test_run:cmd('switch misc_gh3642') --- - true ... @@ -318,15 +311,7 @@ lim.rlim_cur = old_fno rlimit.setrlimit(rlimit.RLIMIT_NOFILE, lim) --- ... -test_run:cmd("stop server sock") ---- -- true -... 
-test_run:cmd("cleanup server sock") ---- -- true -... -test_run:cmd("delete server sock") +replica_set.drop(test_run, 'misc_gh3642') --- - true ... @@ -337,42 +322,40 @@ box.schema.user.revoke('guest', 'replication') --- ... -- gh-3510 assertion failure in replica_on_applier_disconnect() -test_run:cmd('create server er_load1 with script="replication/er_load1.lua"') +replica_set.create(test_run, 'misc_gh3510_1', 'er_load1') --- -- true ... -test_run:cmd('create server er_load2 with script="replication/er_load2.lua"') +replica_set.create(test_run, 'misc_gh3510_2', 'er_load2') --- -- true ... -test_run:cmd('start server er_load1 with wait=False, wait_load=False') +test_run:cmd('start server misc_gh3510_1 with wait=False, wait_load=False') --- - true ... --- instance er_load2 will fail with error ER_READONLY. this is ok. --- We only test here that er_load1 doesn't assert. -test_run:cmd('start server er_load2 with wait=True, wait_load=True, crash_expected = True') +-- instance misc_gh3510_2 will fail with error ER_READONLY. this is ok. +-- We only test here that misc_gh3510_1 doesn't assert. +test_run:cmd('start server misc_gh3510_2 with wait=True, wait_load=True, crash_expected = True') --- - false ... -test_run:cmd('stop server er_load1') +test_run:cmd('stop server misc_gh3510_1') --- - true ... --- er_load2 exits automatically. -test_run:cmd('cleanup server er_load1') +-- misc_gh3510_2 exits automatically. +test_run:cmd('cleanup server misc_gh3510_1') --- - true ... -test_run:cmd('cleanup server er_load2') +test_run:cmd('cleanup server misc_gh3510_2') --- - true ... -test_run:cmd('delete server er_load1') +test_run:cmd('delete server misc_gh3510_1') --- - true ... -test_run:cmd('delete server er_load2') +test_run:cmd('delete server misc_gh3510_2') --- - true ... @@ -386,11 +369,10 @@ test_run:cleanup_cluster() fiber = require('fiber') --- ... 
-test_run:cmd("create server replica_auth with rpl_master=default, script='replication/replica_auth.lua'") +replica_set.create(test_run, 'misc_gh3637', 'replica_auth') --- -- true ... -test_run:cmd("start server replica_auth with wait=False, wait_load=False, args='cluster:pass 0.05'") +test_run:cmd("start server misc_gh3637 with wait=False, wait_load=False, args='cluster:pass 0.05'") --- - true ... @@ -404,24 +386,16 @@ box.schema.user.create('cluster', {password='pass'}) box.schema.user.grant('cluster', 'replication') --- ... -while box.info.replication[2] == nil do fiber.sleep(0.01) end +while box.info.replication[2] == nil do fiber.sleep(0.03) end --- ... vclock = test_run:get_vclock('default') --- ... -_ = test_run:wait_vclock('replica_auth', vclock) ---- -... -test_run:cmd("stop server replica_auth") ---- -- true -... -test_run:cmd("cleanup server replica_auth") +_ = test_run:wait_vclock('misc_gh3637', vclock) --- -- true ... -test_run:cmd("delete server replica_auth") +replica_set.drop(test_run, 'misc_gh3637') --- - true ... @@ -438,15 +412,14 @@ box.schema.user.drop('cluster') box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'misc_gh3610') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server misc_gh3610") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch misc_gh3610") --- - true ... @@ -469,11 +442,11 @@ listen = box.cfg.listen box.cfg{listen = ''} --- ... -test_run:cmd("switch replica") +test_run:cmd("switch misc_gh3610") --- - true ... -box.cfg{replication_connect_quorum = 0, replication_connect_timeout = 0.01} +box.cfg{replication_connect_quorum = 0, replication_connect_timeout = 0.03} --- ... box.cfg{replication = {replication, replication}} @@ -486,18 +459,10 @@ test_run:cmd("switch default") box.cfg{listen = listen} --- ... 
-while test_run:grep_log('replica', 'duplicate connection') == nil do fiber.sleep(0.01) end ---- -... -test_run:cmd("stop server replica") +while test_run:grep_log('misc_gh3610', 'duplicate connection') == nil do fiber.sleep(0.03) end --- -- true -... -test_run:cmd("cleanup server replica") ---- -- true ... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'misc_gh3610') --- - true ... @@ -514,11 +479,10 @@ box.schema.user.revoke('guest', 'replication') box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'misc_gh3711') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server misc_gh3711") --- - true ... @@ -528,7 +492,7 @@ test_run:cmd("start server replica") box.schema.user.revoke('guest', 'replication') --- ... -test_run:cmd("switch replica") +test_run:cmd("switch misc_gh3711") --- - true ... @@ -557,7 +521,7 @@ test_run:cmd("switch default") box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("switch replica") +test_run:cmd("switch misc_gh3711") --- - true ... @@ -574,7 +538,7 @@ test_run:cmd("switch default") box.schema.user.revoke('guest', 'replication') --- ... -test_run:cmd("switch replica") +test_run:cmd("switch misc_gh3711") --- - true ... @@ -589,15 +553,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'misc_gh3711') --- - true ... @@ -607,18 +563,17 @@ test_run:cleanup_cluster() -- -- gh-3704 move cluster id check to replica -- -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'misc_gh3704') --- -- true ... box.schema.user.grant("guest", "replication") --- ... 
-test_run:cmd("start server replica") +test_run:cmd("start server misc_gh3704") --- - true ... -test_run:grep_log("replica", "REPLICASET_UUID_MISMATCH") +test_run:grep_log("misc_gh3704", "REPLICASET_UUID_MISMATCH") --- - null ... @@ -627,7 +582,7 @@ box.info.replication[2].downstream.status - follow ... -- change master's cluster uuid and check that replica doesn't connect. -test_run:cmd("stop server replica") +test_run:cmd("stop server misc_gh3704") --- - true ... @@ -635,27 +590,19 @@ _ = box.space._schema:replace{'cluster', tostring(uuid.new())} --- ... -- master believes replica is in cluster, but their cluster UUIDs differ. -test_run:cmd("start server replica") +test_run:cmd("start server misc_gh3704") --- - true ... -test_run:wait_log("replica", "REPLICASET_UUID_MISMATCH", nil, 1.0) +test_run:wait_log("misc_gh3704", "REPLICASET_UUID_MISMATCH", nil, 1.0) --- - REPLICASET_UUID_MISMATCH ... -box.info.replication[2].downstream.status ---- -- stopped -... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") +test_run:wait_cond(function() return box.info.replication[2].downstream.status == 'stopped' end) or box.info.replication[2].downstream.status --- - true ... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'misc_gh3704') --- - true ... 
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua index eda5310b6..ec38feefa 100644 --- a/test/replication/misc.test.lua +++ b/test/replication/misc.test.lua @@ -1,5 +1,6 @@ uuid = require('uuid') test_run = require('test_run').new() +replica_set = require('fast_replica') box.schema.user.grant('guest', 'replication') @@ -23,47 +24,45 @@ box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = -- gh-3111 - Allow to rebootstrap a replica from a read-only master replica_uuid = uuid.new() -test_run:cmd('create server test with rpl_master=default, script="replication/replica_uuid.lua"') -test_run:cmd(string.format('start server test with args="%s"', replica_uuid)) -test_run:cmd('stop server test') -test_run:cmd('cleanup server test') +replica_set.create(test_run, 'misc_gh3111', 'replica_uuid') +test_run:cmd(string.format('start server misc_gh3111 with args="%s"', replica_uuid)) +replica_set.hibernate(test_run, 'misc_gh3111') box.cfg{read_only = true} -test_run:cmd(string.format('start server test with args="%s"', replica_uuid)) -test_run:cmd('stop server test') -test_run:cmd('cleanup server test') +test_run:cmd(string.format('start server misc_gh3111 with args="%s"', replica_uuid)) +replica_set.hibernate(test_run, 'misc_gh3111') box.cfg{read_only = false} -test_run:cmd('delete server test') +test_run:cmd('delete server misc_gh3111') test_run:cleanup_cluster() -- gh-3160 - Send heartbeats if there are changes from a remote master only -SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +SERVERS = { 'misc1', 'misc2', 'misc3' } -- Deploy a cluster. 
test_run:create_cluster(SERVERS, "replication", {args="0.1"}) test_run:wait_fullmesh(SERVERS) -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch misc1") test_run = require('test_run').new() -box.cfg{replication_timeout = 0.01, replication_connect_timeout=0.01} -test_run:cmd("switch autobootstrap2") +box.cfg{replication_timeout = 0.03, replication_connect_timeout=0.03} +test_run:cmd("switch misc2") test_run = require('test_run').new() -box.cfg{replication_timeout = 0.01, replication_connect_timeout=0.01} -test_run:cmd("switch autobootstrap3") +box.cfg{replication_timeout = 0.03, replication_connect_timeout=0.03} +test_run:cmd("switch misc3") test_run = require('test_run').new() fiber=require('fiber') -box.cfg{replication_timeout = 0.01, replication_connect_timeout=0.01} +box.cfg{replication_timeout = 0.03, replication_connect_timeout=0.03} _ = box.schema.space.create('test_timeout'):create_index('pk') test_run:cmd("setopt delimiter ';'") function wait_follow(replicaA, replicaB) return test_run:wait_cond(function() return replicaA.status ~= 'follow' or replicaB.status ~= 'follow' - end, 0.01) + end, 0.1) end ; function test_timeout() local replicaA = box.info.replication[1].upstream or box.info.replication[2].upstream local replicaB = box.info.replication[3].upstream or box.info.replication[2].upstream local follows = test_run:wait_cond(function() return replicaA.status == 'follow' or replicaB.status == 'follow' - end, 0.1) + end, 1) if not follows then error('replicas not in follow status') end for i = 0, 99 do box.space.test_timeout:replace({1}) @@ -78,7 +77,7 @@ test_timeout() -- gh-3247 - Sequence-generated value is not replicated in case -- the request was sent via iproto. 
-test_run:cmd("switch autobootstrap1") +test_run:cmd("switch misc1") net_box = require('net.box') _ = box.schema.space.create('space1') _ = box.schema.sequence.create('seq') @@ -89,11 +88,11 @@ c = net_box.connect(box.cfg.listen) c.space.space1:insert{box.NULL, "data"} -- fails, but bumps sequence value c.space.space1:insert{box.NULL, 1, "data"} box.space.space1:select{} -vclock = test_run:get_vclock("autobootstrap1") -_ = test_run:wait_vclock("autobootstrap2", vclock) -test_run:cmd("switch autobootstrap2") +vclock = test_run:get_vclock("misc1") +_ = test_run:wait_vclock("misc2", vclock) +test_run:cmd("switch misc2") box.space.space1:select{} -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch misc1") box.space.space1:drop() test_run:cmd("switch default") @@ -109,9 +108,9 @@ old_fno = lim.rlim_cur lim.rlim_cur = 64 rlimit.setrlimit(rlimit.RLIMIT_NOFILE, lim) -test_run:cmd('create server sock with rpl_master=default, script="replication/replica.lua"') -test_run:cmd('start server sock') -test_run:cmd('switch sock') +replica_set.create(test_run, 'misc_gh3642') +test_run:cmd('start server misc_gh3642') +test_run:cmd('switch misc_gh3642') test_run = require('test_run').new() fiber = require('fiber') test_run:cmd("setopt delimiter ';'") @@ -132,26 +131,24 @@ test_run:cmd('switch default') lim.rlim_cur = old_fno rlimit.setrlimit(rlimit.RLIMIT_NOFILE, lim) -test_run:cmd("stop server sock") -test_run:cmd("cleanup server sock") -test_run:cmd("delete server sock") +replica_set.drop(test_run, 'misc_gh3642') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') -- gh-3510 assertion failure in replica_on_applier_disconnect() -test_run:cmd('create server er_load1 with script="replication/er_load1.lua"') -test_run:cmd('create server er_load2 with script="replication/er_load2.lua"') -test_run:cmd('start server er_load1 with wait=False, wait_load=False') --- instance er_load2 will fail with error ER_READONLY. this is ok. 
--- We only test here that er_load1 doesn't assert. -test_run:cmd('start server er_load2 with wait=True, wait_load=True, crash_expected = True') -test_run:cmd('stop server er_load1') --- er_load2 exits automatically. -test_run:cmd('cleanup server er_load1') -test_run:cmd('cleanup server er_load2') -test_run:cmd('delete server er_load1') -test_run:cmd('delete server er_load2') +replica_set.create(test_run, 'misc_gh3510_1', 'er_load1') +replica_set.create(test_run, 'misc_gh3510_2', 'er_load2') +test_run:cmd('start server misc_gh3510_1 with wait=False, wait_load=False') +-- instance misc_gh3510_2 will fail with error ER_READONLY. this is ok. +-- We only test here that misc_gh3510_1 doesn't assert. +test_run:cmd('start server misc_gh3510_2 with wait=True, wait_load=True, crash_expected = True') +test_run:cmd('stop server misc_gh3510_1') +-- misc_gh3510_2 exits automatically. +test_run:cmd('cleanup server misc_gh3510_1') +test_run:cmd('cleanup server misc_gh3510_2') +test_run:cmd('delete server misc_gh3510_1') +test_run:cmd('delete server misc_gh3510_2') test_run:cleanup_cluster() -- @@ -159,20 +156,18 @@ test_run:cleanup_cluster() -- an error. Now check that we don't hang and successfully connect. -- fiber = require('fiber') -test_run:cmd("create server replica_auth with rpl_master=default, script='replication/replica_auth.lua'") -test_run:cmd("start server replica_auth with wait=False, wait_load=False, args='cluster:pass 0.05'") +replica_set.create(test_run, 'misc_gh3637', 'replica_auth') +test_run:cmd("start server misc_gh3637 with wait=False, wait_load=False, args='cluster:pass 0.05'") -- Wait a bit to make sure replica waits till user is created. 
fiber.sleep(0.1) box.schema.user.create('cluster', {password='pass'}) box.schema.user.grant('cluster', 'replication') -while box.info.replication[2] == nil do fiber.sleep(0.01) end +while box.info.replication[2] == nil do fiber.sleep(0.03) end vclock = test_run:get_vclock('default') -_ = test_run:wait_vclock('replica_auth', vclock) +_ = test_run:wait_vclock('misc_gh3637', vclock) -test_run:cmd("stop server replica_auth") -test_run:cmd("cleanup server replica_auth") -test_run:cmd("delete server replica_auth") +replica_set.drop(test_run, 'misc_gh3637') test_run:cleanup_cluster() box.schema.user.drop('cluster') @@ -182,9 +177,9 @@ box.schema.user.drop('cluster') -- when trying to connect to the same master twice. -- box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") -test_run:cmd("switch replica") +replica_set.create(test_run, 'misc_gh3610') +test_run:cmd("start server misc_gh3610") +test_run:cmd("switch misc_gh3610") replication = box.cfg.replication[1] box.cfg{replication = {replication, replication}} @@ -193,17 +188,15 @@ test_run:cmd("switch default") listen = box.cfg.listen box.cfg{listen = ''} -test_run:cmd("switch replica") -box.cfg{replication_connect_quorum = 0, replication_connect_timeout = 0.01} +test_run:cmd("switch misc_gh3610") +box.cfg{replication_connect_quorum = 0, replication_connect_timeout = 0.03} box.cfg{replication = {replication, replication}} test_run:cmd("switch default") box.cfg{listen = listen} -while test_run:grep_log('replica', 'duplicate connection') == nil do fiber.sleep(0.01) end +while test_run:grep_log('misc_gh3610', 'duplicate connection') == nil do fiber.sleep(0.03) end -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'misc_gh3610') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') @@ 
-212,14 +205,14 @@ box.schema.user.revoke('guest', 'replication') -- configuration didn't change. -- box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") +replica_set.create(test_run, 'misc_gh3711') +test_run:cmd("start server misc_gh3711") -- Access rights are checked only during reconnect. If the new -- and old configurations are equivalent, no reconnect will be -- issued and replication should continue working. box.schema.user.revoke('guest', 'replication') -test_run:cmd("switch replica") +test_run:cmd("switch misc_gh3711") replication = box.cfg.replication[1] box.cfg{replication = {replication}} box.info.status == 'running' @@ -229,39 +222,35 @@ box.info.status == 'running' -- Check that comparison of tables works as expected as well. test_run:cmd("switch default") box.schema.user.grant('guest', 'replication') -test_run:cmd("switch replica") +test_run:cmd("switch misc_gh3711") replication = box.cfg.replication table.insert(replication, box.cfg.listen) test_run:cmd("switch default") box.schema.user.revoke('guest', 'replication') -test_run:cmd("switch replica") +test_run:cmd("switch misc_gh3711") box.cfg{replication = replication} box.info.status == 'running' test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'misc_gh3711') test_run:cleanup_cluster() -- -- gh-3704 move cluster id check to replica -- -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'misc_gh3704') box.schema.user.grant("guest", "replication") -test_run:cmd("start server replica") -test_run:grep_log("replica", "REPLICASET_UUID_MISMATCH") +test_run:cmd("start server misc_gh3704") +test_run:grep_log("misc_gh3704", "REPLICASET_UUID_MISMATCH") 
box.info.replication[2].downstream.status -- change master's cluster uuid and check that replica doesn't connect. -test_run:cmd("stop server replica") +test_run:cmd("stop server misc_gh3704") _ = box.space._schema:replace{'cluster', tostring(uuid.new())} -- master believes replica is in cluster, but their cluster UUIDs differ. -test_run:cmd("start server replica") -test_run:wait_log("replica", "REPLICASET_UUID_MISMATCH", nil, 1.0) -box.info.replication[2].downstream.status +test_run:cmd("start server misc_gh3704") +test_run:wait_log("misc_gh3704", "REPLICASET_UUID_MISMATCH", nil, 1.0) +test_run:wait_cond(function() return box.info.replication[2].downstream.status == 'stopped' end) or box.info.replication[2].downstream.status -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'misc_gh3704') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/misc1.lua b/test/replication/misc1.lua new file mode 120000 index 000000000..ced948b89 --- /dev/null +++ b/test/replication/misc1.lua @@ -0,0 +1 @@ +misc.lua \ No newline at end of file diff --git a/test/replication/misc2.lua b/test/replication/misc2.lua new file mode 120000 index 000000000..ced948b89 --- /dev/null +++ b/test/replication/misc2.lua @@ -0,0 +1 @@ +misc.lua \ No newline at end of file diff --git a/test/replication/misc3.lua b/test/replication/misc3.lua new file mode 120000 index 000000000..ced948b89 --- /dev/null +++ b/test/replication/misc3.lua @@ -0,0 +1 @@ +misc.lua \ No newline at end of file diff --git a/test/replication/on_replace.lua b/test/replication/on_replace.lua index 40c12a9ea..93544afbe 100644 --- a/test/replication/on_replace.lua +++ b/test/replication/on_replace.lua @@ -7,7 +7,7 @@ local PASSWORD = 'somepassword' local SOCKET_DIR = require('fio').cwd() local TIMEOUT = tonumber(arg[1]) -local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 30.0 +local 
CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 local function instance_uri(instance_id) --return 'localhost:'..(3310 + instance_id) diff --git a/test/replication/on_replace.result b/test/replication/on_replace.result index 2e95b90ea..2e3df5c3f 100644 --- a/test/replication/on_replace.result +++ b/test/replication/on_replace.result @@ -1,13 +1,13 @@ -- -- Check that replication applier invokes on_replace triggers -- -env = require('test_run') +test_run = require('test_run').new() --- ... -test_run = env.new() +fiber = require('fiber') --- ... -fiber = require('fiber') +replica_set = require('fast_replica') --- ... _ = box.schema.space.create('test') @@ -19,15 +19,14 @@ _ = box.space.test:create_index('primary') box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'on_replace') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server on_replace") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch on_replace") --- - true ... @@ -59,7 +58,7 @@ box.space.test:insert{2} --- - [2] ... -test_run:cmd("switch replica") +test_run:cmd("switch on_replace") --- - true ... @@ -83,15 +82,7 @@ test_run:cmd("switch default") -- -- cleanup -- -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'on_replace') --- - true ... @@ -108,7 +99,7 @@ box.schema.user.revoke('guest', 'replication') SERVERS = { 'on_replace1', 'on_replace2' } --- ... -test_run:create_cluster(SERVERS, "replication", {args="0.2"}) +test_run:create_cluster(SERVERS, "replication", {args="0.1"}) --- ... 
test_run:wait_fullmesh(SERVERS) diff --git a/test/replication/on_replace.test.lua b/test/replication/on_replace.test.lua index e34832103..d427cc766 100644 --- a/test/replication/on_replace.test.lua +++ b/test/replication/on_replace.test.lua @@ -2,17 +2,17 @@ -- Check that replication applier invokes on_replace triggers -- -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() fiber = require('fiber') +replica_set = require('fast_replica') _ = box.schema.space.create('test') _ = box.space.test:create_index('primary') box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") -test_run:cmd("switch replica") +replica_set.create(test_run, 'on_replace') +test_run:cmd("start server on_replace") +test_run:cmd("switch on_replace") session_type = nil -- -- gh-2642: box.session.type() in replication applier @@ -25,7 +25,7 @@ box.space.test:insert{1} session_type test_run:cmd("switch default") box.space.test:insert{2} -test_run:cmd("switch replica") +test_run:cmd("switch on_replace") fiber = require('fiber') while box.space.test:count() < 2 do fiber.sleep(0.01) end -- @@ -36,18 +36,15 @@ test_run:cmd("switch default") -- -- cleanup -- -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'on_replace') test_run:cleanup_cluster() box.space.test:drop() box.schema.user.revoke('guest', 'replication') - -- gh-2682 on_replace on slave server with data change SERVERS = { 'on_replace1', 'on_replace2' } -test_run:create_cluster(SERVERS, "replication", {args="0.2"}) +test_run:create_cluster(SERVERS, "replication", {args="0.1"}) test_run:wait_fullmesh(SERVERS) test_run:cmd('switch on_replace1') diff --git a/test/replication/prune.result b/test/replication/prune.result index 1a130df40..07b0e8c04 100644 --- a/test/replication/prune.result +++ 
b/test/replication/prune.result @@ -7,10 +7,10 @@ print 'gh-806: cant prune old replicas by deleting their server ids' print '-------------------------------------------------------------' --- ... -env = require('test_run') +test_name = 'tp1_prune' --- ... -test_run = env.new() +test_run = require('test_run').new() --- ... engine = test_run:get_cfg('engine') @@ -43,7 +43,7 @@ for i=1,10 do space:insert{i, 'test'} end --- ... -- create max number of replicas and check -replica_set.join(test_run, box.schema.REPLICA_MAX - 2) +replica_set.join(test_run, test_name, box.schema.REPLICA_MAX - 2) --- ... while box.space._cluster:len() ~= box.schema.REPLICA_MAX - 1 do fiber.sleep(0.001) end @@ -62,7 +62,7 @@ box.space._cluster:insert{box.schema.REPLICA_MAX, uuid.str()} - error: 'Replica count limit reached: 32' ... -- Delete all replication nodes -replica_set.drop_all(test_run) +replica_set.prune_all(test_run, test_name) --- ... box.space._cluster:len() == 1 @@ -76,6 +76,9 @@ box.snapshot() ... -- Master is not crashed then recovering xlog with {replica_id: 0} in header test_run:cmd('restart server default') +test_name = 'tp2_prune' +--- +... replica_set = require('fast_replica') --- ... @@ -83,14 +86,14 @@ fiber = require('fiber') --- ... -- Rejoin replica and check -replica_set.join(test_run, 1) +replica_set.join(test_run, test_name, 1) --- ... while box.space._cluster:len() ~= 2 do fiber.sleep(0.001) end --- ... -- Check server ids -test_run:cmd('eval replica1 "return box.info.id"') +test_run:cmd('eval '..test_name..' "return box.info.id"') --- - [2] ... @@ -99,22 +102,23 @@ box.space._cluster:len() == 2 - true ... -- Cleanup -replica_set.drop_all(test_run) +replica_set.prune(test_run, test_name) --- +- true ... box.space._cluster:len() == 1 --- - true ... -- delete replica from master -replica_set.join(test_run, 1) +replica_set.join(test_run, test_name, 1) --- ... while box.space._cluster:len() ~= 2 do fiber.sleep(0.001) end --- ... 
-- Check server ids -test_run:cmd('eval replica1 "return box.info.id"') +test_run:cmd('eval '..test_name..' "return box.info.id"') --- - [2] ... @@ -122,18 +126,19 @@ box.space._cluster:len() == 2 --- - true ... -replica_set.unregister(test_run, 2) +replica_set.unregister(test_run) --- +- true ... -while test_run:cmd('eval replica1 "box.info.replication[1].upstream.status"')[1] ~= 'stopped' do fiber.sleep(0.001) end +while test_run:cmd('eval '..test_name..' "box.info.replication[1].upstream.status"')[1] ~= 'stopped' do fiber.sleep(0.001) end --- ... -test_run:cmd('eval replica1 "box.info.replication[1].upstream.message"') +test_run:cmd('eval '..test_name..' "box.info.replication[1].upstream.message"') --- - ['The local instance id 2 is read-only'] ... -- restart replica and check that replica isn't able to join to cluster -test_run:cmd('restart server replica1') +test_run:cmd('restart server '..test_name) --- - true ... @@ -145,16 +150,17 @@ box.space._cluster:len() == 1 --- - true ... -test_run:cmd('eval replica1 "box.info.replication[1].upstream.status"') +test_run:cmd('eval '..test_name..' "box.info.replication[1].upstream.status"') --- - ['stopped'] ... -test_run:cmd('eval replica1 "box.info.replication[1].upstream.message"')[1]:match("is not registered with replica set") ~= nil +test_run:cmd('eval '..test_name..' "box.info.replication[1].upstream.message"')[1]:match("is not registered with replica set") ~= nil --- - true ... -replica_set.delete(test_run, 2) +replica_set.drop(test_run, test_name) --- +- true ... 
box.space.test:drop() --- diff --git a/test/replication/prune.test.lua b/test/replication/prune.test.lua index 80847325b..d78596c1d 100644 --- a/test/replication/prune.test.lua +++ b/test/replication/prune.test.lua @@ -2,8 +2,8 @@ print '-------------------------------------------------------------' print 'gh-806: cant prune old replicas by deleting their server ids' print '-------------------------------------------------------------' -env = require('test_run') -test_run = env.new() +test_name = 'tp1_prune' +test_run = require('test_run').new() engine = test_run:get_cfg('engine') replica_set = require('fast_replica') fiber = require('fiber') @@ -20,7 +20,7 @@ index = box.space.test:create_index('primary') for i=1,10 do space:insert{i, 'test'} end -- create max number of replicas and check -replica_set.join(test_run, box.schema.REPLICA_MAX - 2) +replica_set.join(test_run, test_name, box.schema.REPLICA_MAX - 2) while box.space._cluster:len() ~= box.schema.REPLICA_MAX - 1 do fiber.sleep(0.001) end box.space._cluster:len() == box.schema.REPLICA_MAX - 1 @@ -30,49 +30,50 @@ uuid = require('uuid') box.space._cluster:insert{box.schema.REPLICA_MAX, uuid.str()} -- Delete all replication nodes -replica_set.drop_all(test_run) +replica_set.prune_all(test_run, test_name) box.space._cluster:len() == 1 -- Save a snapshot without removed replicas in vclock box.snapshot() + -- Master is not crashed then recovering xlog with {replica_id: 0} in header test_run:cmd('restart server default') +test_name = 'tp2_prune' replica_set = require('fast_replica') fiber = require('fiber') -- Rejoin replica and check -replica_set.join(test_run, 1) +replica_set.join(test_run, test_name, 1) while box.space._cluster:len() ~= 2 do fiber.sleep(0.001) end -- Check server ids -test_run:cmd('eval replica1 "return box.info.id"') +test_run:cmd('eval '..test_name..' 
"return box.info.id"') box.space._cluster:len() == 2 -- Cleanup -replica_set.drop_all(test_run) +replica_set.prune(test_run, test_name) box.space._cluster:len() == 1 -- delete replica from master -replica_set.join(test_run, 1) +replica_set.join(test_run, test_name, 1) while box.space._cluster:len() ~= 2 do fiber.sleep(0.001) end -- Check server ids -test_run:cmd('eval replica1 "return box.info.id"') +test_run:cmd('eval '..test_name..' "return box.info.id"') box.space._cluster:len() == 2 -replica_set.unregister(test_run, 2) +replica_set.unregister(test_run) -while test_run:cmd('eval replica1 "box.info.replication[1].upstream.status"')[1] ~= 'stopped' do fiber.sleep(0.001) end -test_run:cmd('eval replica1 "box.info.replication[1].upstream.message"') +while test_run:cmd('eval '..test_name..' "box.info.replication[1].upstream.status"')[1] ~= 'stopped' do fiber.sleep(0.001) end +test_run:cmd('eval '..test_name..' "box.info.replication[1].upstream.message"') -- restart replica and check that replica isn't able to join to cluster -test_run:cmd('restart server replica1') +test_run:cmd('restart server '..test_name) test_run:cmd('switch default') box.space._cluster:len() == 1 -test_run:cmd('eval replica1 "box.info.replication[1].upstream.status"') -test_run:cmd('eval replica1 "box.info.replication[1].upstream.message"')[1]:match("is not registered with replica set") ~= nil -replica_set.delete(test_run, 2) +test_run:cmd('eval '..test_name..' "box.info.replication[1].upstream.status"') +test_run:cmd('eval '..test_name..' 
"box.info.replication[1].upstream.message"')[1]:match("is not registered with replica set") ~= nil +replica_set.drop(test_run, test_name) box.space.test:drop() - box.schema.user.revoke('guest', 'read,write,execute', 'universe') diff --git a/test/replication/quorum.lua b/test/replication/quorum.lua index f61c8748f..82f432b2d 100644 --- a/test/replication/quorum.lua +++ b/test/replication/quorum.lua @@ -6,7 +6,7 @@ local INSTANCE_ID = string.match(arg[0], "%d") local SOCKET_DIR = require('fio').cwd() local TIMEOUT = tonumber(arg[1]) -local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 30.0 +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 local function instance_uri(instance_id) --return 'localhost:'..(3310 + instance_id) diff --git a/test/replication/quorum.result b/test/replication/quorum.result index ff5fa0150..0431bf9df 100644 --- a/test/replication/quorum.result +++ b/test/replication/quorum.result @@ -1,6 +1,9 @@ test_run = require('test_run').new() --- ... +replica_set = require('fast_replica') +--- +... SERVERS = {'quorum1', 'quorum2', 'quorum3'} --- ... @@ -28,9 +31,9 @@ test_run:cmd('switch quorum2') - true ... test_run:cmd('restart server quorum2 with args="0.1 0.5"') -box.info.status -- orphan +test_run:wait_cond(function() return box.info.status == 'orphan' end) or box.info.status --- -- orphan +- true ... box.ctl.wait_rw(0.001) -- timeout --- @@ -47,14 +50,14 @@ box.space.test:replace{100} -- error box.cfg{replication={}} --- ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... test_run:cmd('restart server quorum2 with args="0.1 0.5"') -box.info.status -- orphan +test_run:wait_cond(function() return box.info.status == 'orphan' end) or box.info.status --- -- orphan +- true ... box.ctl.wait_rw(0.001) -- timeout --- @@ -78,14 +81,14 @@ box.info.ro -- false --- - false ... 
-box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... test_run:cmd('restart server quorum2 with args="0.1 0.5"') -box.info.status -- orphan +test_run:wait_cond(function() return box.info.status == 'orphan' end) or box.info.status --- -- orphan +- true ... box.ctl.wait_rw(0.001) -- timeout --- @@ -110,20 +113,20 @@ box.info.ro -- false --- - false ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... -- Check that the replica follows all masters. -box.info.id == 1 or box.info.replication[1].upstream.status == 'follow' +test_run:wait_cond(function() return box.info.id == 1 or box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- - true ... -box.info.id == 2 or box.info.replication[2].upstream.status == 'follow' +test_run:wait_cond(function() return box.info.id == 2 or box.info.replication[2].upstream.status == 'follow' end) or box.info.replication[2].upstream.status --- - true ... -box.info.id == 3 or box.info.replication[3].upstream.status == 'follow' +test_run:wait_cond(function() return box.info.id == 3 or box.info.replication[3].upstream.status == 'follow' end) or box.info.replication[3].upstream.status --- - true ... @@ -158,7 +161,7 @@ fiber = require('fiber') fiber.sleep(0.1) --- ... -test_run:cmd('start server quorum1 with args="0.1 0.5"') +test_run:cmd('start server quorum1 with args="0.1 0.5"') --- - true ... @@ -224,13 +227,6 @@ test_run:cmd('switch quorum3') fiber = require('fiber') --- ... -while box.info.replication[4].upstream.status ~= 'follow' do fiber.sleep(0.001) end ---- -... -box.info.replication[4].upstream.status ---- -- follow -... -- Cleanup. test_run:cmd('switch default') --- @@ -256,21 +252,20 @@ space:insert{1} --- - [1] ... 
-test_run:cmd("create server replica with rpl_master=default, script='replication/replica_no_quorum.lua'") +replica_set.create(test_run, 'quorum_gh3278', 'replica_no_quorum') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server quorum_gh3278") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch quorum_gh3278") --- - true ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... box.space.test:select() --- @@ -280,7 +275,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server quorum_gh3278") --- - true ... @@ -290,17 +285,17 @@ listen = box.cfg.listen box.cfg{listen = ''} --- ... -test_run:cmd("start server replica") +test_run:cmd("start server quorum_gh3278") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch quorum_gh3278") --- - true ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... test_run:cmd("switch default") --- @@ -317,16 +312,16 @@ space:insert{2} vclock = test_run:get_vclock("default") --- ... -_ = test_run:wait_vclock("replica", vclock) +_ = test_run:wait_vclock("quorum_gh3278", vclock) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch quorum_gh3278") --- - true ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... box.space.test:select() --- @@ -337,11 +332,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") +replica_set.hibernate(test_run, 'quorum_gh3278') --- - true ... @@ -406,7 +397,7 @@ test_run:drop_cluster(SERVERS) box.schema.user.grant('guest', 'replication') --- ... 
-test_run:cmd('create server replica_quorum with script="replication/replica_quorum.lua"') +test_run:cmd('create server quorum with script="replication/replica_quorum.lua"') --- - true ... @@ -414,11 +405,11 @@ test_run:cmd('create server replica_quorum with script="replication/replica_quor -- replication_connect_timeout. -- If replication_connect_quorum was ignored here, the instance -- would exit with an error. -test_run:cmd('start server replica_quorum with wait=True, wait_load=True, args="1 0.05 0.1"') +test_run:cmd('start server quorum with wait=True, wait_load=True, args="1 0.05 0.1"') --- - true ... -test_run:cmd('switch replica_quorum') +test_run:cmd('switch quorum') --- - true ... @@ -435,15 +426,7 @@ test_run:cmd('switch default') --- - true ... -test_run:cmd('stop server replica_quorum') ---- -- true -... -test_run:cmd('cleanup server replica_quorum') ---- -- true -... -test_run:cmd('delete server replica_quorum') +replica_set.drop(test_run, 'quorum') --- - true ... diff --git a/test/replication/quorum.test.lua b/test/replication/quorum.test.lua index 98febb367..5f01bd604 100644 --- a/test/replication/quorum.test.lua +++ b/test/replication/quorum.test.lua @@ -1,4 +1,5 @@ test_run = require('test_run').new() +replica_set = require('fast_replica') SERVERS = {'quorum1', 'quorum2', 'quorum3'} @@ -19,37 +20,37 @@ test_run:cmd('stop server quorum1') test_run:cmd('switch quorum2') test_run:cmd('restart server quorum2 with args="0.1 0.5"') -box.info.status -- orphan +test_run:wait_cond(function() return box.info.status == 'orphan' end) or box.info.status box.ctl.wait_rw(0.001) -- timeout box.info.ro -- true box.space.test:replace{100} -- error box.cfg{replication={}} -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status test_run:cmd('restart server quorum2 with args="0.1 0.5"') -box.info.status -- orphan +test_run:wait_cond(function() return box.info.status == 'orphan' end) or box.info.status 
box.ctl.wait_rw(0.001) -- timeout box.info.ro -- true box.space.test:replace{100} -- error box.cfg{replication_connect_quorum = 2} box.ctl.wait_rw() box.info.ro -- false -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status test_run:cmd('restart server quorum2 with args="0.1 0.5"') -box.info.status -- orphan +test_run:wait_cond(function() return box.info.status == 'orphan' end) or box.info.status box.ctl.wait_rw(0.001) -- timeout box.info.ro -- true box.space.test:replace{100} -- error test_run:cmd('start server quorum1 with args="0.1 0.5"') box.ctl.wait_rw() box.info.ro -- false -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status -- Check that the replica follows all masters. -box.info.id == 1 or box.info.replication[1].upstream.status == 'follow' -box.info.id == 2 or box.info.replication[2].upstream.status == 'follow' -box.info.id == 3 or box.info.replication[3].upstream.status == 'follow' +test_run:wait_cond(function() return box.info.id == 1 or box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.info.id == 2 or box.info.replication[2].upstream.status == 'follow' end) or box.info.replication[2].upstream.status +test_run:wait_cond(function() return box.info.id == 3 or box.info.replication[3].upstream.status == 'follow' end) or box.info.replication[3].upstream.status -- Check that box.cfg() doesn't return until the instance -- catches up with all configured replicas. 
@@ -63,7 +64,7 @@ for i = 1, 100 do box.space.test:insert{i} end fiber = require('fiber') fiber.sleep(0.1) -test_run:cmd('start server quorum1 with args="0.1 0.5"') +test_run:cmd('start server quorum1 with args="0.1 0.5"') test_run:cmd('switch quorum1') box.space.test:count() -- 100 @@ -91,8 +92,6 @@ while box.info.replication[4].upstream.status ~= 'follow' do fiber.sleep(0.001) box.info.replication[4].upstream.status test_run:cmd('switch quorum3') fiber = require('fiber') -while box.info.replication[4].upstream.status ~= 'follow' do fiber.sleep(0.001) end -box.info.replication[4].upstream.status -- Cleanup. test_run:cmd('switch default') @@ -107,30 +106,29 @@ space = box.schema.space.create('test', {engine = test_run:get_cfg('engine')}); index = box.space.test:create_index('primary') -- Insert something just to check that replica with quorum = 0 works as expected. space:insert{1} -test_run:cmd("create server replica with rpl_master=default, script='replication/replica_no_quorum.lua'") -test_run:cmd("start server replica") -test_run:cmd("switch replica") -box.info.status -- running +replica_set.create(test_run, 'quorum_gh3278', 'replica_no_quorum') +test_run:cmd("start server quorum_gh3278") +test_run:cmd("switch quorum_gh3278") +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status box.space.test:select() test_run:cmd("switch default") -test_run:cmd("stop server replica") +test_run:cmd("stop server quorum_gh3278") listen = box.cfg.listen box.cfg{listen = ''} -test_run:cmd("start server replica") -test_run:cmd("switch replica") -box.info.status -- running +test_run:cmd("start server quorum_gh3278") +test_run:cmd("switch quorum_gh3278") +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status test_run:cmd("switch default") -- Check that replica is able to reconnect, case was broken with earlier quorum "fix". 
box.cfg{listen = listen} space:insert{2} vclock = test_run:get_vclock("default") -_ = test_run:wait_vclock("replica", vclock) -test_run:cmd("switch replica") -box.info.status -- running +_ = test_run:wait_vclock("quorum_gh3278", vclock) +test_run:cmd("switch quorum_gh3278") +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status box.space.test:select() test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") +replica_set.hibernate(test_run, 'quorum_gh3278') space:drop() box.schema.user.revoke('guest', 'replication') -- Second case, check that master-master works. @@ -154,20 +152,18 @@ test_run:drop_cluster(SERVERS) -- Test that quorum is not ignored neither during bootstrap, nor -- during reconfiguration. box.schema.user.grant('guest', 'replication') -test_run:cmd('create server replica_quorum with script="replication/replica_quorum.lua"') +test_run:cmd('create server quorum with script="replication/replica_quorum.lua"') -- Arguments are: replication_connect_quorum, replication_timeout -- replication_connect_timeout. -- If replication_connect_quorum was ignored here, the instance -- would exit with an error. -test_run:cmd('start server replica_quorum with wait=True, wait_load=True, args="1 0.05 0.1"') -test_run:cmd('switch replica_quorum') +test_run:cmd('start server quorum with wait=True, wait_load=True, args="1 0.05 0.1"') +test_run:cmd('switch quorum') -- If replication_connect_quorum was ignored here, the instance -- would exit with an error. 
box.cfg{replication={INSTANCE_URI, nonexistent_uri(1)}} box.info.id test_run:cmd('switch default') -test_run:cmd('stop server replica_quorum') -test_run:cmd('cleanup server replica_quorum') -test_run:cmd('delete server replica_quorum') +replica_set.drop(test_run, 'quorum') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/rebootstrap.lua b/test/replication/rebootstrap.lua index 3e7d8f062..472dbe453 100644 --- a/test/replication/rebootstrap.lua +++ b/test/replication/rebootstrap.lua @@ -6,7 +6,7 @@ local INSTANCE_ID = string.match(arg[0], "%d") local SOCKET_DIR = require('fio').cwd() local TIMEOUT = tonumber(arg[1]) -local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 30.0 +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 local function instance_uri(instance_id) return SOCKET_DIR..'/rebootstrap'..instance_id..'.sock'; diff --git a/test/replication/rebootstrap.result b/test/replication/rebootstrap.result index ea390c19f..fccf16533 100644 --- a/test/replication/rebootstrap.result +++ b/test/replication/rebootstrap.result @@ -20,11 +20,11 @@ test_run:cmd('stop server rebootstrap1') --- - true ... -test_run:cmd('restart server rebootstrap2 with cleanup=True, wait=False, wait_load=False, args="0.1 2.0"') +test_run:cmd('restart server rebootstrap2 with cleanup=True, wait=False, wait_load=False, args="0.1"') --- - true ... -test_run:cmd('start server rebootstrap1 with args="0.1 0.5"') +test_run:cmd('start server rebootstrap1 with args="0.1"') --- - true ... @@ -32,9 +32,9 @@ test_run:cmd('switch rebootstrap1') --- - true ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... 
test_run:cmd('switch default') --- diff --git a/test/replication/rebootstrap.test.lua b/test/replication/rebootstrap.test.lua index 8ddf77912..1af8cbc10 100644 --- a/test/replication/rebootstrap.test.lua +++ b/test/replication/rebootstrap.test.lua @@ -12,10 +12,10 @@ test_run:wait_fullmesh(SERVERS) -- in 'orphan' mode. -- test_run:cmd('stop server rebootstrap1') -test_run:cmd('restart server rebootstrap2 with cleanup=True, wait=False, wait_load=False, args="0.1 2.0"') -test_run:cmd('start server rebootstrap1 with args="0.1 0.5"') +test_run:cmd('restart server rebootstrap2 with cleanup=True, wait=False, wait_load=False, args="0.1"') +test_run:cmd('start server rebootstrap1 with args="0.1"') test_run:cmd('switch rebootstrap1') -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status test_run:cmd('switch default') test_run:drop_cluster(SERVERS) diff --git a/test/replication/recover_missing_xlog.lua b/test/replication/recover_missing_xlog.lua new file mode 100644 index 000000000..5d3eeddee --- /dev/null +++ b/test/replication/recover_missing_xlog.lua @@ -0,0 +1,37 @@ +#!/usr/bin/env tarantool + +-- get instance name from filename (recover_missing_xlog1.lua => recover_missing_xlog1) +local INSTANCE_ID = string.match(arg[0], "%d") +local USER = 'cluster' +local PASSWORD = 'somepassword' +local SOCKET_DIR = require('fio').cwd() +local TIMEOUT = tonumber(arg[1]) +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 + +local function instance_uri(instance_id) + --return 'localhost:'..(3310 + instance_id) + return SOCKET_DIR..'/recover_missing_xlog'..instance_id..'.sock'; +end + +-- start console first +require('console').listen(os.getenv('ADMIN')) + +box.cfg({ + listen = instance_uri(INSTANCE_ID); +-- log_level = 7; + replication = { + USER..':'..PASSWORD..'@'..instance_uri(1); + USER..':'..PASSWORD..'@'..instance_uri(2); + USER..':'..PASSWORD..'@'..instance_uri(3); + }; + replication_timeout = TIMEOUT; + 
replication_connect_timeout = CON_TIMEOUT; +}) + +box.once("bootstrap", function() + local test_run = require('test_run').new() + box.schema.user.create(USER, { password = PASSWORD }) + box.schema.user.grant(USER, 'replication') + box.schema.space.create('test', {engine = test_run:get_cfg('engine')}) + box.space.test:create_index('primary') +end) diff --git a/test/replication/recover_missing_xlog.result b/test/replication/recover_missing_xlog.result index ef4c29e50..667700284 100644 --- a/test/replication/recover_missing_xlog.result +++ b/test/replication/recover_missing_xlog.result @@ -4,7 +4,7 @@ env = require('test_run') test_run = env.new() --- ... -SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +SERVERS = { 'recover_missing_xlog1', 'recover_missing_xlog2', 'recover_missing_xlog3' } --- ... -- Start servers @@ -15,7 +15,7 @@ test_run:create_cluster(SERVERS, "replication", {args="0.1"}) test_run:wait_fullmesh(SERVERS) --- ... -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch recover_missing_xlog1") --- - true ... @@ -30,13 +30,13 @@ test_run:cmd('switch default') --- - true ... -vclock1 = test_run:get_vclock('autobootstrap1') +vclock1 = test_run:get_vclock('recover_missing_xlog1') --- ... vclock2 = test_run:wait_cluster_vclock(SERVERS, vclock1) --- ... -test_run:cmd("switch autobootstrap2") +test_run:cmd("switch recover_missing_xlog2") --- - true ... @@ -48,7 +48,7 @@ box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.01) --- - ok ... -test_run:cmd("stop server autobootstrap1") +test_run:cmd("stop server recover_missing_xlog1") --- - true ... @@ -62,18 +62,18 @@ fio = require('fio') -- all missing data from replica. -- Also check that there is no concurrency, i.e. master is -- in 'read-only' mode unless it receives all data. -list = fio.glob(fio.pathjoin(fio.abspath("."), 'autobootstrap1/*.xlog')) +list = fio.glob(fio.pathjoin(fio.abspath("."), 'recover_missing_xlog1/*.xlog')) --- ... fio.unlink(list[#list]) --- - true ... 
-test_run:cmd('start server autobootstrap1 with args="0.1 0.5"') +test_run:cmd('start server recover_missing_xlog1 with args="0.1 0.5"') --- - true ... -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch recover_missing_xlog1") --- - true ... diff --git a/test/replication/recover_missing_xlog.test.lua b/test/replication/recover_missing_xlog.test.lua index 2cd73520f..4e131293c 100644 --- a/test/replication/recover_missing_xlog.test.lua +++ b/test/replication/recover_missing_xlog.test.lua @@ -1,24 +1,24 @@ env = require('test_run') test_run = env.new() -SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +SERVERS = { 'recover_missing_xlog1', 'recover_missing_xlog2', 'recover_missing_xlog3' } -- Start servers test_run:create_cluster(SERVERS, "replication", {args="0.1"}) -- Wait for full mesh test_run:wait_fullmesh(SERVERS) -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch recover_missing_xlog1") for i = 0, 9 do box.space.test:insert{i, 'test' .. i} end box.space.test:count() test_run:cmd('switch default') -vclock1 = test_run:get_vclock('autobootstrap1') +vclock1 = test_run:get_vclock('recover_missing_xlog1') vclock2 = test_run:wait_cluster_vclock(SERVERS, vclock1) -test_run:cmd("switch autobootstrap2") +test_run:cmd("switch recover_missing_xlog2") box.space.test:count() box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.01) -test_run:cmd("stop server autobootstrap1") +test_run:cmd("stop server recover_missing_xlog1") fio = require('fio') -- This test checks ability to recover missing local data -- from remote replica. See #3210. @@ -27,11 +27,11 @@ fio = require('fio') -- all missing data from replica. -- Also check that there is no concurrency, i.e. master is -- in 'read-only' mode unless it receives all data. 
-list = fio.glob(fio.pathjoin(fio.abspath("."), 'autobootstrap1/*.xlog')) +list = fio.glob(fio.pathjoin(fio.abspath("."), 'recover_missing_xlog1/*.xlog')) fio.unlink(list[#list]) -test_run:cmd('start server autobootstrap1 with args="0.1 0.5"') +test_run:cmd('start server recover_missing_xlog1 with args="0.1 0.5"') -test_run:cmd("switch autobootstrap1") +test_run:cmd("switch recover_missing_xlog1") for i = 10, 19 do box.space.test:insert{i, 'test' .. i} end fiber = require('fiber') box.space.test:select() diff --git a/test/replication/recover_missing_xlog1.lua b/test/replication/recover_missing_xlog1.lua new file mode 120000 index 000000000..4102da849 --- /dev/null +++ b/test/replication/recover_missing_xlog1.lua @@ -0,0 +1 @@ +recover_missing_xlog.lua \ No newline at end of file diff --git a/test/replication/recover_missing_xlog2.lua b/test/replication/recover_missing_xlog2.lua new file mode 120000 index 000000000..4102da849 --- /dev/null +++ b/test/replication/recover_missing_xlog2.lua @@ -0,0 +1 @@ +recover_missing_xlog.lua \ No newline at end of file diff --git a/test/replication/recover_missing_xlog3.lua b/test/replication/recover_missing_xlog3.lua new file mode 120000 index 000000000..4102da849 --- /dev/null +++ b/test/replication/recover_missing_xlog3.lua @@ -0,0 +1 @@ +recover_missing_xlog.lua \ No newline at end of file diff --git a/test/replication/replica.lua b/test/replication/replica.lua index 20ac064e1..855cf9f50 100644 --- a/test/replication/replica.lua +++ b/test/replication/replica.lua @@ -4,8 +4,8 @@ box.cfg({ listen = os.getenv("LISTEN"), replication = os.getenv("MASTER"), memtx_memory = 107374182, - replication_timeout = 0.1, - replication_connect_timeout = 0.5, + replication_timeout = 1, + replication_connect_timeout = 100, }) require('console').listen(os.getenv('ADMIN')) diff --git a/test/replication/replica_auth.lua b/test/replication/replica_auth.lua index 22ba9146c..306ef578b 100644 --- a/test/replication/replica_auth.lua +++ 
b/test/replication/replica_auth.lua @@ -2,7 +2,7 @@ local USER_PASS = arg[1] local TIMEOUT = arg[2] and tonumber(arg[2]) or 0.1 -local CON_TIMEOUT = arg[3] and tonumber(arg[3]) or 30.0 +local CON_TIMEOUT = arg[3] and tonumber(arg[3]) or 60.0 require('console').listen(os.getenv('ADMIN')) diff --git a/test/replication/replica_quorum.lua b/test/replication/replica_quorum.lua index dd42b8214..518c30181 100644 --- a/test/replication/replica_quorum.lua +++ b/test/replication/replica_quorum.lua @@ -4,7 +4,7 @@ local SOCKET_DIR = require('fio').cwd() local QUORUM = tonumber(arg[1]) local TIMEOUT = arg[2] and tonumber(arg[2]) or 0.1 -local CON_TIMEOUT = arg[3] and tonumber(arg[3]) or 30.0 +local CON_TIMEOUT = arg[3] and tonumber(arg[3]) or 60.0 INSTANCE_URI = SOCKET_DIR .. '/replica_quorum.sock' function nonexistent_uri(id) diff --git a/test/replication/replica_rejoin.result b/test/replication/replica_rejoin.result index 87d626e20..7eedfb4e1 100644 --- a/test/replication/replica_rejoin.result +++ b/test/replication/replica_rejoin.result @@ -1,10 +1,10 @@ -env = require('test_run') +test_run = require('test_run').new() --- ... -test_run = env.new() +engine = test_run:get_cfg('engine') --- ... -engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') --- ... test_run:cleanup_cluster() @@ -33,19 +33,18 @@ _ = box.space.test:insert{3} --- ... -- Join a replica, then stop it. -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'replica_rejoin') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server replica_rejoin") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") --- - true ... -box.info.replication[1].upstream.status == 'follow' or box.info +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- - true ... 
@@ -59,7 +58,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server replica_rejoin") --- - true ... @@ -102,16 +101,16 @@ _ = box.space.test:insert{30} fio = require('fio') --- ... -#fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) -- 1 +test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') --- -- 1 +- true ... box.cfg{checkpoint_count = checkpoint_count} --- ... -- Restart the replica. Since xlogs have been removed, -- it is supposed to rejoin without changing id. -test_run:cmd("start server replica") +test_run:cmd("start server replica_rejoin") --- - true ... @@ -119,11 +118,11 @@ box.info.replication[2].downstream.vclock ~= nil or box.info --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") --- - true ... -box.info.replication[1].upstream.status == 'follow' or box.info +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- - true ... @@ -144,10 +143,10 @@ for i = 10, 30, 10 do box.space.test:update(i, {{'!', 1, i}}) end vclock = test_run:get_vclock('default') --- ... -_ = test_run:wait_vclock('replica', vclock) +_ = test_run:wait_vclock('replica_rejoin', vclock) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") --- - true ... @@ -158,7 +157,7 @@ box.space.test:select() - [30, 30] ... -- Check that restart works as usual. -test_run:cmd("restart server replica") +test_run:cmd("restart server replica_rejoin") box.info.replication[1].upstream.status == 'follow' or box.info --- - true @@ -179,7 +178,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server replica_rejoin") --- - true ... @@ -203,18 +202,18 @@ for i = 1, 3 do box.space.test:insert{i * 100} end fio = require('fio') --- ... 
-#fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) -- 1 +test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') --- -- 1 +- true ... box.cfg{checkpoint_count = checkpoint_count} --- ... -test_run:cmd("start server replica") +test_run:cmd("start server replica_rejoin") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") --- - true ... @@ -238,11 +237,11 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server replica_rejoin") --- - true ... -test_run:cmd("cleanup server replica") +test_run:cmd("cleanup server replica_rejoin") --- - true ... @@ -252,12 +251,12 @@ test_run:cleanup_cluster() box.space.test:truncate() --- ... -test_run:cmd("start server replica") +test_run:cmd("start server replica_rejoin") --- - true ... -- Subscribe the master to the replica. -replica_listen = test_run:cmd("eval replica 'return box.cfg.listen'") +replica_listen = test_run:cmd("eval replica_rejoin 'return box.cfg.listen'") --- ... replica_listen ~= nil @@ -268,7 +267,7 @@ box.cfg{replication = replica_listen} --- ... -- Unsubscribe the replica from the master. -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") --- - true ... @@ -285,14 +284,14 @@ box.space.test:replace{1} - [1] ... -- Bump vclock on the replica. -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") --- - true ... for i = 1, 10 do box.space.test:replace{2} end --- ... -vclock = test_run:get_vclock('replica') +vclock = test_run:get_vclock('replica_rejoin') --- ... _ = test_run:wait_vclock('default', vclock) @@ -304,7 +303,7 @@ test_run:cmd("switch default") - true ... test_run:cmd("restart server default") -replica_listen = test_run:cmd("eval replica 'return box.cfg.listen'") +replica_listen = test_run:cmd("eval replica_rejoin 'return box.cfg.listen'") --- ... 
replica_listen ~= nil @@ -330,26 +329,26 @@ box.cfg{checkpoint_count = default_checkpoint_count} fio = require('fio') --- ... -#fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 +test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') --- - true ... -- Bump vclock on the replica again. -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") --- - true ... for i = 1, 10 do box.space.test:replace{2} end --- ... -vclock = test_run:get_vclock('replica') +vclock = test_run:get_vclock('replica_rejoin') --- ... _ = test_run:wait_vclock('default', vclock) --- ... -- Restart the replica. It should successfully rebootstrap. -test_run:cmd("restart server replica") +test_run:cmd("restart server replica_rejoin") box.space.test:select() --- - - [1] @@ -371,15 +370,15 @@ test_run:cmd("switch default") box.cfg{replication = ''} --- ... -test_run:cmd("stop server replica") +test_run:cmd("stop server replica_rejoin") --- - true ... -test_run:cmd("cleanup server replica") +test_run:cmd("cleanup server replica_rejoin") --- - true ... -test_run:cmd("delete server replica") +test_run:cmd("delete server replica_rejoin") --- - true ... diff --git a/test/replication/replica_rejoin.test.lua b/test/replication/replica_rejoin.test.lua index 9bf43eff8..588807e2f 100644 --- a/test/replication/replica_rejoin.test.lua +++ b/test/replication/replica_rejoin.test.lua @@ -1,6 +1,6 @@ -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') test_run:cleanup_cluster() @@ -16,13 +16,13 @@ _ = box.space.test:insert{2} _ = box.space.test:insert{3} -- Join a replica, then stop it. 
-test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") -test_run:cmd("switch replica") -box.info.replication[1].upstream.status == 'follow' or box.info +replica_set.create(test_run, 'replica_rejoin') +test_run:cmd("start server replica_rejoin") +test_run:cmd("switch replica_rejoin") +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status box.space.test:select() test_run:cmd("switch default") -test_run:cmd("stop server replica") +test_run:cmd("stop server replica_rejoin") -- Restart the server to purge the replica from -- the garbage collection state. @@ -40,27 +40,27 @@ box.snapshot() _ = box.space.test:delete{3} _ = box.space.test:insert{30} fio = require('fio') -#fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) -- 1 +test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') box.cfg{checkpoint_count = checkpoint_count} -- Restart the replica. Since xlogs have been removed, -- it is supposed to rejoin without changing id. -test_run:cmd("start server replica") +test_run:cmd("start server replica_rejoin") box.info.replication[2].downstream.vclock ~= nil or box.info -test_run:cmd("switch replica") -box.info.replication[1].upstream.status == 'follow' or box.info +test_run:cmd("switch replica_rejoin") +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status box.space.test:select() test_run:cmd("switch default") -- Make sure the replica follows new changes. 
for i = 10, 30, 10 do box.space.test:update(i, {{'!', 1, i}}) end vclock = test_run:get_vclock('default') -_ = test_run:wait_vclock('replica', vclock) -test_run:cmd("switch replica") +_ = test_run:wait_vclock('replica_rejoin', vclock) +test_run:cmd("switch replica_rejoin") box.space.test:select() -- Check that restart works as usual. -test_run:cmd("restart server replica") +test_run:cmd("restart server replica_rejoin") box.info.replication[1].upstream.status == 'follow' or box.info box.space.test:select() @@ -68,7 +68,7 @@ box.space.test:select() -- is strictly behind the master. box.space.test:replace{1, 2, 3} -- bumps LSN on the replica test_run:cmd("switch default") -test_run:cmd("stop server replica") +test_run:cmd("stop server replica_rejoin") test_run:cmd("restart server default") checkpoint_count = box.cfg.checkpoint_count box.cfg{checkpoint_count = 1} @@ -76,10 +76,10 @@ for i = 1, 3 do box.space.test:delete{i * 10} end box.snapshot() for i = 1, 3 do box.space.test:insert{i * 100} end fio = require('fio') -#fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) -- 1 +test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') box.cfg{checkpoint_count = checkpoint_count} -test_run:cmd("start server replica") -test_run:cmd("switch replica") +test_run:cmd("start server replica_rejoin") +test_run:cmd("switch replica_rejoin") box.info.status -- orphan box.space.test:select() @@ -90,30 +90,30 @@ box.space.test:select() -- Bootstrap a new replica. test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") +test_run:cmd("stop server replica_rejoin") +test_run:cmd("cleanup server replica_rejoin") test_run:cleanup_cluster() box.space.test:truncate() -test_run:cmd("start server replica") +test_run:cmd("start server replica_rejoin") -- Subscribe the master to the replica. 
-replica_listen = test_run:cmd("eval replica 'return box.cfg.listen'") +replica_listen = test_run:cmd("eval replica_rejoin 'return box.cfg.listen'") replica_listen ~= nil box.cfg{replication = replica_listen} -- Unsubscribe the replica from the master. -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") box.cfg{replication = ''} -- Bump vclock on the master. test_run:cmd("switch default") box.space.test:replace{1} -- Bump vclock on the replica. -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") for i = 1, 10 do box.space.test:replace{2} end -vclock = test_run:get_vclock('replica') +vclock = test_run:get_vclock('replica_rejoin') _ = test_run:wait_vclock('default', vclock) -- Restart the master and force garbage collection. test_run:cmd("switch default") test_run:cmd("restart server default") -replica_listen = test_run:cmd("eval replica 'return box.cfg.listen'") +replica_listen = test_run:cmd("eval replica_rejoin 'return box.cfg.listen'") replica_listen ~= nil box.cfg{replication = replica_listen} default_checkpoint_count = box.cfg.checkpoint_count @@ -121,14 +121,14 @@ box.cfg{checkpoint_count = 1} box.snapshot() box.cfg{checkpoint_count = default_checkpoint_count} fio = require('fio') -#fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 +test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog') -- Bump vclock on the replica again. -test_run:cmd("switch replica") +test_run:cmd("switch replica_rejoin") for i = 1, 10 do box.space.test:replace{2} end -vclock = test_run:get_vclock('replica') +vclock = test_run:get_vclock('replica_rejoin') _ = test_run:wait_vclock('default', vclock) -- Restart the replica. It should successfully rebootstrap. -test_run:cmd("restart server replica") +test_run:cmd("restart server replica_rejoin") box.space.test:select() box.snapshot() box.space.test:replace{2} @@ -136,9 +136,9 @@ box.space.test:replace{2} -- Cleanup. 
test_run:cmd("switch default") box.cfg{replication = ''} -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +test_run:cmd("stop server replica_rejoin") +test_run:cmd("cleanup server replica_rejoin") +test_run:cmd("delete server replica_rejoin") test_run:cleanup_cluster() box.space.test:drop() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/replica_timeout.lua b/test/replication/replica_timeout.lua index 38922fa3d..a28d83393 100644 --- a/test/replication/replica_timeout.lua +++ b/test/replication/replica_timeout.lua @@ -1,7 +1,7 @@ #!/usr/bin/env tarantool local TIMEOUT = tonumber(arg[1]) -local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 30.0 +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 box.cfg({ listen = os.getenv("LISTEN"), diff --git a/test/replication/replica_uuid_ro1.lua b/test/replication/replica_uuid_ro1.lua deleted file mode 120000 index 342d71c57..000000000 --- a/test/replication/replica_uuid_ro1.lua +++ /dev/null @@ -1 +0,0 @@ -replica_uuid_ro.lua \ No newline at end of file diff --git a/test/replication/replica_uuid_ro2.lua b/test/replication/replica_uuid_ro2.lua deleted file mode 120000 index 342d71c57..000000000 --- a/test/replication/replica_uuid_ro2.lua +++ /dev/null @@ -1 +0,0 @@ -replica_uuid_ro.lua \ No newline at end of file diff --git a/test/replication/replica_uuid_ro3.lua b/test/replication/replica_uuid_ro3.lua deleted file mode 120000 index 342d71c57..000000000 --- a/test/replication/replica_uuid_ro3.lua +++ /dev/null @@ -1 +0,0 @@ -replica_uuid_ro.lua \ No newline at end of file diff --git a/test/replication/replica_uuid_ro.lua b/test/replication/replicaset_ro_mostly.lua similarity index 83% rename from test/replication/replica_uuid_ro.lua rename to test/replication/replicaset_ro_mostly.lua index d5ba55852..fccd4eb3c 100644 --- a/test/replication/replica_uuid_ro.lua +++ b/test/replication/replicaset_ro_mostly.lua @@ -1,17 +1,17 @@ 
#!/usr/bin/env tarantool --- get instance name from filename (replica_uuid_ro1.lua => replica_uuid_ro1) +-- get instance name from filename (replicaset_ro_mostly1.lua => replicaset_ro_mostly1) local INSTANCE_ID = string.match(arg[0], "%d") local USER = 'cluster' local PASSWORD = 'somepassword' local SOCKET_DIR = require('fio').cwd() local TIMEOUT = tonumber(arg[2]) -local CON_TIMEOUT = arg[3] and tonumber(arg[3]) or 30.0 +local CON_TIMEOUT = arg[3] and tonumber(arg[3]) or 60.0 local function instance_uri(instance_id) --return 'localhost:'..(3310 + instance_id) - return SOCKET_DIR..'/replica_uuid_ro'..instance_id..'.sock'; + return SOCKET_DIR..'/replicaset_ro_mostly'..instance_id..'.sock'; end -- start console first diff --git a/test/replication/replicaset_ro_mostly.result b/test/replication/replicaset_ro_mostly.result index 1ce7d6f8e..43f0b489b 100644 --- a/test/replication/replicaset_ro_mostly.result +++ b/test/replication/replicaset_ro_mostly.result @@ -3,7 +3,7 @@ test_run = require('test_run').new() --- ... -SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'} +SERVERS = {'replicaset_ro_mostly1', 'replicaset_ro_mostly2'} --- ... uuid = require('uuid') @@ -56,7 +56,7 @@ test_run:wait_fullmesh(SERVERS) --- ... -- Add third replica -name = 'replica_uuid_ro3' +name = 'replicaset_ro_mostly3' --- ... test_run:cmd(create_cluster_cmd1:format(name, name)) @@ -67,7 +67,7 @@ test_run:cmd(create_cluster_cmd2:format(name, uuid.new(), "0.1")) --- - true ... -test_run:cmd('switch replica_uuid_ro3') +test_run:cmd('switch replicaset_ro_mostly3') --- - true ... diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua index c75af7218..0627d8316 100644 --- a/test/replication/replicaset_ro_mostly.test.lua +++ b/test/replication/replicaset_ro_mostly.test.lua @@ -2,7 +2,7 @@ -- Old behaviour: failed, since read-only is chosen by uuid. 
test_run = require('test_run').new() -SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'} +SERVERS = {'replicaset_ro_mostly1', 'replicaset_ro_mostly2'} uuid = require('uuid') uuid1 = uuid.new() @@ -30,10 +30,10 @@ create_cluster_uuid(SERVERS, UUID) test_run:wait_fullmesh(SERVERS) -- Add third replica -name = 'replica_uuid_ro3' +name = 'replicaset_ro_mostly3' test_run:cmd(create_cluster_cmd1:format(name, name)) test_run:cmd(create_cluster_cmd2:format(name, uuid.new(), "0.1")) -test_run:cmd('switch replica_uuid_ro3') +test_run:cmd('switch replicaset_ro_mostly3') test_run:cmd('switch default') -- Cleanup. diff --git a/test/replication/replicaset_ro_mostly1.lua b/test/replication/replicaset_ro_mostly1.lua new file mode 120000 index 000000000..dc050de06 --- /dev/null +++ b/test/replication/replicaset_ro_mostly1.lua @@ -0,0 +1 @@ +replicaset_ro_mostly.lua \ No newline at end of file diff --git a/test/replication/replicaset_ro_mostly2.lua b/test/replication/replicaset_ro_mostly2.lua new file mode 120000 index 000000000..dc050de06 --- /dev/null +++ b/test/replication/replicaset_ro_mostly2.lua @@ -0,0 +1 @@ +replicaset_ro_mostly.lua \ No newline at end of file diff --git a/test/replication/replicaset_ro_mostly3.lua b/test/replication/replicaset_ro_mostly3.lua new file mode 120000 index 000000000..dc050de06 --- /dev/null +++ b/test/replication/replicaset_ro_mostly3.lua @@ -0,0 +1 @@ +replicaset_ro_mostly.lua \ No newline at end of file diff --git a/test/replication/show_error_on_disconnect.lua b/test/replication/show_error_on_disconnect.lua new file mode 100644 index 000000000..6aae11fcf --- /dev/null +++ b/test/replication/show_error_on_disconnect.lua @@ -0,0 +1,38 @@ +#!/usr/bin/env tarantool + +-- get instance name from filename (show_error_on_disconnect1.lua => show_error_on_disconnect) +local INSTANCE_ID = string.match(arg[0], "%d") + +local SOCKET_DIR = require('fio').cwd() + +local TIMEOUT = tonumber(arg[1]) +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or 60.0 + 
+local function instance_uri(instance_id) + --return 'localhost:'..(3310 + instance_id) + return SOCKET_DIR..'/show_error_on_disconnect'..instance_id..'.sock'; +end + +-- start console first +require('console').listen(os.getenv('ADMIN')) + +box.cfg({ + listen = instance_uri(INSTANCE_ID); +-- log_level = 7; + replication = { + instance_uri(1); + instance_uri(2); + }; + replication_connect_quorum = 0; + replication_timeout = TIMEOUT; + replication_connect_timeout = CON_TIMEOUT; +}) + +test_run = require('test_run').new() +engine = test_run:get_cfg('engine') + +box.once("bootstrap", function() + box.schema.user.grant("guest", 'replication') + box.schema.space.create('test', {engine = engine}) + box.space.test:create_index('primary') +end) diff --git a/test/replication/show_error_on_disconnect.result b/test/replication/show_error_on_disconnect.result index af082203b..ae8d6a7da 100644 --- a/test/replication/show_error_on_disconnect.result +++ b/test/replication/show_error_on_disconnect.result @@ -6,17 +6,17 @@ test_run = require('test_run').new() --- ... -SERVERS = {'master_quorum1', 'master_quorum2'} +SERVERS = {'show_error_on_disconnect1', 'show_error_on_disconnect2'} --- ... -- Deploy a cluster. -test_run:create_cluster(SERVERS) +test_run:create_cluster(SERVERS, "replication", {args="0.1"}) --- ... test_run:wait_fullmesh(SERVERS) --- ... -test_run:cmd("switch master_quorum1") +test_run:cmd("switch show_error_on_disconnect1") --- - true ... @@ -26,7 +26,7 @@ repl = box.cfg.replication box.cfg{replication = ""} --- ... -test_run:cmd("switch master_quorum2") +test_run:cmd("switch show_error_on_disconnect2") --- - true ... @@ -46,7 +46,7 @@ box.snapshot() --- - ok ... --- Manually remove all xlogs on master_quorum2 to break replication to master_quorum1. +-- Manually remove all xlogs on show_error_on_disconnect2 to break replication to show_error_on_disconnect1. fio = require('fio') --- ... @@ -58,7 +58,7 @@ box.space.test:insert{3} - [3] ... -- Check error reporting. 
-test_run:cmd("switch master_quorum1") +test_run:cmd("switch show_error_on_disconnect1") --- - true ... @@ -75,15 +75,15 @@ box.space.test:select() other_id = box.info.id % 2 + 1 --- ... -box.info.replication[other_id].upstream.status +test_run:wait_cond(function() return box.info.replication[other_id].upstream.status == 'stopped' end) or box.info.replication[other_id].upstream.status --- -- stopped +- true ... box.info.replication[other_id].upstream.message:match("Missing") --- - Missing ... -test_run:cmd("switch master_quorum2") +test_run:cmd("switch show_error_on_disconnect2") --- - true ... @@ -96,17 +96,17 @@ box.space.test:select() other_id = box.info.id % 2 + 1 --- ... -box.info.replication[other_id].upstream.status +test_run:wait_cond(function() return box.info.replication[other_id].upstream.status == 'follow' end) or box.info.replication[other_id].upstream.status --- -- follow +- true ... box.info.replication[other_id].upstream.message --- - null ... -box.info.replication[other_id].downstream.status +test_run:wait_cond(function() return box.info.replication[other_id].downstream.status == 'stopped' end) or box.info.replication[other_id].downstream.status --- -- stopped +- true ... box.info.replication[other_id].downstream.message:match("Missing") --- diff --git a/test/replication/show_error_on_disconnect.test.lua b/test/replication/show_error_on_disconnect.test.lua index 40e9dbc5e..9fad4d0f4 100644 --- a/test/replication/show_error_on_disconnect.test.lua +++ b/test/replication/show_error_on_disconnect.test.lua @@ -4,40 +4,40 @@ -- The goal here is to see same error message on both side. -- test_run = require('test_run').new() -SERVERS = {'master_quorum1', 'master_quorum2'} +SERVERS = {'show_error_on_disconnect1', 'show_error_on_disconnect2'} -- Deploy a cluster. 
-test_run:create_cluster(SERVERS) +test_run:create_cluster(SERVERS, "replication", {args="0.1"}) test_run:wait_fullmesh(SERVERS) -test_run:cmd("switch master_quorum1") +test_run:cmd("switch show_error_on_disconnect1") repl = box.cfg.replication box.cfg{replication = ""} -test_run:cmd("switch master_quorum2") +test_run:cmd("switch show_error_on_disconnect2") box.space.test:insert{1} box.snapshot() box.space.test:insert{2} box.snapshot() --- Manually remove all xlogs on master_quorum2 to break replication to master_quorum1. +-- Manually remove all xlogs on show_error_on_disconnect2 to break replication to show_error_on_disconnect1. fio = require('fio') for _, path in ipairs(fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog'))) do fio.unlink(path) end box.space.test:insert{3} -- Check error reporting. -test_run:cmd("switch master_quorum1") +test_run:cmd("switch show_error_on_disconnect1") box.cfg{replication = repl} require('fiber').sleep(0.1) box.space.test:select() other_id = box.info.id % 2 + 1 -box.info.replication[other_id].upstream.status +test_run:wait_cond(function() return box.info.replication[other_id].upstream.status == 'stopped' end) or box.info.replication[other_id].upstream.status box.info.replication[other_id].upstream.message:match("Missing") -test_run:cmd("switch master_quorum2") +test_run:cmd("switch show_error_on_disconnect2") box.space.test:select() other_id = box.info.id % 2 + 1 -box.info.replication[other_id].upstream.status +test_run:wait_cond(function() return box.info.replication[other_id].upstream.status == 'follow' end) or box.info.replication[other_id].upstream.status box.info.replication[other_id].upstream.message -box.info.replication[other_id].downstream.status +test_run:wait_cond(function() return box.info.replication[other_id].downstream.status == 'stopped' end) or box.info.replication[other_id].downstream.status box.info.replication[other_id].downstream.message:match("Missing") test_run:cmd("switch default") -- Cleanup. 
diff --git a/test/replication/show_error_on_disconnect1.lua b/test/replication/show_error_on_disconnect1.lua new file mode 120000 index 000000000..d3e693ad6 --- /dev/null +++ b/test/replication/show_error_on_disconnect1.lua @@ -0,0 +1 @@ +show_error_on_disconnect.lua \ No newline at end of file diff --git a/test/replication/show_error_on_disconnect2.lua b/test/replication/show_error_on_disconnect2.lua new file mode 120000 index 000000000..d3e693ad6 --- /dev/null +++ b/test/replication/show_error_on_disconnect2.lua @@ -0,0 +1 @@ +show_error_on_disconnect.lua \ No newline at end of file diff --git a/test/replication/show_error_on_disconnect3.lua b/test/replication/show_error_on_disconnect3.lua new file mode 120000 index 000000000..d3e693ad6 --- /dev/null +++ b/test/replication/show_error_on_disconnect3.lua @@ -0,0 +1 @@ +show_error_on_disconnect.lua \ No newline at end of file diff --git a/test/replication/skip_conflict_row.result b/test/replication/skip_conflict_row.result index 9b2777872..75c5bebc2 100644 --- a/test/replication/skip_conflict_row.result +++ b/test/replication/skip_conflict_row.result @@ -1,10 +1,10 @@ -env = require('test_run') +test_run = require('test_run').new() --- ... -test_run = env.new() +engine = test_run:get_cfg('engine') --- ... -engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') --- ... box.schema.user.grant('guest', 'replication') @@ -16,21 +16,23 @@ space = box.schema.space.create('test', {engine = engine}); index = box.space.test:create_index('primary') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'skip_conflict_row') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server skip_conflict_row") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") --- - true ... box.cfg{replication_skip_conflict = true} --- ... +box.ctl.wait_rw() -- success +--- +... 
box.space.test:insert{1} --- - [1] @@ -47,17 +49,17 @@ space:insert{2} --- - [2] ... -box.info.status +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... vclock = test_run:get_vclock('default') --- ... -_ = test_run:wait_vclock("replica", vclock) +_ = test_run:wait_vclock("skip_conflict_row", vclock) --- ... -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") --- - true ... @@ -65,9 +67,9 @@ box.info.replication[1].upstream.message --- - null ... -box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- -- follow +- true ... box.space.test:select() --- @@ -78,17 +80,17 @@ test_run:cmd("switch default") --- - true ... -box.info.status +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... -- gh-2283: test that if replication_skip_conflict is off vclock -- is not advanced on errors. -test_run:cmd("restart server replica") +test_run:cmd("restart server skip_conflict_row") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") --- - true ... @@ -111,7 +113,7 @@ box.space.test:insert{4} --- - [4] ... -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") --- - true ... @@ -120,19 +122,19 @@ lsn1 == box.info.vclock[1] --- - true ... -box.info.replication[1].upstream.message +test_run:wait_cond(function() return box.info.replication[1].upstream.message == "Duplicate key exists in unique index 'primary' in space 'test'" end) or box.info.replication[1].upstream.message --- -- Duplicate key exists in unique index 'primary' in space 'test' +- true ... 
-box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'stopped' end) or box.info.replication[1].upstream.status --- -- stopped +- true ... test_run:cmd("switch default") --- - true ... -test_run:cmd("restart server replica") +test_run:cmd("restart server skip_conflict_row") --- - true ... @@ -161,17 +163,17 @@ test_run:cmd("switch default") box.space.test:truncate() --- ... -test_run:cmd("restart server replica") +test_run:cmd("restart server skip_conflict_row") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") --- - true ... -box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- -- follow +- true ... -- write some conflicting records on slave for i = 1, 10 do box.space.test:insert({i, 'r'}) end @@ -191,43 +193,46 @@ test_run:cmd("switch default") for i = 1, 10 do box.space.test:insert({i, 'm'}) end --- ... -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") +--- +- true +... +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- - true ... -- lsn should be incremented -v1 == box.info.vclock[1] - 10 +test_run:wait_cond(function() return v1 == box.info.vclock[1] - 10 end) or box.info.vclock[1] --- - true ... --- and state is follow -box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- -- follow +- true ... -- restart server and check replication continues from nop-ed vclock test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server skip_conflict_row") --- - true ... for i = 11, 20 do box.space.test:insert({i, 'm'}) end --- ... 
-test_run:cmd("start server replica") +test_run:cmd("start server skip_conflict_row") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") --- - true ... -box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- -- follow +- true ... box.space.test:select({11}, {iterator = "GE"}) --- @@ -247,15 +252,7 @@ test_run:cmd("switch default") - true ... -- cleanup -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'skip_conflict_row') --- - true ... diff --git a/test/replication/skip_conflict_row.test.lua b/test/replication/skip_conflict_row.test.lua index 2982c730a..9b1879b0e 100644 --- a/test/replication/skip_conflict_row.test.lua +++ b/test/replication/skip_conflict_row.test.lua @@ -1,49 +1,50 @@ -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') box.schema.user.grant('guest', 'replication') space = box.schema.space.create('test', {engine = engine}); index = box.space.test:create_index('primary') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") -test_run:cmd("switch replica") +replica_set.create(test_run, 'skip_conflict_row') +test_run:cmd("start server skip_conflict_row") +test_run:cmd("switch skip_conflict_row") box.cfg{replication_skip_conflict = true} +box.ctl.wait_rw() -- success box.space.test:insert{1} test_run:cmd("switch default") space:insert{1, 1} space:insert{2} -box.info.status +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status vclock = test_run:get_vclock('default') -_ = test_run:wait_vclock("replica", vclock) -test_run:cmd("switch 
replica") +_ = test_run:wait_vclock("skip_conflict_row", vclock) +test_run:cmd("switch skip_conflict_row") box.info.replication[1].upstream.message -box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status box.space.test:select() test_run:cmd("switch default") -box.info.status +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status -- gh-2283: test that if replication_skip_conflict is off vclock -- is not advanced on errors. -test_run:cmd("restart server replica") -test_run:cmd("switch replica") +test_run:cmd("restart server skip_conflict_row") +test_run:cmd("switch skip_conflict_row") box.space.test:insert{3} lsn1 = box.info.vclock[1] test_run:cmd("switch default") box.space.test:insert{3, 3} box.space.test:insert{4} -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") -- lsn is not promoted lsn1 == box.info.vclock[1] -box.info.replication[1].upstream.message -box.info.replication[1].upstream.status +test_run:wait_cond(function() return box.info.replication[1].upstream.message == "Duplicate key exists in unique index 'primary' in space 'test'" end) or box.info.replication[1].upstream.message +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'stopped' end) or box.info.replication[1].upstream.status test_run:cmd("switch default") -test_run:cmd("restart server replica") +test_run:cmd("restart server skip_conflict_row") -- applier is not in follow state box.info.replication[1].upstream.message @@ -57,9 +58,9 @@ test_run:cmd("switch default") -- test if nop were really written box.space.test:truncate() -test_run:cmd("restart server replica") -test_run:cmd("switch replica") -box.info.replication[1].upstream.status +test_run:cmd("restart server skip_conflict_row") +test_run:cmd("switch skip_conflict_row") +test_run:wait_cond(function() return 
box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status -- write some conflicting records on slave for i = 1, 10 do box.space.test:insert({i, 'r'}) end box.cfg{replication_skip_conflict = true} @@ -69,26 +70,25 @@ v1 = box.info.vclock[1] test_run:cmd("switch default") for i = 1, 10 do box.space.test:insert({i, 'm'}) end -test_run:cmd("switch replica") +test_run:cmd("switch skip_conflict_row") +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status + -- lsn should be incremented -v1 == box.info.vclock[1] - 10 --- and state is follow -box.info.replication[1].upstream.status +test_run:wait_cond(function() return v1 == box.info.vclock[1] - 10 end) or box.info.vclock[1] +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status -- restart server and check replication continues from nop-ed vclock test_run:cmd("switch default") -test_run:cmd("stop server replica") +test_run:cmd("stop server skip_conflict_row") for i = 11, 20 do box.space.test:insert({i, 'm'}) end -test_run:cmd("start server replica") -test_run:cmd("switch replica") -box.info.replication[1].upstream.status +test_run:cmd("start server skip_conflict_row") +test_run:cmd("switch skip_conflict_row") +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status box.space.test:select({11}, {iterator = "GE"}) test_run:cmd("switch default") -- cleanup -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'skip_conflict_row') test_run:cleanup_cluster() box.space.test:drop() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/status.result b/test/replication/status.result index f9d20a923..c96e69a4c 100644 --- 
a/test/replication/status.result +++ b/test/replication/status.result @@ -1,7 +1,4 @@ -env = require('test_run') ---- -... -test_run = env.new() +test_run = require('test_run').new() --- ... test_run:cmd('restart server default with cleanup=1') @@ -82,11 +79,13 @@ master.downstream == nil - true ... -- Start Master -> Slave replication -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set = require('fast_replica') +--- +... +replica_set.create(test_run, 'status') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server status") --- - true ... @@ -145,7 +144,7 @@ master.downstream == nil - true ... -- replica's status -replica_id = test_run:get_server_id('replica') +replica_id = test_run:get_server_id('status') --- ... box.info.vclock[replica_id] == nil @@ -187,7 +186,7 @@ replica.downstream.vclock[replica_id] == box.info.vclock[replica_id] -- -- Replica -- -test_run:cmd('switch replica') +test_run:cmd('switch status') --- - true ... @@ -222,7 +221,7 @@ master.uuid == box.space._cluster:get(master_id)[2] --- - true ... -master.upstream.status == "follow" +test_run:wait_cond(function() return master.upstream.status == 'follow' end) or master.upstream.status --- - true ... @@ -234,9 +233,9 @@ master.upstream.idle < 1 --- - true ... -master.upstream.peer:match("localhost") +master.upstream.peer:match("unix/") --- -- localhost +- unix/ ... master.downstream == nil --- @@ -282,7 +281,7 @@ replica.downstream == nil -- -- ClientError during replication -- -test_run:cmd('switch replica') +test_run:cmd('switch status') --- - true ... @@ -298,18 +297,11 @@ box.space._schema:insert({'dup'}) --- - ['dup'] ... -test_run:cmd('switch replica') +test_run:cmd('switch status') --- - true ... -r = box.info.replication[1] ---- -... -r.upstream.status == "stopped" ---- -- true -... 
-r.upstream.message:match('Duplicate') ~= nil +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'stopped' and box.info.replication[1].upstream.message:match('Duplicate') ~= nil end) --- - true ... @@ -325,7 +317,7 @@ test_run:cmd("push filter ', lsn: [0-9]+' to ', lsn: '") --- - true ... -test_run:grep_log('replica', 'error applying row: .*') +test_run:grep_log('status', 'error applying row: .*') --- - 'error applying row: {type: ''INSERT'', replica_id: 1, lsn: , space_id: 272, index_id: 0, tuple: ["dup"]}' @@ -337,18 +329,18 @@ test_run:cmd("clear filter") -- -- Check box.info.replication login -- -test_run:cmd('switch replica') +test_run:cmd('switch status') --- - true ... -test_run:cmd("set variable master_port to 'replica.master'") +test_run:cmd("set variable master_port to 'status.master'") --- - true ... replica_uri = os.getenv("LISTEN") --- ... -box.cfg{replication = {"guest@localhost:" .. master_port, replica_uri}} +box.cfg{replication = {"guest@unix/:" .. master_port, replica_uri}} --- ... master_id = test_run:get_server_id('default') @@ -369,9 +361,9 @@ master.upstream.peer:match("guest") --- - guest ... -master.upstream.peer:match("localhost") +master.upstream.peer:match("unix/") --- -- localhost +- unix/ ... master.downstream == nil --- @@ -387,15 +379,7 @@ test_run:cmd('switch default') box.schema.user.revoke('guest', 'replication') --- ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'status') --- - true ... 
diff --git a/test/replication/status.test.lua b/test/replication/status.test.lua index 8a82fe9ae..dffa0696d 100644 --- a/test/replication/status.test.lua +++ b/test/replication/status.test.lua @@ -1,5 +1,5 @@ -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() + test_run:cmd('restart server default with cleanup=1') test_run:cmd('switch default') @@ -32,8 +32,9 @@ master.upstream == nil master.downstream == nil -- Start Master -> Slave replication -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") +replica_set = require('fast_replica') +replica_set.create(test_run, 'status') +test_run:cmd("start server status") -- -- Master @@ -56,7 +57,7 @@ master.upstream == nil master.downstream == nil -- replica's status -replica_id = test_run:get_server_id('replica') +replica_id = test_run:get_server_id('status') box.info.vclock[replica_id] == nil replica = box.info.replication[replica_id] replica.id == replica_id @@ -71,7 +72,7 @@ replica.downstream.vclock[replica_id] == box.info.vclock[replica_id] -- -- Replica -- -test_run:cmd('switch replica') +test_run:cmd('switch status') #box.info.vclock == 1 -- box.info.vclock[replica_id] is nil #box.info.replication == 2 @@ -83,10 +84,10 @@ box.info.vclock[master_id] == 2 master = box.info.replication[master_id] master.id == master_id master.uuid == box.space._cluster:get(master_id)[2] -master.upstream.status == "follow" +test_run:wait_cond(function() return master.upstream.status == 'follow' end) or master.upstream.status master.upstream.lag < 1 master.upstream.idle < 1 -master.upstream.peer:match("localhost") +master.upstream.peer:match("unix/") master.downstream == nil -- replica's status @@ -105,34 +106,32 @@ replica.downstream == nil -- -- ClientError during replication -- -test_run:cmd('switch replica') +test_run:cmd('switch status') box.space._schema:insert({'dup'}) test_run:cmd('switch default') 
box.space._schema:insert({'dup'}) -test_run:cmd('switch replica') -r = box.info.replication[1] -r.upstream.status == "stopped" -r.upstream.message:match('Duplicate') ~= nil +test_run:cmd('switch status') +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'stopped' and box.info.replication[1].upstream.message:match('Duplicate') ~= nil end) test_run:cmd('switch default') box.space._schema:delete({'dup'}) test_run:cmd("push filter ', lsn: [0-9]+' to ', lsn: '") -test_run:grep_log('replica', 'error applying row: .*') +test_run:grep_log('status', 'error applying row: .*') test_run:cmd("clear filter") -- -- Check box.info.replication login -- -test_run:cmd('switch replica') -test_run:cmd("set variable master_port to 'replica.master'") +test_run:cmd('switch status') +test_run:cmd("set variable master_port to 'status.master'") replica_uri = os.getenv("LISTEN") -box.cfg{replication = {"guest@localhost:" .. master_port, replica_uri}} +box.cfg{replication = {"guest@unix/:" .. 
master_port, replica_uri}} master_id = test_run:get_server_id('default') master = box.info.replication[master_id] master.id == master_id master.upstream.status == "follow" master.upstream.peer:match("guest") -master.upstream.peer:match("localhost") +master.upstream.peer:match("unix/") master.downstream == nil test_run:cmd('switch default') @@ -141,7 +140,5 @@ test_run:cmd('switch default') -- Cleanup -- box.schema.user.revoke('guest', 'replication') -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'status') test_run:cleanup_cluster() diff --git a/test/replication/suite.ini b/test/replication/suite.ini index 6e9e3edd0..aa79f68f3 100644 --- a/test/replication/suite.ini +++ b/test/replication/suite.ini @@ -7,5 +7,6 @@ release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.l config = suite.cfg lua_libs = lua/fast_replica.lua lua/rlimit.lua use_unix_sockets = True +use_unix_sockets_iproto = True long_run = prune.test.lua is_parallel = True diff --git a/test/replication/sync.result b/test/replication/sync.result index b34501dae..17714cd3b 100644 --- a/test/replication/sync.result +++ b/test/replication/sync.result @@ -1,12 +1,15 @@ -fiber = require('fiber') +test_run = require('test_run').new() --- ... -test_run = require('test_run').new() +fiber = require('fiber') --- ... engine = test_run:get_cfg('engine') --- ... +replica_set = require('fast_replica') +--- +... test_run:cleanup_cluster() --- ... @@ -51,7 +54,7 @@ function fill() local r = box.info.replication[2] return r ~= nil and r.downstream ~= nil and r.downstream.status ~= 'stopped' - end, 10) + end) for i = count + 101, count + 200 do box.space.test:replace{i} end @@ -66,15 +69,14 @@ test_run:cmd("setopt delimiter ''"); - true ... -- Deploy a replica. 
-test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'sync') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server sync") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch sync") --- - true ... @@ -93,10 +95,11 @@ test_run:cmd("switch default") fill() --- ... -test_run:cmd("switch replica") +test_run:cmd("switch sync") --- - true ... +----------------------------------------------------------------------------------------------------- -- Resume replication. -- -- Since max allowed lag is small, all records should arrive @@ -108,13 +111,13 @@ box.cfg{replication_sync_lag = 0.001} box.cfg{replication = replication} --- ... -box.space.test:count() +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- 200 +- true ... -box.info.status -- running +box.space.test:count() --- -- running +- 200 ... box.info.ro -- false --- @@ -135,10 +138,11 @@ test_run:cmd("switch default") fill() --- ... -test_run:cmd("switch replica") +test_run:cmd("switch sync") --- - true ... +----------------------------------------------------------------------------------------------------- -- Resume replication -- -- Since max allowed lag is big, not all records will arrive @@ -151,20 +155,12 @@ box.cfg{replication_sync_lag = 1} box.cfg{replication = replication} --- ... -box.space.test:count() < 400 +test_run:wait_cond(function() return box.space.test:count() == 400 or (box.space.test:count() < 400 and box.info.status == 'running' and box.info.ro) end) or box.info.status --- - true ... -box.info.status -- running ---- -- running -... -box.info.ro -- false ---- -- false -... -- Wait for remaining rows to arrive. -test_run:wait_cond(function() return box.space.test:count() == 400 end, 10) +test_run:wait_cond(function() return box.space.test:count() == 400 end) or box.space.test:count() --- - true ... 
@@ -183,10 +179,11 @@ test_run:cmd("switch default") fill() --- ... -test_run:cmd("switch replica") +test_run:cmd("switch sync") --- - true ... +----------------------------------------------------------------------------------------------------- -- Resume replication -- -- Although max allowed lag is small, box.cfg() will fail to @@ -199,31 +196,23 @@ box.cfg{replication_sync_lag = 0.001, replication_sync_timeout = 0.001} box.cfg{replication = replication} --- ... -box.space.test:count() < 600 ---- -- true -... -box.info.status -- orphan ---- -- orphan -... -box.info.ro -- true +test_run:wait_cond(function() return box.space.test:count() == 600 or (box.space.test:count() < 600 and box.info.status == 'orphan' and box.info.ro) end) or box.info.status --- - true ... -- Wait for remaining rows to arrive. -test_run:wait_cond(function() return box.space.test:count() == 600 end, 10) +test_run:wait_cond(function() return box.space.test:count() == 600 end) or box.space.test:count() --- - true ... -- Make sure replica leaves oprhan state. -test_run:wait_cond(function() return box.info.status ~= 'orphan' end, 10) +test_run:wait_cond(function() return box.info.status ~= 'orphan' end) --- - true ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... box.info.ro -- false --- @@ -237,13 +226,13 @@ box.info.ro -- false -- ER_CFG "duplicate connection with the same replica UUID" error. -- It should print it to the log, but keep trying to synchronize. -- Eventually, it should leave box.cfg() following the master. -box.cfg{replication_timeout = 0.1} +box.cfg{replication_timeout = 0.01} --- ... -box.cfg{replication_sync_lag = 1} +box.cfg{replication_sync_lag = 0.1} --- ... -box.cfg{replication_sync_timeout = 10} +box.cfg{replication_sync_timeout = 50} --- ... test_run:cmd("switch default") @@ -273,7 +262,7 @@ test_run:cmd("setopt delimiter ''"); --- - true ... 
-test_run:cmd("switch replica") +test_run:cmd("switch sync") --- - true ... @@ -286,19 +275,19 @@ box.cfg{replication = {}} box.cfg{replication = replication} --- ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status ~= 'orphan' end) --- -- running +- true ... box.info.ro -- false --- - false ... -box.info.replication[1].upstream.status -- follow +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status --- -- follow +- true ... -test_run:grep_log('replica', 'ER_CFG.*') +test_run:wait_log("sync", "ER_CFG.*", nil, 200) --- - 'ER_CFG: Incorrect value for option ''replication'': duplicate connection with the same replica UUID' @@ -307,7 +296,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") +test_run:cmd("stop server sync") --- - true ... @@ -324,17 +313,17 @@ box.error.injection.set('ERRINJ_WAL_WRITE_DISK', false) --- - ok ... -test_run:cmd("start server replica") +test_run:cmd("start server sync") --- - true ... -test_run:cmd("switch replica") +test_run:cmd("switch sync") --- - true ... -box.info.status -- running +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status --- -- running +- true ... box.info.ro -- false --- @@ -344,15 +333,7 @@ test_run:cmd("switch default") --- - true ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'sync') --- - true ... 
diff --git a/test/replication/sync.test.lua b/test/replication/sync.test.lua index cae97a26f..66e5fe5dd 100644 --- a/test/replication/sync.test.lua +++ b/test/replication/sync.test.lua @@ -1,6 +1,7 @@ -fiber = require('fiber') test_run = require('test_run').new() +fiber = require('fiber') engine = test_run:get_cfg('engine') +replica_set = require('fast_replica') test_run:cleanup_cluster() @@ -35,7 +36,7 @@ function fill() local r = box.info.replication[2] return r ~= nil and r.downstream ~= nil and r.downstream.status ~= 'stopped' - end, 10) + end) for i = count + 101, count + 200 do box.space.test:replace{i} end @@ -46,9 +47,9 @@ end; test_run:cmd("setopt delimiter ''"); -- Deploy a replica. -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") -test_run:cmd("switch replica") +replica_set.create(test_run, 'sync') +test_run:cmd("start server sync") +test_run:cmd("switch sync") -- Stop replication. replication = box.cfg.replication @@ -57,8 +58,9 @@ box.cfg{replication = {}} -- Fill the space. test_run:cmd("switch default") fill() -test_run:cmd("switch replica") +test_run:cmd("switch sync") +----------------------------------------------------------------------------------------------------- -- Resume replication. -- -- Since max allowed lag is small, all records should arrive @@ -66,8 +68,8 @@ test_run:cmd("switch replica") -- box.cfg{replication_sync_lag = 0.001} box.cfg{replication = replication} +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status box.space.test:count() -box.info.status -- running box.info.ro -- false -- Stop replication. @@ -77,8 +79,9 @@ box.cfg{replication = {}} -- Fill the space. 
test_run:cmd("switch default") fill() -test_run:cmd("switch replica") +test_run:cmd("switch sync") +----------------------------------------------------------------------------------------------------- -- Resume replication -- -- Since max allowed lag is big, not all records will arrive @@ -87,12 +90,10 @@ test_run:cmd("switch replica") -- box.cfg{replication_sync_lag = 1} box.cfg{replication = replication} -box.space.test:count() < 400 -box.info.status -- running -box.info.ro -- false +test_run:wait_cond(function() return box.space.test:count() == 400 or (box.space.test:count() < 400 and box.info.status == 'running' and box.info.ro) end) or box.info.status -- Wait for remaining rows to arrive. -test_run:wait_cond(function() return box.space.test:count() == 400 end, 10) +test_run:wait_cond(function() return box.space.test:count() == 400 end) or box.space.test:count() -- Stop replication. replication = box.cfg.replication @@ -101,8 +102,9 @@ box.cfg{replication = {}} -- Fill the space. test_run:cmd("switch default") fill() -test_run:cmd("switch replica") +test_run:cmd("switch sync") +----------------------------------------------------------------------------------------------------- -- Resume replication -- -- Although max allowed lag is small, box.cfg() will fail to @@ -111,16 +113,14 @@ test_run:cmd("switch replica") -- box.cfg{replication_sync_lag = 0.001, replication_sync_timeout = 0.001} box.cfg{replication = replication} -box.space.test:count() < 600 -box.info.status -- orphan -box.info.ro -- true +test_run:wait_cond(function() return box.space.test:count() == 600 or (box.space.test:count() < 600 and box.info.status == 'orphan' and box.info.ro) end) or box.info.status -- Wait for remaining rows to arrive. -test_run:wait_cond(function() return box.space.test:count() == 600 end, 10) +test_run:wait_cond(function() return box.space.test:count() == 600 end) or box.space.test:count() -- Make sure replica leaves oprhan state. 
-test_run:wait_cond(function() return box.info.status ~= 'orphan' end, 10) -box.info.status -- running +test_run:wait_cond(function() return box.info.status ~= 'orphan' end) +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status box.info.ro -- false -- gh-3636: Check that replica set sync doesn't stop on cfg errors. @@ -131,9 +131,9 @@ box.info.ro -- false -- ER_CFG "duplicate connection with the same replica UUID" error. -- It should print it to the log, but keep trying to synchronize. -- Eventually, it should leave box.cfg() following the master. -box.cfg{replication_timeout = 0.1} -box.cfg{replication_sync_lag = 1} -box.cfg{replication_sync_timeout = 10} +box.cfg{replication_timeout = 0.01} +box.cfg{replication_sync_lag = 0.1} +box.cfg{replication_sync_timeout = 50} test_run:cmd("switch default") box.error.injection.set('ERRINJ_WAL_DELAY', true) @@ -146,32 +146,30 @@ _ = fiber.create(function() box.error.injection.set('ERRINJ_WAL_DELAY', false) end); test_run:cmd("setopt delimiter ''"); -test_run:cmd("switch replica") +test_run:cmd("switch sync") replication = box.cfg.replication box.cfg{replication = {}} box.cfg{replication = replication} -box.info.status -- running +test_run:wait_cond(function() return box.info.status ~= 'orphan' end) box.info.ro -- false -box.info.replication[1].upstream.status -- follow -test_run:grep_log('replica', 'ER_CFG.*') +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) or box.info.replication[1].upstream.status +test_run:wait_log("sync", "ER_CFG.*", nil, 200) test_run:cmd("switch default") -test_run:cmd("stop server replica") +test_run:cmd("stop server sync") -- gh-3830: Sync fails if there's a gap at the end of the master's WAL. 
box.error.injection.set('ERRINJ_WAL_WRITE_DISK', true) box.space.test:replace{123456789} box.error.injection.set('ERRINJ_WAL_WRITE_DISK', false) -test_run:cmd("start server replica") -test_run:cmd("switch replica") -box.info.status -- running +test_run:cmd("start server sync") +test_run:cmd("switch sync") +test_run:wait_cond(function() return box.info.status == 'running' end) or box.info.status box.info.ro -- false test_run:cmd("switch default") -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'sync') test_run:cleanup_cluster() box.space.test:drop() diff --git a/test/replication/wal_off.result b/test/replication/wal_off.result index e0ae84bd7..f964b066e 100644 --- a/test/replication/wal_off.result +++ b/test/replication/wal_off.result @@ -1,10 +1,10 @@ -- -- gh-1233: JOIN/SUBSCRIBE must fail if master has wal_mode = "none" -- -env = require('test_run') +test_run = require('test_run').new() --- ... -test_run = env.new() +replica_set = require('fast_replica') --- ... test_run:cmd('switch default') @@ -17,9 +17,8 @@ fiber = require('fiber') box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server wal_off with rpl_master=default, script='replication/wal_off.lua'") +replica_set.create(test_run, 'wal_off', 'wal_off') --- -- true ... test_run:cmd("start server wal_off") --- @@ -93,21 +92,13 @@ box.cfg { replication_sync_timeout = replication_sync_timeout } check = "Read access to universe" --- ... -while string.find(box.info.replication[wal_off_id].upstream.message, check) == nil do fiber.sleep(0.01) end +while (box.info.replication[wal_off_id].upstream.message == nil or string.find(box.info.replication[wal_off_id].upstream.message, check) == nil) do fiber.sleep(0.01) end --- ... box.cfg { replication = "" } --- ... -test_run:cmd("stop server wal_off") ---- -- true -... -test_run:cmd("cleanup server wal_off") ---- -- true -... 
-test_run:cmd("delete server wal_off") +replica_set.drop(test_run, 'wal_off') --- - true ... diff --git a/test/replication/wal_off.test.lua b/test/replication/wal_off.test.lua index 110f2f1f7..10069599a 100644 --- a/test/replication/wal_off.test.lua +++ b/test/replication/wal_off.test.lua @@ -2,12 +2,13 @@ -- gh-1233: JOIN/SUBSCRIBE must fail if master has wal_mode = "none" -- -env = require('test_run') -test_run = env.new() +test_run = require('test_run').new() +replica_set = require('fast_replica') + test_run:cmd('switch default') fiber = require('fiber') box.schema.user.grant('guest', 'replication') -test_run:cmd("create server wal_off with rpl_master=default, script='replication/wal_off.lua'") +replica_set.create(test_run, 'wal_off', 'wal_off') test_run:cmd("start server wal_off") test_run:cmd('switch default') wal_off_uri = test_run:eval('wal_off', 'return box.cfg.listen')[1] @@ -32,12 +33,10 @@ box.cfg { replication_sync_timeout = 0.01 } box.cfg { replication = wal_off_uri } box.cfg { replication_sync_timeout = replication_sync_timeout } check = "Read access to universe" -while string.find(box.info.replication[wal_off_id].upstream.message, check) == nil do fiber.sleep(0.01) end +while (box.info.replication[wal_off_id].upstream.message == nil or string.find(box.info.replication[wal_off_id].upstream.message, check) == nil) do fiber.sleep(0.01) end box.cfg { replication = "" } -test_run:cmd("stop server wal_off") -test_run:cmd("cleanup server wal_off") -test_run:cmd("delete server wal_off") +replica_set.drop(test_run, 'wal_off') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/wal_rw_stress.result b/test/replication/wal_rw_stress.result index cc68877b0..1f88d86b1 100644 --- a/test/replication/wal_rw_stress.result +++ b/test/replication/wal_rw_stress.result @@ -1,6 +1,9 @@ test_run = require('test_run').new() --- ... +replica_set = require('fast_replica') +--- +... 
-- -- gh-3893: Replication failure: relay may report that an xlog -- is corrupted if it it currently being written to. @@ -15,20 +18,19 @@ _ = s:create_index('primary') box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +replica_set.create(test_run, 'wal_rw_stress') --- -- true ... -test_run:cmd("start server replica") +test_run:cmd("start server wal_rw_stress") --- - true ... -- Setup replica => master channel. -box.cfg{replication = test_run:cmd("eval replica 'return box.cfg.listen'")} +box.cfg{replication = test_run:cmd("eval wal_rw_stress 'return box.cfg.listen'")} --- ... --- Disable master => replica channel. -test_run:cmd("switch replica") +-- Disable master => wal_rw_stress channel. +test_run:cmd("switch wal_rw_stress") --- - true ... @@ -60,11 +62,11 @@ test_run:cmd("setopt delimiter ''"); --- - true ... --- Enable master => replica channel and wait for the replica to catch up. --- The relay handling replica => master channel on the replica will read +-- Enable master => wal_rw_stress channel and wait for the replica to catch up. +-- The relay handling wal_rw_stress => master channel on the replica will read -- an xlog while the applier is writing to it. Although applier and relay -- are running in different threads, there shouldn't be any rw errors. -test_run:cmd("switch replica") +test_run:cmd("switch wal_rw_stress") --- - true ... @@ -83,15 +85,7 @@ test_run:cmd("switch default") box.cfg{replication = {}} --- ... -test_run:cmd("stop server replica") ---- -- true -... -test_run:cmd("cleanup server replica") ---- -- true -... -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'wal_rw_stress') --- - true ... 
diff --git a/test/replication/wal_rw_stress.test.lua b/test/replication/wal_rw_stress.test.lua index 08570b285..ff3d589b7 100644 --- a/test/replication/wal_rw_stress.test.lua +++ b/test/replication/wal_rw_stress.test.lua @@ -1,4 +1,5 @@ test_run = require('test_run').new() +replica_set = require('fast_replica') -- -- gh-3893: Replication failure: relay may report that an xlog @@ -9,14 +10,14 @@ _ = s:create_index('primary') -- Deploy a replica. box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") +replica_set.create(test_run, 'wal_rw_stress') +test_run:cmd("start server wal_rw_stress") -- Setup replica => master channel. -box.cfg{replication = test_run:cmd("eval replica 'return box.cfg.listen'")} +box.cfg{replication = test_run:cmd("eval wal_rw_stress 'return box.cfg.listen'")} --- Disable master => replica channel. -test_run:cmd("switch replica") +-- Disable master => wal_rw_stress channel. +test_run:cmd("switch wal_rw_stress") replication = box.cfg.replication box.cfg{replication = {}} test_run:cmd("switch default") @@ -32,20 +33,18 @@ for i = 1, 100 do end; test_run:cmd("setopt delimiter ''"); --- Enable master => replica channel and wait for the replica to catch up. --- The relay handling replica => master channel on the replica will read +-- Enable master => wal_rw_stress channel and wait for the replica to catch up. +-- The relay handling wal_rw_stress => master channel on the replica will read -- an xlog while the applier is writing to it. Although applier and relay -- are running in different threads, there shouldn't be any rw errors. -test_run:cmd("switch replica") +test_run:cmd("switch wal_rw_stress") box.cfg{replication = replication} box.info.replication[1].downstream.status ~= 'stopped' or box.info test_run:cmd("switch default") -- Cleanup. 
box.cfg{replication = {}} -test_run:cmd("stop server replica") -test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") +replica_set.drop(test_run, 'wal_rw_stress') test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') s:drop() -- 2.17.1