From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp16.mail.ru (smtp16.mail.ru [94.100.176.153]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 89120469719 for ; Fri, 2 Oct 2020 13:33:25 +0300 (MSK) From: Serge Petrenko Date: Fri, 2 Oct 2020 13:33:12 +0300 Message-Id: <20201002103312.23042-1-sergepetrenko@tarantool.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH] raft: add a test with synchronous replication List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: v.shpilevoy@tarantool.org Cc: tarantool-patches@dev.tarantool.org --- Branch: https://github.com/tarantool/tarantool/tree/sp/raft-qsync-test The test is relatively long (runs for 10 seconds on my machine), but I still think it's worth having it, at least under the --long option (I haven't added it to the long_run list yet). test/replication/election_qsync.result | 125 +++++++++++++++++++++++ test/replication/election_qsync.test.lua | 70 +++++++++++++ test/replication/election_replica.lua | 10 +- 3 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 test/replication/election_qsync.result create mode 100644 test/replication/election_qsync.test.lua diff --git a/test/replication/election_qsync.result b/test/replication/election_qsync.result new file mode 100644 index 000000000..1bf13d7bc --- /dev/null +++ b/test/replication/election_qsync.result @@ -0,0 +1,125 @@ +-- test-run result file version 2 +test_run = require('test_run').new() + | --- + | ... +netbox = require('net.box') + | --- + | ... + +-- +-- gh-1146: Leader election + Qsync +-- +test_run:cmd('setopt delimiter ";"') + | --- + | - true + | ... 
+function get_leader(nrs) + local is_leader_cmd = 'return box.info.election.state == \'leader\'' + local leader_nr = 0 + test_run:wait_cond(function() + local leader_count = 0 + for nr, do_check in pairs(nrs) do + if do_check then + local is_leader = test_run:eval('election_replica'..nr, + is_leader_cmd)[1] + if is_leader then + leader_count = leader_count + 1 + leader_nr = nr + end + assert(leader_count <= 1) + end + end + return leader_count == 1 + end) + return leader_nr +end; + | --- + | ... + +test_run:cmd('setopt delimiter ""'); + | --- + | - true + | ... + +SERVERS = {'election_replica1', 'election_replica2', 'election_replica3'} + | --- + | ... +test_run:create_cluster(SERVERS, "replication", {args='2'}) + | --- + | ... +test_run:wait_fullmesh(SERVERS) + | --- + | ... + +nrs = {true, true, true} + | --- + | ... +old_leader_nr = get_leader(nrs) + | --- + | ... +old_leader = 'election_replica'..old_leader_nr + | --- + | ... +leader_port = test_run:eval(old_leader, 'box.cfg.listen')[1] + | --- + | ... +c = netbox.connect(leader_port) + | --- + | ... + +_ = c:eval('box.schema.space.create("test", {is_sync=true})') + | --- + | ... +_ = c:eval('box.space.test:create_index("pk")') + | --- + | ... + +-- Insert some data to a synchronous space, then kill the leader before the +-- confirmation is written. Check successful confirmation on the new leader. +test_run:cmd('setopt delimiter ";"') + | --- + | - true + | ... 
+for i = 1,10 do + c:eval('box.cfg{replication_synchro_quorum=4, replication_synchro_timeout=1000}') + c.space.test:insert({i}, {is_async=true}) + test_run:wait_cond(function() return c.space.test:get{i} ~= nil end) + test_run:cmd('stop server '..old_leader) + nrs[old_leader_nr] = false + new_leader_nr = get_leader(nrs) + new_leader = 'election_replica'..new_leader_nr + leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1] + c = netbox.connect(leader_port) + c:eval('box.ctl.clear_synchro_queue()') + c:eval('box.cfg{replication_synchro_timeout=1000}') + c.space._schema:replace{'smth'} + c.space.test:get{i} + test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2"') + nrs[old_leader_nr] = true + old_leader_nr = new_leader_nr + old_leader = new_leader +end; + | --- + | ... +test_run:cmd('setopt delimiter ""'); + | --- + | - true + | ... +-- We're connected to some leader. +c.space.test:select{} + | --- + | - - [1] + | - [2] + | - [3] + | - [4] + | - [5] + | - [6] + | - [7] + | - [8] + | - [9] + | - [10] + | ... + +test_run:drop_cluster(SERVERS) + | --- + | ... 
diff --git a/test/replication/election_qsync.test.lua b/test/replication/election_qsync.test.lua new file mode 100644 index 000000000..f069c71bb --- /dev/null +++ b/test/replication/election_qsync.test.lua @@ -0,0 +1,70 @@ +test_run = require('test_run').new() +netbox = require('net.box') + +-- +-- gh-1146: Leader election + Qsync +-- +test_run:cmd('setopt delimiter ";"') +function get_leader(nrs) + local is_leader_cmd = 'return box.info.election.state == \'leader\'' + local leader_nr = 0 + test_run:wait_cond(function() + local leader_count = 0 + for nr, do_check in pairs(nrs) do + if do_check then + local is_leader = test_run:eval('election_replica'..nr, + is_leader_cmd)[1] + if is_leader then + leader_count = leader_count + 1 + leader_nr = nr + end + assert(leader_count <= 1) + end + end + return leader_count == 1 + end) + return leader_nr +end; + +test_run:cmd('setopt delimiter ""'); + +SERVERS = {'election_replica1', 'election_replica2', 'election_replica3'} +test_run:create_cluster(SERVERS, "replication", {args='2'}) +test_run:wait_fullmesh(SERVERS) + +nrs = {true, true, true} +old_leader_nr = get_leader(nrs) +old_leader = 'election_replica'..old_leader_nr +leader_port = test_run:eval(old_leader, 'box.cfg.listen')[1] +c = netbox.connect(leader_port) + +_ = c:eval('box.schema.space.create("test", {is_sync=true})') +_ = c:eval('box.space.test:create_index("pk")') + +-- Insert some data to a synchronous space, then kill the leader before the +-- confirmation is written. Check successful confirmation on the new leader. 
+test_run:cmd('setopt delimiter ";"') +for i = 1,10 do + c:eval('box.cfg{replication_synchro_quorum=4, replication_synchro_timeout=1000}') + c.space.test:insert({i}, {is_async=true}) + test_run:wait_cond(function() return c.space.test:get{i} ~= nil end) + test_run:cmd('stop server '..old_leader) + nrs[old_leader_nr] = false + new_leader_nr = get_leader(nrs) + new_leader = 'election_replica'..new_leader_nr + leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1] + c = netbox.connect(leader_port) + c:eval('box.ctl.clear_synchro_queue()') + c:eval('box.cfg{replication_synchro_timeout=1000}') + c.space._schema:replace{'smth'} + c.space.test:get{i} + test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2"') + nrs[old_leader_nr] = true + old_leader_nr = new_leader_nr + old_leader = new_leader +end; +test_run:cmd('setopt delimiter ""'); +-- We're connected to some leader. +c.space.test:select{} + +test_run:drop_cluster(SERVERS) diff --git a/test/replication/election_replica.lua b/test/replication/election_replica.lua index 36ea1f077..887d8a2a0 100644 --- a/test/replication/election_replica.lua +++ b/test/replication/election_replica.lua @@ -2,9 +2,10 @@ local INSTANCE_ID = string.match(arg[0], "%d") local SOCKET_DIR = require('fio').cwd() +local SYNCHRO_QUORUM = arg[1] and tonumber(arg[1]) or 3 local function instance_uri(instance_id) - return SOCKET_DIR..'/autobootstrap'..instance_id..'.sock'; + return SOCKET_DIR..'/election_replica'..instance_id..'.sock'; end require('console').listen(os.getenv('ADMIN')) @@ -19,8 +20,11 @@ box.cfg({ replication_timeout = 0.1, election_is_enabled = true, election_is_candidate = true, - election_timeout = 0.1, - replication_synchro_quorum = 3, + -- Should be at least as big as replication_disconnect_timeout, which is + -- 4 * replication_timeout. + election_timeout = 0.4, + replication_synchro_quorum = SYNCHRO_QUORUM, + replication_synchro_timeout = 0.1, -- To reveal more election logs. 
log_level = 6, }) -- 2.24.3 (Apple Git-128)