From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp31.i.mail.ru (smtp31.i.mail.ru [94.100.177.91]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 26B06469719 for ; Fri, 13 Nov 2020 09:03:59 +0300 (MSK) From: "Alexander V. Tikhonov" Date: Fri, 13 Nov 2020 09:03:57 +0300 Message-Id: <6174ccd36e833128fa4da137a25e2be3ca5ffde3.1605247383.git.avtikhon@tarantool.org> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH v1] test: fix flaky election_qsync_stress test List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Serge Petrenko , Kirill Yukhin Cc: tarantool-patches@dev.tarantool.org Found that replication/election_qsync_stress.test.lua test may fail on restating instances. It occures on heavy loaded hosts when its local call to stop instance using SIGTERM fails to stop it. Decided to use SIGKILL in local stop call options to be sure that the instance will be stopped. Also found that running loop inline new hangs occured on server start: --- replication/election_qsync_stress.result Thu Nov 12 16:23:16 2020 +++ var/128_replication/election_qsync_stress.result Thu Nov 12 16:31:22 2020 @@ -323,7 +323,7 @@ | ... test_run:wait_cond(function() return c.space.test ~= nil and c.space.test:get{i} ~= nil end) | --- - | - true + | - false | ... @@ -380,7 +380,7 @@ | ... test_run:wait_cond(function() return c.space.test ~= nil and c.space.test:get{i} ~= nil end) | --- - | - true + | - false | ... @@ -494,687 +494,3 @@ | --- | ... test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"') - | --- - | - true - | … but the test already failed before on getting 'c.space.test:get{i}'. To avoid of the hang and make test code more correct running way it were added log.error messages and return calls. Also the test was changed to use function for each loop iteration to be able to check return values and break the loop just after the fail. Needed for #5395 --- Github: https://github.com/tarantool/tarantool/tree/avtikhon/fix-election_qsync_stress Issue: https://github.com/tarantool/tarantool/issues/5395 test/replication/election_qsync_stress.result | 33 ++++++++++++++----- .../election_qsync_stress.test.lua | 29 +++++++++++----- test/replication/suite.ini | 2 +- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/test/replication/election_qsync_stress.result b/test/replication/election_qsync_stress.result index 9fab2f1d7..cdb00e34c 100644 --- a/test/replication/election_qsync_stress.result +++ b/test/replication/election_qsync_stress.result @@ -5,6 +5,9 @@ test_run = require('test_run').new() netbox = require('net.box') | --- | ... +log = require('log') + | --- + | ... -- -- gh-1146: Leader election + Qsync @@ -83,23 +86,32 @@ test_run:cmd('setopt delimiter ";"') | --- | - true | ... -for i = 1,10 do +function run_iter(i) c:eval('box.cfg{replication_synchro_quorum=4, replication_synchro_timeout=1000}') c.space.test:insert({i}, {is_async=true}) - test_run:wait_cond(function() return c.space.test:get{i} ~= nil end) - test_run:cmd('stop server '..old_leader) + if not test_run:wait_cond(function() return c.space.test ~= nil + and c.space.test:get{i} ~= nil end) then + log.error('error: hanged on first call to c.space.test:get(' .. i .. ')') + return false + end + test_run:cmd('stop server '..old_leader..' with signal=KILL') nrs[old_leader_nr] = false - new_leader_nr = get_leader(nrs) - new_leader = 'election_replica'..new_leader_nr - leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1] + local new_leader_nr = get_leader(nrs) + local new_leader = 'election_replica'..new_leader_nr + local leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1] c = netbox.connect(leader_port) c:eval('box.cfg{replication_synchro_timeout=1000}') c.space._schema:replace{'smth'} - c.space.test:get{i} + if not test_run:wait_cond(function() return c.space.test ~= nil + and c.space.test:get{i} ~= nil end) then + log.error('error: hanged on second call to c.space.test:get(' .. i .. ')') + return false + end test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"') nrs[old_leader_nr] = true old_leader_nr = new_leader_nr old_leader = new_leader + return true end; | --- | ... @@ -107,8 +119,13 @@ test_run:cmd('setopt delimiter ""'); | --- | - true | ... + +for i = 1,10 do if not run_iter(i) then break end end + | --- + | ... + -- We're connected to some leader. -#c.space.test:select{} == 10 or require('log').error(c.space.test:select{}) +#c.space.test:select{} == 10 or log.error(c.space.test:select{}) | --- | - true | ... diff --git a/test/replication/election_qsync_stress.test.lua b/test/replication/election_qsync_stress.test.lua index 0ba15eef7..8b654d063 100644 --- a/test/replication/election_qsync_stress.test.lua +++ b/test/replication/election_qsync_stress.test.lua @@ -1,5 +1,6 @@ test_run = require('test_run').new() netbox = require('net.box') +log = require('log') -- -- gh-1146: Leader election + Qsync @@ -47,26 +48,38 @@ _ = c:eval('box.space.test:create_index("pk")') -- Insert some data to a synchronous space, then kill the leader before the -- confirmation is written. Check successful confirmation on the new leader. test_run:cmd('setopt delimiter ";"') -for i = 1,10 do +function run_iter(i) c:eval('box.cfg{replication_synchro_quorum=4, replication_synchro_timeout=1000}') c.space.test:insert({i}, {is_async=true}) - test_run:wait_cond(function() return c.space.test:get{i} ~= nil end) - test_run:cmd('stop server '..old_leader) + if not test_run:wait_cond(function() return c.space.test ~= nil + and c.space.test:get{i} ~= nil end) then + log.error('error: hanged on first call to c.space.test:get(' .. i .. ')') + return false + end + test_run:cmd('stop server '..old_leader..' with signal=KILL') nrs[old_leader_nr] = false - new_leader_nr = get_leader(nrs) - new_leader = 'election_replica'..new_leader_nr - leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1] + local new_leader_nr = get_leader(nrs) + local new_leader = 'election_replica'..new_leader_nr + local leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1] c = netbox.connect(leader_port) c:eval('box.cfg{replication_synchro_timeout=1000}') c.space._schema:replace{'smth'} - c.space.test:get{i} + if not test_run:wait_cond(function() return c.space.test ~= nil + and c.space.test:get{i} ~= nil end) then + log.error('error: hanged on second call to c.space.test:get(' .. i .. ')') + return false + end test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"') nrs[old_leader_nr] = true old_leader_nr = new_leader_nr old_leader = new_leader + return true end; test_run:cmd('setopt delimiter ""'); + +for i = 1,10 do if not run_iter(i) then break end end + -- We're connected to some leader. -#c.space.test:select{} == 10 or require('log').error(c.space.test:select{}) +#c.space.test:select{} == 10 or log.error(c.space.test:select{}) test_run:drop_cluster(SERVERS) diff --git a/test/replication/suite.ini b/test/replication/suite.ini index 34ee32550..c456678c1 100644 --- a/test/replication/suite.ini +++ b/test/replication/suite.ini @@ -119,7 +119,7 @@ fragile = { }, "election_qsync_stress.test.lua": { "issues": [ "gh-5395" ], - "checksums": [ "634bda94accdcdef7b1db3e14f28f445", "36bcdae426c18a60fd13025c09f197d0", "209c865525154a91435c63850f15eca0", "ca38ff2cdfa65b3defb26607b24454c6", "3b6c573d2aa06ee382d6f27b6a76657b", "bcf08e055cf3ccd9958af25d0fba29f8", "411e7462760bc3fc968b68b4b7267ea1", "37e671aea27e3396098f9d13c7fa7316" ] + "checksums": [ "0ace688f0a267c44c37a5f904fb5d6ce" ] }, "gh-3711-misc-no-restart-on-same-configuration.test.lua": { "issues": [ "gh-5407" ], -- 2.25.1