[Tarantool-patches] [PATCH v1] test: fix flaky election_qsync_stress test

Alexander V. Tikhonov avtikhon at tarantool.org
Fri Nov 13 09:03:57 MSK 2020


Found that the replication/election_qsync_stress.test.lua test may fail on
restarting instances. This occurs on heavily loaded hosts, when the local
call to stop an instance using SIGTERM fails to stop it. Decided to use
SIGKILL in the local stop call options to make sure the instance will
be stopped.
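
The stop call in the test now passes the KILL signal explicitly
(excerpted from the diff below):

  test_run:cmd('stop server '..old_leader..' with signal=KILL')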

Also found that, with the loop run inline, new hangs occurred on server start:

  --- replication/election_qsync_stress.result    Thu Nov 12 16:23:16 2020
  +++ var/128_replication/election_qsync_stress.result    Thu Nov 12 16:31:22 2020
  @@ -323,7 +323,7 @@
    | ...
       test_run:wait_cond(function() return c.space.test ~= nil and c.space.test:get{i} ~= nil end)
    | ---
  - | - true
  + | - false
    | ...
  @@ -380,7 +380,7 @@
    | ...
       test_run:wait_cond(function() return c.space.test ~= nil and c.space.test:get{i} ~= nil end)
    | ---
  - | - true
  + | - false
    | ...
  @@ -494,687 +494,3 @@
    | ---
    | ...
       test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
  - | ---
  - | - true
  - | …

but the test had already failed earlier, on getting 'c.space.test:get{i}'.
To avoid the hang and make the test flow more robust, log.error messages
and return calls were added. The test was also changed to use a function
for each loop iteration, so that return values can be checked and the
loop can be broken right after a failure.
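
The resulting loop structure looks like this (the leader switch steps
are elided here; see the full diff below):

  function run_iter(i)
      -- wait until the inserted row is visible; bail out on timeout
      if not test_run:wait_cond(function() return c.space.test ~= nil
              and c.space.test:get{i} ~= nil end) then
          log.error('error: hung on first call to c.space.test:get(' .. i .. ')')
          return false
      end
      -- ...
      return true
  end;
  -- stop the whole loop right after the first failed iteration
  for i = 1,10 do if not run_iter(i) then break end end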

Needed for #5395
---

Github: https://github.com/tarantool/tarantool/tree/avtikhon/fix-election_qsync_stress
Issue: https://github.com/tarantool/tarantool/issues/5395

 test/replication/election_qsync_stress.result | 33 ++++++++++++++-----
 .../election_qsync_stress.test.lua            | 29 +++++++++++-----
 test/replication/suite.ini                    |  2 +-
 3 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/test/replication/election_qsync_stress.result b/test/replication/election_qsync_stress.result
index 9fab2f1d7..cdb00e34c 100644
--- a/test/replication/election_qsync_stress.result
+++ b/test/replication/election_qsync_stress.result
@@ -5,6 +5,9 @@ test_run = require('test_run').new()
 netbox = require('net.box')
  | ---
  | ...
+log = require('log')
+ | ---
+ | ...
 
 --
 -- gh-1146: Leader election + Qsync
@@ -83,23 +86,32 @@ test_run:cmd('setopt delimiter ";"')
  | ---
  | - true
  | ...
-for i = 1,10 do
+function run_iter(i)
     c:eval('box.cfg{replication_synchro_quorum=4, replication_synchro_timeout=1000}')
     c.space.test:insert({i}, {is_async=true})
-    test_run:wait_cond(function() return c.space.test:get{i} ~= nil end)
-    test_run:cmd('stop server '..old_leader)
+    if not test_run:wait_cond(function() return c.space.test ~= nil
+            and c.space.test:get{i} ~= nil end) then
+        log.error('error: hung on first call to c.space.test:get(' .. i .. ')')
+        return false
+    end
+    test_run:cmd('stop server '..old_leader..' with signal=KILL')
     nrs[old_leader_nr] = false
-    new_leader_nr = get_leader(nrs)
-    new_leader = 'election_replica'..new_leader_nr
-    leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
+    local new_leader_nr = get_leader(nrs)
+    local new_leader = 'election_replica'..new_leader_nr
+    local leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
     c:eval('box.cfg{replication_synchro_timeout=1000}')
     c.space._schema:replace{'smth'}
-    c.space.test:get{i}
+    if not test_run:wait_cond(function() return c.space.test ~= nil
+            and c.space.test:get{i} ~= nil end) then
+        log.error('error: hung on second call to c.space.test:get(' .. i .. ')')
+        return false
+    end
     test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
     nrs[old_leader_nr] = true
     old_leader_nr = new_leader_nr
     old_leader = new_leader
+    return true
 end;
  | ---
  | ...
@@ -107,8 +119,13 @@ test_run:cmd('setopt delimiter ""');
  | ---
  | - true
  | ...
+
+for i = 1,10 do if not run_iter(i) then break end end
+ | ---
+ | ...
+
 -- We're connected to some leader.
-#c.space.test:select{} == 10 or require('log').error(c.space.test:select{})
+#c.space.test:select{} == 10 or log.error(c.space.test:select{})
  | ---
  | - true
  | ...
diff --git a/test/replication/election_qsync_stress.test.lua b/test/replication/election_qsync_stress.test.lua
index 0ba15eef7..8b654d063 100644
--- a/test/replication/election_qsync_stress.test.lua
+++ b/test/replication/election_qsync_stress.test.lua
@@ -1,5 +1,6 @@
 test_run = require('test_run').new()
 netbox = require('net.box')
+log = require('log')
 
 --
 -- gh-1146: Leader election + Qsync
@@ -47,26 +48,38 @@ _ = c:eval('box.space.test:create_index("pk")')
 -- Insert some data to a synchronous space, then kill the leader before the
 -- confirmation is written. Check successful confirmation on the new leader.
 test_run:cmd('setopt delimiter ";"')
-for i = 1,10 do
+function run_iter(i)
     c:eval('box.cfg{replication_synchro_quorum=4, replication_synchro_timeout=1000}')
     c.space.test:insert({i}, {is_async=true})
-    test_run:wait_cond(function() return c.space.test:get{i} ~= nil end)
-    test_run:cmd('stop server '..old_leader)
+    if not test_run:wait_cond(function() return c.space.test ~= nil
+            and c.space.test:get{i} ~= nil end) then
+        log.error('error: hung on first call to c.space.test:get(' .. i .. ')')
+        return false
+    end
+    test_run:cmd('stop server '..old_leader..' with signal=KILL')
     nrs[old_leader_nr] = false
-    new_leader_nr = get_leader(nrs)
-    new_leader = 'election_replica'..new_leader_nr
-    leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
+    local new_leader_nr = get_leader(nrs)
+    local new_leader = 'election_replica'..new_leader_nr
+    local leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
     c:eval('box.cfg{replication_synchro_timeout=1000}')
     c.space._schema:replace{'smth'}
-    c.space.test:get{i}
+    if not test_run:wait_cond(function() return c.space.test ~= nil
+            and c.space.test:get{i} ~= nil end) then
+        log.error('error: hung on second call to c.space.test:get(' .. i .. ')')
+        return false
+    end
     test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
     nrs[old_leader_nr] = true
     old_leader_nr = new_leader_nr
     old_leader = new_leader
+    return true
 end;
 test_run:cmd('setopt delimiter ""');
+
+for i = 1,10 do if not run_iter(i) then break end end
+
 -- We're connected to some leader.
-#c.space.test:select{} == 10 or require('log').error(c.space.test:select{})
+#c.space.test:select{} == 10 or log.error(c.space.test:select{})
 
 test_run:drop_cluster(SERVERS)
diff --git a/test/replication/suite.ini b/test/replication/suite.ini
index 34ee32550..c456678c1 100644
--- a/test/replication/suite.ini
+++ b/test/replication/suite.ini
@@ -119,7 +119,7 @@ fragile = {
         },
         "election_qsync_stress.test.lua": {
             "issues": [ "gh-5395" ],
-            "checksums": [ "634bda94accdcdef7b1db3e14f28f445", "36bcdae426c18a60fd13025c09f197d0", "209c865525154a91435c63850f15eca0", "ca38ff2cdfa65b3defb26607b24454c6", "3b6c573d2aa06ee382d6f27b6a76657b", "bcf08e055cf3ccd9958af25d0fba29f8", "411e7462760bc3fc968b68b4b7267ea1", "37e671aea27e3396098f9d13c7fa7316" ]
+            "checksums": [ "0ace688f0a267c44c37a5f904fb5d6ce" ]
         },
         "gh-3711-misc-no-restart-on-same-configuration.test.lua": {
             "issues": [ "gh-5407" ],
-- 
2.25.1


