Tarantool development patches archive
 help / color / mirror / Atom feed
* [tarantool-patches] [PATCH v1] test: gc.test.lua needs to use default timeout
@ 2019-04-16  8:01 avtikhon
  2019-04-19  7:20 ` [tarantool-patches] " Alexander Turenko
  0 siblings, 1 reply; 3+ messages in thread
From: avtikhon @ 2019-04-16  8:01 UTC (permalink / raw)
  To: Alexander Turenko; +Cc: avtikhon, tarantool-patches

gc.test.lua test cleaned up to use default 60 secs timeout for
wait_cond routine instead of local 10 secs, because it doesn't
need to have its own special value. Also the diagnostic messages
added on wait_* routines fails.

[029] --- replication/gc.result Mon Apr 15 14:58:09 2019
[029] +++ replication/gc.reject Tue Apr 16 09:17:47 2019
[029] @@ -290,7 +290,12 @@
[029] ...
[029] wait_xlog(1) or fio.listdir('./master')
[029] ---
[048] replication/gc.test.lua vinyl [ fail ]
[048]
[048] Test failed! Result content mismatch:
[029] -- true
[029] +- - 00000000000000000305.vylog
[029] + - 00000000000000000305.xlog
[029] + - '512'
[029] + - 00000000000000000310.xlog
[029] + - 00000000000000000310.vylog
[029] + - 00000000000000000310.snap
[029] ...
[029] -- Stop the replica.
[029] test_run:cmd("stop server replica")
[029] @@ -326,7 +331,13 @@
[029] ...
[029] wait_xlog(2) or fio.listdir('./master')
[029] ---
[029] -- true
[029] +- - 00000000000000000305.xlog
[029] + - 00000000000000000316.xlog
[029] + - 00000000000000000316.vylog
[029] + - '512'
[029] + - 00000000000000000310.xlog
[029] + - 00000000000000000317.vylog
[029] + - 00000000000000000317.snap
[029] ...
[029] -- The xlog should only be deleted after the replica
[029] -- is unregistered.
[029]

Close #4162
---

Github: https://github.com/tarantool/tarantool/tree/avtikhon/gh-4162-gc-default-timeout
Issue: https://github.com/tarantool/tarantool/issues/4162

 test/replication/gc.result   | 42 ++++++++++++++++++------------------
 test/replication/gc.test.lua | 42 ++++++++++++++++++------------------
 2 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/test/replication/gc.result b/test/replication/gc.result
index 65785f47b..010418db8 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -34,14 +34,14 @@ test_run:cmd("setopt delimiter ';'")
 function wait_gc(n)
     return test_run:wait_cond(function()
         return #box.info.gc().checkpoints == n
-    end, 10)
+    end) or box.info.gc()
 end;
 ---
 ...
-function wait_xlog(n, timeout)
+function wait_xlog(n)
     return test_run:wait_cond(function()
         return #fio.glob('./master/*.xlog') == n
-    end, 10)
+    end) or fio.glob('./master/*.xlog')
 end;
 ---
 ...
@@ -117,7 +117,7 @@ test_run:cmd("switch replica")
 ---
 - true
 ...
-test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count()
 ---
 - true
 ...
@@ -131,11 +131,11 @@ test_run:cmd("switch default")
 ...
 -- Check that garbage collection removed the snapshot once
 -- the replica released the corresponding checkpoint.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+wait_xlog(1)
 ---
 - true
 ...
@@ -168,11 +168,11 @@ box.snapshot()
 ---
 - ok
 ...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
 ---
 - true
 ...
@@ -187,7 +187,7 @@ test_run:cmd("switch replica")
 ---
 - true
 ...
-test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count()
 ---
 - true
 ...
@@ -201,11 +201,11 @@ test_run:cmd("switch default")
 ...
 -- Now garbage collection should resume and delete files left
 -- from the old checkpoint.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(0) or fio.listdir('./master')
+wait_xlog(0)
 ---
 - true
 ...
@@ -244,11 +244,11 @@ fiber.sleep(0.1) -- wait for master to relay data
 -- Garbage collection must not delete the old xlog file
 -- because it is still needed by the replica, but remove
 -- the old snapshot.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
 ---
 - true
 ...
@@ -271,7 +271,7 @@ test_run:cmd("switch replica")
 ---
 - true
 ...
-test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count()
 ---
 - true
 ...
@@ -284,11 +284,11 @@ test_run:cmd("switch default")
 - true
 ...
 -- Now it's safe to drop the old xlog.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(1) or fio.listdir('./master')
+wait_xlog(1)
 ---
 - true
 ...
@@ -320,11 +320,11 @@ box.snapshot()
 ---
 - ok
 ...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
 ---
 - true
 ...
@@ -333,11 +333,11 @@ wait_xlog(2) or fio.listdir('./master')
 test_run:cleanup_cluster()
 ---
 ...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(1) or fio.listdir('./master')
+wait_xlog(1)
 ---
 - true
 ...
@@ -438,7 +438,7 @@ box.snapshot()
 ---
 - ok
 ...
-wait_xlog(0, 10) or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
 ---
 - true
 ...
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 890fe29ae..017fca9de 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -15,12 +15,12 @@ test_run:cmd("setopt delimiter ';'")
 function wait_gc(n)
     return test_run:wait_cond(function()
         return #box.info.gc().checkpoints == n
-    end, 10)
+    end) or box.info.gc()
 end;
-function wait_xlog(n, timeout)
+function wait_xlog(n)
     return test_run:wait_cond(function()
         return #fio.glob('./master/*.xlog') == n
-    end, 10)
+    end) or fio.glob('./master/*.xlog')
 end;
 test_run:cmd("setopt delimiter ''");
 
@@ -63,14 +63,14 @@ test_run:cmd("start server replica")
 -- bootstrapped from, the replica should still receive all
 -- data from the master. Check it.
 test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count()
 box.space.test:count()
 test_run:cmd("switch default")
 
 -- Check that garbage collection removed the snapshot once
 -- the replica released the corresponding checkpoint.
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+wait_gc(1)
+wait_xlog(1)
 -- we test garbage collection.
 box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 
@@ -86,8 +86,8 @@ box.snapshot()
 -- Invoke garbage collection. Check that it doesn't remove
 -- xlogs needed by the replica.
 box.snapshot()
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
 
 -- Resume replication so that the replica catches
 -- up quickly.
@@ -95,14 +95,14 @@ box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 
 -- Check that the replica received all data from the master.
 test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count()
 box.space.test:count()
 test_run:cmd("switch default")
 
 -- Now garbage collection should resume and delete files left
 -- from the old checkpoint.
-wait_gc(1) or box.info.gc()
-wait_xlog(0) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(0)
 --
 -- Check that the master doesn't delete xlog files sent to the
 -- replica until it receives a confirmation that the data has
@@ -120,8 +120,8 @@ fiber.sleep(0.1) -- wait for master to relay data
 -- Garbage collection must not delete the old xlog file
 -- because it is still needed by the replica, but remove
 -- the old snapshot.
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
 test_run:cmd("switch replica")
 -- Unblock the replica and break replication.
 box.error.injection.set("ERRINJ_WAL_DELAY", false)
@@ -130,12 +130,12 @@ box.cfg{replication = {}}
 test_run:cmd("restart server replica")
 -- Wait for the replica to catch up.
 test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count()
 box.space.test:count()
 test_run:cmd("switch default")
 -- Now it's safe to drop the old xlog.
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(1)
 -- Stop the replica.
 test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
@@ -149,14 +149,14 @@ _ = s:auto_increment{}
 box.snapshot()
 _ = s:auto_increment{}
 box.snapshot()
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
 
 -- The xlog should only be deleted after the replica
 -- is unregistered.
 test_run:cleanup_cluster()
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(1)
 --
 -- Test that concurrent invocation of the garbage collector works fine.
 --
@@ -201,7 +201,7 @@ wait_xlog(3) or fio.listdir('./master')
 -- all xlog files are removed.
 test_run:cleanup_cluster()
 box.snapshot()
-wait_xlog(0, 10) or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
 
 -- Restore the config.
 box.cfg{replication = {}}
-- 
2.17.1

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [tarantool-patches] Re: [PATCH v1] test: gc.test.lua needs to use default timeout
  2019-04-16  8:01 [tarantool-patches] [PATCH v1] test: gc.test.lua needs to use default timeout avtikhon
@ 2019-04-19  7:20 ` Alexander Turenko
  0 siblings, 0 replies; 3+ messages in thread
From: Alexander Turenko @ 2019-04-19  7:20 UTC (permalink / raw)
  To: avtikhon; +Cc: tarantool-patches

Hi,

10 seconds should be enough for several context switches, several small
writes and reads via a socket and to delete a file from a filesystem.
Even with amount of processes that several times larger then CPUs count
tarantool should obtain enough time quanta / memory bandwith / disk iops
from schedulers.

Intuitively I think that 10 seconds is already very large time for that
and so I looked into the case deeper.

The test case that fails is the following (it is the test code from the
current master with markers):

--
-- Check that the master doesn't delete xlog files sent to the
-- replica until it receives a confirmation that the data has
-- been applied (gh-2825).
--
test_run:cmd("switch replica")
-- Prevent the replica from applying any rows.
box.error.injection.set("ERRINJ_WAL_DELAY", true)
test_run:cmd("switch default")
-- Generate some data on the master.
for i = 1, 5 do s:auto_increment{} end
box.snapshot() -- rotate xlog
for i = 1, 5 do s:auto_increment{} end
fiber.sleep(0.1) -- wait for master to relay data
-- Garbage collection must not delete the old xlog file
-- because it is still needed by the replica, but remove
-- the old snapshot.
wait_gc(1) or box.info.gc()
wait_xlog(2) or fio.listdir('./master')
test_run:cmd("switch replica")
-- Unblock the replica and break replication.
box.error.injection.set("ERRINJ_WAL_DELAY", false)
<mark 1>
box.cfg{replication = {}}
-- Restart the replica to reestablish replication.
test_run:cmd("restart server replica")
-- Wait for the replica to catch up.
test_run:cmd("switch replica")
test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
box.space.test:count()
test_run:cmd("switch default")
-- Now it's safe to drop the old xlog.
wait_gc(1) or box.info.gc()
wait_xlog(1) or fio.listdir('./master')
-- Stop the replica.
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")

Actually we do two checks here: the first one the we have 1 snapshot and
2 xlogs and the second one that we have 1 snapshot and 1 xlogs (the
first xlog from the two ones is removed after restarting the replica).

Logs show that in the successful case (when the test passes) the master
removes a first xlog before server restart. So the second xlog count
check checks virtually nothing.

The reason of that is that the replica sends ACKs to the master at <mark
1> when we unset ERRINJ_WAL_DELAY and before we drop replication at the
next line. The master see that the replica reads all data from the
first xlog and removes it.

I guess that here we imitate the situation when a replica is bit before
a master (doesn't receive most actual data yet) and crashed. Then we
wake it up and everything should be good: the replica receives all data,
the master removes unneeded xlogs.

So I propose to fix the test case:

diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 890fe29ae..e0bd00fac 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -122,12 +122,15 @@ fiber.sleep(0.1) -- wait for master to relay data
 -- the old snapshot.
 wait_gc(1) or box.info.gc()
 wait_xlog(2) or fio.listdir('./master')
-test_run:cmd("switch replica")
--- Unblock the replica and break replication.
-box.error.injection.set("ERRINJ_WAL_DELAY", false)
-box.cfg{replication = {}}
--- Restart the replica to reestablish replication.
-test_run:cmd("restart server replica")
+-- Imitate the replica crash and, then, wake up.
+-- Just 'stop server replica' (SIGTERM) is not sufficient to stop
+-- a tarantool instance when ERRINJ_WAL_DELAY is set, because
+-- "tarantool" thread wait for paused "wal" thread infinitely.
+-- But 'stop server replica' is needed to remove the intance from
+-- test-run records.
+test_run:eval('replica', [[ffi = require('ffi') ffi.cdef([=[ int getpid(void); int kill(int, int); ]=]) ffi.C.kill(ffi.C.getpid(), 9)]])
+test_run:cmd("stop server replica")
+test_run:cmd("start server replica")
 -- Wait for the replica to catch up.
 test_run:cmd("switch replica")
 test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)

It is just for idea. I think that 'kill -9' within the test looks ugly
and it is better to support 'signal=KILL' argument for 'stop server
replica' test-run command.

Surprisingly this change fixes the flaky fail (or almost fix, see below).

However once I observed the following hang (but it seems to be rare):

No output during 120 seconds. Will abort after 120 seconds without output. List of workers not reporting the status:
- 010_replication [replication/gc.test.lua, memtx] at var/010_replication/gc.result:438
Test hung! Result content mismatch:
--- replication/gc.result	Fri Apr 19 06:27:55 2019
+++ var/010_replication/gc.result	Fri Apr 19 06:44:12 2019
@@ -436,186 +436,3 @@
 ...
 -- Stop the replica and write a few WALs.
 test_run:cmd("stop server replica")

I guess that the main thread ("tarantool") waits for some other, maybe
"wal", which stucks because of unknown reason, but I didn't check.
(There is the issue #4127 that describes the same behaviour, so may be
related.)

So I see the following things need to be done here:

* Support 'stop server replica with signal=KILL' command in test-run.
* Use it in the test case to fix current flaky fails (also don't forget
  to ask a test case author about this change).
* Reduce the case to reproduce current flaky fails and provide a
  reproducer (file an issue).
* Elaborate how rare are hangs as above and check whether it connected
  somehow to some of known problem (or, maybe, it is a test problem?).

Alexander, hope you'll catch these activities.

WBR, Alexander Turenko.

On Tue, Apr 16, 2019 at 11:01:22AM +0300, avtikhon wrote:
> gc.test.lua test cleaned up to use default 60 secs timeout for
> wait_cond routine instead of local 10 secs, because it doesn't
> need to have its own special value. Also the diagnostic messages
> added on wait_* routines fails.
> 
> [029] --- replication/gc.result Mon Apr 15 14:58:09 2019
> [029] +++ replication/gc.reject Tue Apr 16 09:17:47 2019
> [029] @@ -290,7 +290,12 @@
> [029] ...
> [029] wait_xlog(1) or fio.listdir('./master')
> [029] ---
> [048] replication/gc.test.lua vinyl [ fail ]
> [048]
> [048] Test failed! Result content mismatch:
> [029] -- true
> [029] +- - 00000000000000000305.vylog
> [029] + - 00000000000000000305.xlog
> [029] + - '512'
> [029] + - 00000000000000000310.xlog
> [029] + - 00000000000000000310.vylog
> [029] + - 00000000000000000310.snap
> [029] ...
> [029] -- Stop the replica.
> [029] test_run:cmd("stop server replica")
> [029] @@ -326,7 +331,13 @@
> [029] ...
> [029] wait_xlog(2) or fio.listdir('./master')
> [029] ---
> [029] -- true
> [029] +- - 00000000000000000305.xlog
> [029] + - 00000000000000000316.xlog
> [029] + - 00000000000000000316.vylog
> [029] + - '512'
> [029] + - 00000000000000000310.xlog
> [029] + - 00000000000000000317.vylog
> [029] + - 00000000000000000317.snap
> [029] ...
> [029] -- The xlog should only be deleted after the replica
> [029] -- is unregistered.
> [029]
> 
> Close #4162
> ---
> 
> Github: https://github.com/tarantool/tarantool/tree/avtikhon/gh-4162-gc-default-timeout
> Issue: https://github.com/tarantool/tarantool/issues/4162

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [tarantool-patches] [PATCH v1] test: gc.test.lua needs to use default timeout
@ 2019-04-16  7:08 avtikhon
  0 siblings, 0 replies; 3+ messages in thread
From: avtikhon @ 2019-04-16  7:08 UTC (permalink / raw)
  To: Alexander Turenko; +Cc: avtikhon, tarantool-patches

gc.test.lua test cleaned up to use default 60 secs timeout for
wait_cond routine instead of local 10 secs, because it doesn't
need to have its own special value. Also the diagnostic messages
added on wait_* routines fails.

[029] --- replication/gc.result Mon Apr 15 14:58:09 2019
[029] +++ replication/gc.reject Tue Apr 16 09:17:47 2019
[029] @@ -290,7 +290,12 @@
[029] ...
[029] wait_xlog(1) or fio.listdir('./master')
[029] ---
[048] replication/gc.test.lua vinyl [ fail ]
[048]
[048] Test failed! Result content mismatch:
[029] -- true
[029] +- - 00000000000000000305.vylog
[029] + - 00000000000000000305.xlog
[029] + - '512'
[029] + - 00000000000000000310.xlog
[029] + - 00000000000000000310.vylog
[029] + - 00000000000000000310.snap
[029] ...
[029] -- Stop the replica.
[029] test_run:cmd("stop server replica")
[029] @@ -326,7 +331,13 @@
[029] ...
[029] wait_xlog(2) or fio.listdir('./master')
[029] ---
[029] -- true
[029] +- - 00000000000000000305.xlog
[029] + - 00000000000000000316.xlog
[029] + - 00000000000000000316.vylog
[029] + - '512'
[029] + - 00000000000000000310.xlog
[029] + - 00000000000000000317.vylog
[029] + - 00000000000000000317.snap
[029] ...
[029] -- The xlog should only be deleted after the replica
[029] -- is unregistered.
[029]

Close #4162
---
 test/replication/gc.result   | 42 ++++++++++++++++++------------------
 test/replication/gc.test.lua | 42 ++++++++++++++++++------------------
 2 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/test/replication/gc.result b/test/replication/gc.result
index 65785f47b..010418db8 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -34,14 +34,14 @@ test_run:cmd("setopt delimiter ';'")
 function wait_gc(n)
     return test_run:wait_cond(function()
         return #box.info.gc().checkpoints == n
-    end, 10)
+    end) or box.info.gc()
 end;
 ---
 ...
-function wait_xlog(n, timeout)
+function wait_xlog(n)
     return test_run:wait_cond(function()
         return #fio.glob('./master/*.xlog') == n
-    end, 10)
+    end) or fio.glob('./master/*.xlog')
 end;
 ---
 ...
@@ -117,7 +117,7 @@ test_run:cmd("switch replica")
 ---
 - true
 ...
-test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count()
 ---
 - true
 ...
@@ -131,11 +131,11 @@ test_run:cmd("switch default")
 ...
 -- Check that garbage collection removed the snapshot once
 -- the replica released the corresponding checkpoint.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+wait_xlog(1)
 ---
 - true
 ...
@@ -168,11 +168,11 @@ box.snapshot()
 ---
 - ok
 ...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
 ---
 - true
 ...
@@ -187,7 +187,7 @@ test_run:cmd("switch replica")
 ---
 - true
 ...
-test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count()
 ---
 - true
 ...
@@ -201,11 +201,11 @@ test_run:cmd("switch default")
 ...
 -- Now garbage collection should resume and delete files left
 -- from the old checkpoint.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(0) or fio.listdir('./master')
+wait_xlog(0)
 ---
 - true
 ...
@@ -244,11 +244,11 @@ fiber.sleep(0.1) -- wait for master to relay data
 -- Garbage collection must not delete the old xlog file
 -- because it is still needed by the replica, but remove
 -- the old snapshot.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
 ---
 - true
 ...
@@ -271,7 +271,7 @@ test_run:cmd("switch replica")
 ---
 - true
 ...
-test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count()
 ---
 - true
 ...
@@ -284,11 +284,11 @@ test_run:cmd("switch default")
 - true
 ...
 -- Now it's safe to drop the old xlog.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(1) or fio.listdir('./master')
+wait_xlog(1)
 ---
 - true
 ...
@@ -320,11 +320,11 @@ box.snapshot()
 ---
 - ok
 ...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
 ---
 - true
 ...
@@ -333,11 +333,11 @@ wait_xlog(2) or fio.listdir('./master')
 test_run:cleanup_cluster()
 ---
 ...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
 ---
 - true
 ...
-wait_xlog(1) or fio.listdir('./master')
+wait_xlog(1)
 ---
 - true
 ...
@@ -438,7 +438,7 @@ box.snapshot()
 ---
 - ok
 ...
-wait_xlog(0, 10) or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
 ---
 - true
 ...
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 890fe29ae..017fca9de 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -15,12 +15,12 @@ test_run:cmd("setopt delimiter ';'")
 function wait_gc(n)
     return test_run:wait_cond(function()
         return #box.info.gc().checkpoints == n
-    end, 10)
+    end) or box.info.gc()
 end;
-function wait_xlog(n, timeout)
+function wait_xlog(n)
     return test_run:wait_cond(function()
         return #fio.glob('./master/*.xlog') == n
-    end, 10)
+    end) or fio.glob('./master/*.xlog')
 end;
 test_run:cmd("setopt delimiter ''");
 
@@ -63,14 +63,14 @@ test_run:cmd("start server replica")
 -- bootstrapped from, the replica should still receive all
 -- data from the master. Check it.
 test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count()
 box.space.test:count()
 test_run:cmd("switch default")
 
 -- Check that garbage collection removed the snapshot once
 -- the replica released the corresponding checkpoint.
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+wait_gc(1)
+wait_xlog(1)
 -- we test garbage collection.
 box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 
@@ -86,8 +86,8 @@ box.snapshot()
 -- Invoke garbage collection. Check that it doesn't remove
 -- xlogs needed by the replica.
 box.snapshot()
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
 
 -- Resume replication so that the replica catches
 -- up quickly.
@@ -95,14 +95,14 @@ box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 
 -- Check that the replica received all data from the master.
 test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count()
 box.space.test:count()
 test_run:cmd("switch default")
 
 -- Now garbage collection should resume and delete files left
 -- from the old checkpoint.
-wait_gc(1) or box.info.gc()
-wait_xlog(0) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(0)
 --
 -- Check that the master doesn't delete xlog files sent to the
 -- replica until it receives a confirmation that the data has
@@ -120,8 +120,8 @@ fiber.sleep(0.1) -- wait for master to relay data
 -- Garbage collection must not delete the old xlog file
 -- because it is still needed by the replica, but remove
 -- the old snapshot.
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
 test_run:cmd("switch replica")
 -- Unblock the replica and break replication.
 box.error.injection.set("ERRINJ_WAL_DELAY", false)
@@ -130,12 +130,12 @@ box.cfg{replication = {}}
 test_run:cmd("restart server replica")
 -- Wait for the replica to catch up.
 test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count()
 box.space.test:count()
 test_run:cmd("switch default")
 -- Now it's safe to drop the old xlog.
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(1)
 -- Stop the replica.
 test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
@@ -149,14 +149,14 @@ _ = s:auto_increment{}
 box.snapshot()
 _ = s:auto_increment{}
 box.snapshot()
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
 
 -- The xlog should only be deleted after the replica
 -- is unregistered.
 test_run:cleanup_cluster()
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(1)
 --
 -- Test that concurrent invocation of the garbage collector works fine.
 --
@@ -201,7 +201,7 @@ wait_xlog(3) or fio.listdir('./master')
 -- all xlog files are removed.
 test_run:cleanup_cluster()
 box.snapshot()
-wait_xlog(0, 10) or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
 
 -- Restore the config.
 box.cfg{replication = {}}
-- 
2.17.1

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2019-04-19  7:20 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-16  8:01 [tarantool-patches] [PATCH v1] test: gc.test.lua needs to use default timeout avtikhon
2019-04-19  7:20 ` [tarantool-patches] " Alexander Turenko
  -- strict thread matches above, loose matches on Subject: below --
2019-04-16  7:08 [tarantool-patches] " avtikhon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox