* [tarantool-patches] [PATCH v1] test: gc.test.lua needs to use default timeout
@ 2019-04-16 8:01 avtikhon
2019-04-19 7:20 ` [tarantool-patches] " Alexander Turenko
0 siblings, 1 reply; 3+ messages in thread
From: avtikhon @ 2019-04-16 8:01 UTC (permalink / raw)
To: Alexander Turenko; +Cc: avtikhon, tarantool-patches
The gc.test.lua test is cleaned up to use the default 60-second timeout
of the wait_cond routine instead of a local 10-second one, because it
doesn't need its own special value. Also, diagnostic messages are
added for the cases when the wait_* routines fail.
[029] --- replication/gc.result Mon Apr 15 14:58:09 2019
[029] +++ replication/gc.reject Tue Apr 16 09:17:47 2019
[029] @@ -290,7 +290,12 @@
[029] ...
[029] wait_xlog(1) or fio.listdir('./master')
[029] ---
[048] replication/gc.test.lua vinyl [ fail ]
[048]
[048] Test failed! Result content mismatch:
[029] -- true
[029] +- - 00000000000000000305.vylog
[029] + - 00000000000000000305.xlog
[029] + - '512'
[029] + - 00000000000000000310.xlog
[029] + - 00000000000000000310.vylog
[029] + - 00000000000000000310.snap
[029] ...
[029] -- Stop the replica.
[029] test_run:cmd("stop server replica")
[029] @@ -326,7 +331,13 @@
[029] ...
[029] wait_xlog(2) or fio.listdir('./master')
[029] ---
[029] -- true
[029] +- - 00000000000000000305.xlog
[029] + - 00000000000000000316.xlog
[029] + - 00000000000000000316.vylog
[029] + - '512'
[029] + - 00000000000000000310.xlog
[029] + - 00000000000000000317.vylog
[029] + - 00000000000000000317.snap
[029] ...
[029] -- The xlog should only be deleted after the replica
[029] -- is unregistered.
[029]
Close #4162
---
Github: https://github.com/tarantool/tarantool/tree/avtikhon/gh-4162-gc-default-timeout
Issue: https://github.com/tarantool/tarantool/issues/4162
test/replication/gc.result | 42 ++++++++++++++++++------------------
test/replication/gc.test.lua | 42 ++++++++++++++++++------------------
2 files changed, 42 insertions(+), 42 deletions(-)
diff --git a/test/replication/gc.result b/test/replication/gc.result
index 65785f47b..010418db8 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -34,14 +34,14 @@ test_run:cmd("setopt delimiter ';'")
function wait_gc(n)
return test_run:wait_cond(function()
return #box.info.gc().checkpoints == n
- end, 10)
+ end) or box.info.gc()
end;
---
...
-function wait_xlog(n, timeout)
+function wait_xlog(n)
return test_run:wait_cond(function()
return #fio.glob('./master/*.xlog') == n
- end, 10)
+ end) or fio.glob('./master/*.xlog')
end;
---
...
@@ -117,7 +117,7 @@ test_run:cmd("switch replica")
---
- true
...
-test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count()
---
- true
...
@@ -131,11 +131,11 @@ test_run:cmd("switch default")
...
-- Check that garbage collection removed the snapshot once
-- the replica released the corresponding checkpoint.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+wait_xlog(1)
---
- true
...
@@ -168,11 +168,11 @@ box.snapshot()
---
- ok
...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
---
- true
...
@@ -187,7 +187,7 @@ test_run:cmd("switch replica")
---
- true
...
-test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count()
---
- true
...
@@ -201,11 +201,11 @@ test_run:cmd("switch default")
...
-- Now garbage collection should resume and delete files left
-- from the old checkpoint.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(0) or fio.listdir('./master')
+wait_xlog(0)
---
- true
...
@@ -244,11 +244,11 @@ fiber.sleep(0.1) -- wait for master to relay data
-- Garbage collection must not delete the old xlog file
-- because it is still needed by the replica, but remove
-- the old snapshot.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
---
- true
...
@@ -271,7 +271,7 @@ test_run:cmd("switch replica")
---
- true
...
-test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count()
---
- true
...
@@ -284,11 +284,11 @@ test_run:cmd("switch default")
- true
...
-- Now it's safe to drop the old xlog.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(1) or fio.listdir('./master')
+wait_xlog(1)
---
- true
...
@@ -320,11 +320,11 @@ box.snapshot()
---
- ok
...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
---
- true
...
@@ -333,11 +333,11 @@ wait_xlog(2) or fio.listdir('./master')
test_run:cleanup_cluster()
---
...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(1) or fio.listdir('./master')
+wait_xlog(1)
---
- true
...
@@ -438,7 +438,7 @@ box.snapshot()
---
- ok
...
-wait_xlog(0, 10) or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
---
- true
...
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 890fe29ae..017fca9de 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -15,12 +15,12 @@ test_run:cmd("setopt delimiter ';'")
function wait_gc(n)
return test_run:wait_cond(function()
return #box.info.gc().checkpoints == n
- end, 10)
+ end) or box.info.gc()
end;
-function wait_xlog(n, timeout)
+function wait_xlog(n)
return test_run:wait_cond(function()
return #fio.glob('./master/*.xlog') == n
- end, 10)
+ end) or fio.glob('./master/*.xlog')
end;
test_run:cmd("setopt delimiter ''");
@@ -63,14 +63,14 @@ test_run:cmd("start server replica")
-- bootstrapped from, the replica should still receive all
-- data from the master. Check it.
test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count()
box.space.test:count()
test_run:cmd("switch default")
-- Check that garbage collection removed the snapshot once
-- the replica released the corresponding checkpoint.
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+wait_gc(1)
+wait_xlog(1)
-- we test garbage collection.
box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
@@ -86,8 +86,8 @@ box.snapshot()
-- Invoke garbage collection. Check that it doesn't remove
-- xlogs needed by the replica.
box.snapshot()
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
-- Resume replication so that the replica catches
-- up quickly.
@@ -95,14 +95,14 @@ box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
-- Check that the replica received all data from the master.
test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count()
box.space.test:count()
test_run:cmd("switch default")
-- Now garbage collection should resume and delete files left
-- from the old checkpoint.
-wait_gc(1) or box.info.gc()
-wait_xlog(0) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(0)
--
-- Check that the master doesn't delete xlog files sent to the
-- replica until it receives a confirmation that the data has
@@ -120,8 +120,8 @@ fiber.sleep(0.1) -- wait for master to relay data
-- Garbage collection must not delete the old xlog file
-- because it is still needed by the replica, but remove
-- the old snapshot.
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
test_run:cmd("switch replica")
-- Unblock the replica and break replication.
box.error.injection.set("ERRINJ_WAL_DELAY", false)
@@ -130,12 +130,12 @@ box.cfg{replication = {}}
test_run:cmd("restart server replica")
-- Wait for the replica to catch up.
test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count()
box.space.test:count()
test_run:cmd("switch default")
-- Now it's safe to drop the old xlog.
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(1)
-- Stop the replica.
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
@@ -149,14 +149,14 @@ _ = s:auto_increment{}
box.snapshot()
_ = s:auto_increment{}
box.snapshot()
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
-- The xlog should only be deleted after the replica
-- is unregistered.
test_run:cleanup_cluster()
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(1)
--
-- Test that concurrent invocation of the garbage collector works fine.
--
@@ -201,7 +201,7 @@ wait_xlog(3) or fio.listdir('./master')
-- all xlog files are removed.
test_run:cleanup_cluster()
box.snapshot()
-wait_xlog(0, 10) or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
-- Restore the config.
box.cfg{replication = {}}
--
2.17.1
^ permalink raw reply [flat|nested] 3+ messages in thread
* [tarantool-patches] Re: [PATCH v1] test: gc.test.lua needs to use default timeout
2019-04-16 8:01 [tarantool-patches] [PATCH v1] test: gc.test.lua needs to use default timeout avtikhon
@ 2019-04-19 7:20 ` Alexander Turenko
0 siblings, 0 replies; 3+ messages in thread
From: Alexander Turenko @ 2019-04-19 7:20 UTC (permalink / raw)
To: avtikhon; +Cc: tarantool-patches
Hi,
10 seconds should be enough for several context switches, several small
writes and reads via a socket, and deleting a file from a filesystem.
Even when the number of processes is several times larger than the CPU
count, tarantool should obtain enough time quanta / memory bandwidth /
disk IOPS from the schedulers.
Intuitively, I think that 10 seconds is already a very long time for
that, so I looked into the case more deeply.
The test case that fails is the following (it is the test code from the
current master with markers):
--
-- Check that the master doesn't delete xlog files sent to the
-- replica until it receives a confirmation that the data has
-- been applied (gh-2825).
--
test_run:cmd("switch replica")
-- Prevent the replica from applying any rows.
box.error.injection.set("ERRINJ_WAL_DELAY", true)
test_run:cmd("switch default")
-- Generate some data on the master.
for i = 1, 5 do s:auto_increment{} end
box.snapshot() -- rotate xlog
for i = 1, 5 do s:auto_increment{} end
fiber.sleep(0.1) -- wait for master to relay data
-- Garbage collection must not delete the old xlog file
-- because it is still needed by the replica, but remove
-- the old snapshot.
wait_gc(1) or box.info.gc()
wait_xlog(2) or fio.listdir('./master')
test_run:cmd("switch replica")
-- Unblock the replica and break replication.
box.error.injection.set("ERRINJ_WAL_DELAY", false)
<mark 1>
box.cfg{replication = {}}
-- Restart the replica to reestablish replication.
test_run:cmd("restart server replica")
-- Wait for the replica to catch up.
test_run:cmd("switch replica")
test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
box.space.test:count()
test_run:cmd("switch default")
-- Now it's safe to drop the old xlog.
wait_gc(1) or box.info.gc()
wait_xlog(1) or fio.listdir('./master')
-- Stop the replica.
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
Actually we do two checks here: the first one that we have 1 snapshot and
2 xlogs, and the second one that we have 1 snapshot and 1 xlog (the
first of the two xlogs is removed after restarting the replica).
Logs show that in the successful case (when the test passes) the master
removes the first xlog before the server restart. So the second xlog
count check verifies virtually nothing.
The reason for that is that the replica sends ACKs to the master at
<mark 1>, when we unset ERRINJ_WAL_DELAY and before we drop replication
on the next line. The master sees that the replica has read all data
from the first xlog and removes it.
I guess that here we imitate the situation when a replica is a bit behind
a master (it hasn't received the most recent data yet) and crashes. Then
we bring it back up and everything should be fine: the replica receives
all data, and the master removes the unneeded xlogs.
So I propose to fix the test case:
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 890fe29ae..e0bd00fac 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -122,12 +122,15 @@ fiber.sleep(0.1) -- wait for master to relay data
-- the old snapshot.
wait_gc(1) or box.info.gc()
wait_xlog(2) or fio.listdir('./master')
-test_run:cmd("switch replica")
--- Unblock the replica and break replication.
-box.error.injection.set("ERRINJ_WAL_DELAY", false)
-box.cfg{replication = {}}
--- Restart the replica to reestablish replication.
-test_run:cmd("restart server replica")
+-- Imitate the replica crash and, then, wake up.
+-- Just 'stop server replica' (SIGTERM) is not sufficient to stop
+-- a tarantool instance when ERRINJ_WAL_DELAY is set, because
+-- "tarantool" thread wait for paused "wal" thread infinitely.
+-- But 'stop server replica' is needed to remove the intance from
+-- test-run records.
+test_run:eval('replica', [[ffi = require('ffi') ffi.cdef([=[ int getpid(void); int kill(int, int); ]=]) ffi.C.kill(ffi.C.getpid(), 9)]])
+test_run:cmd("stop server replica")
+test_run:cmd("start server replica")
-- Wait for the replica to catch up.
test_run:cmd("switch replica")
test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
This is just to illustrate the idea. I think that 'kill -9' within the
test looks ugly, and it would be better to support a 'signal=KILL'
argument for the 'stop server replica' test-run command.
Surprisingly, this change fixes the flaky failure (or almost fixes it, see below).
However, once I observed the following hang (it seems to be rare, though):
No output during 120 seconds. Will abort after 120 seconds without output. List of workers not reporting the status:
- 010_replication [replication/gc.test.lua, memtx] at var/010_replication/gc.result:438
Test hung! Result content mismatch:
--- replication/gc.result Fri Apr 19 06:27:55 2019
+++ var/010_replication/gc.result Fri Apr 19 06:44:12 2019
@@ -436,186 +436,3 @@
...
-- Stop the replica and write a few WALs.
test_run:cmd("stop server replica")
I guess that the main thread ("tarantool") waits for some other thread,
maybe "wal", which is stuck for an unknown reason, but I didn't check.
(Issue #4127 describes the same behaviour, so it may be
related.)
So I see the following things that need to be done here:
* Support a 'stop server replica with signal=KILL' command in test-run.
* Use it in the test case to fix the current flaky failures (also don't
forget to ask the test case author about this change).
* Reduce the case to reproduce the current flaky failures and provide a
reproducer (file an issue).
* Investigate how rare hangs like the one above are and check whether
they are connected to any known problem (or, maybe, it is a test problem?).
Alexander, I hope you'll pick up these activities.
WBR, Alexander Turenko.
On Tue, Apr 16, 2019 at 11:01:22AM +0300, avtikhon wrote:
> The gc.test.lua test is cleaned up to use the default 60-second timeout
> of the wait_cond routine instead of a local 10-second one, because it
> doesn't need its own special value. Also, diagnostic messages are
> added for the cases when the wait_* routines fail.
>
> [029] --- replication/gc.result Mon Apr 15 14:58:09 2019
> [029] +++ replication/gc.reject Tue Apr 16 09:17:47 2019
> [029] @@ -290,7 +290,12 @@
> [029] ...
> [029] wait_xlog(1) or fio.listdir('./master')
> [029] ---
> [048] replication/gc.test.lua vinyl [ fail ]
> [048]
> [048] Test failed! Result content mismatch:
> [029] -- true
> [029] +- - 00000000000000000305.vylog
> [029] + - 00000000000000000305.xlog
> [029] + - '512'
> [029] + - 00000000000000000310.xlog
> [029] + - 00000000000000000310.vylog
> [029] + - 00000000000000000310.snap
> [029] ...
> [029] -- Stop the replica.
> [029] test_run:cmd("stop server replica")
> [029] @@ -326,7 +331,13 @@
> [029] ...
> [029] wait_xlog(2) or fio.listdir('./master')
> [029] ---
> [029] -- true
> [029] +- - 00000000000000000305.xlog
> [029] + - 00000000000000000316.xlog
> [029] + - 00000000000000000316.vylog
> [029] + - '512'
> [029] + - 00000000000000000310.xlog
> [029] + - 00000000000000000317.vylog
> [029] + - 00000000000000000317.snap
> [029] ...
> [029] -- The xlog should only be deleted after the replica
> [029] -- is unregistered.
> [029]
>
> Close #4162
> ---
>
> Github: https://github.com/tarantool/tarantool/tree/avtikhon/gh-4162-gc-default-timeout
> Issue: https://github.com/tarantool/tarantool/issues/4162
^ permalink raw reply [flat|nested] 3+ messages in thread
* [tarantool-patches] [PATCH v1] test: gc.test.lua needs to use default timeout
@ 2019-04-16 7:08 avtikhon
0 siblings, 0 replies; 3+ messages in thread
From: avtikhon @ 2019-04-16 7:08 UTC (permalink / raw)
To: Alexander Turenko; +Cc: avtikhon, tarantool-patches
The gc.test.lua test is cleaned up to use the default 60-second timeout
of the wait_cond routine instead of a local 10-second one, because it
doesn't need its own special value. Also, diagnostic messages are
added for the cases when the wait_* routines fail.
[029] --- replication/gc.result Mon Apr 15 14:58:09 2019
[029] +++ replication/gc.reject Tue Apr 16 09:17:47 2019
[029] @@ -290,7 +290,12 @@
[029] ...
[029] wait_xlog(1) or fio.listdir('./master')
[029] ---
[048] replication/gc.test.lua vinyl [ fail ]
[048]
[048] Test failed! Result content mismatch:
[029] -- true
[029] +- - 00000000000000000305.vylog
[029] + - 00000000000000000305.xlog
[029] + - '512'
[029] + - 00000000000000000310.xlog
[029] + - 00000000000000000310.vylog
[029] + - 00000000000000000310.snap
[029] ...
[029] -- Stop the replica.
[029] test_run:cmd("stop server replica")
[029] @@ -326,7 +331,13 @@
[029] ...
[029] wait_xlog(2) or fio.listdir('./master')
[029] ---
[029] -- true
[029] +- - 00000000000000000305.xlog
[029] + - 00000000000000000316.xlog
[029] + - 00000000000000000316.vylog
[029] + - '512'
[029] + - 00000000000000000310.xlog
[029] + - 00000000000000000317.vylog
[029] + - 00000000000000000317.snap
[029] ...
[029] -- The xlog should only be deleted after the replica
[029] -- is unregistered.
[029]
Close #4162
---
test/replication/gc.result | 42 ++++++++++++++++++------------------
test/replication/gc.test.lua | 42 ++++++++++++++++++------------------
2 files changed, 42 insertions(+), 42 deletions(-)
diff --git a/test/replication/gc.result b/test/replication/gc.result
index 65785f47b..010418db8 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -34,14 +34,14 @@ test_run:cmd("setopt delimiter ';'")
function wait_gc(n)
return test_run:wait_cond(function()
return #box.info.gc().checkpoints == n
- end, 10)
+ end) or box.info.gc()
end;
---
...
-function wait_xlog(n, timeout)
+function wait_xlog(n)
return test_run:wait_cond(function()
return #fio.glob('./master/*.xlog') == n
- end, 10)
+ end) or fio.glob('./master/*.xlog')
end;
---
...
@@ -117,7 +117,7 @@ test_run:cmd("switch replica")
---
- true
...
-test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count()
---
- true
...
@@ -131,11 +131,11 @@ test_run:cmd("switch default")
...
-- Check that garbage collection removed the snapshot once
-- the replica released the corresponding checkpoint.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+wait_xlog(1)
---
- true
...
@@ -168,11 +168,11 @@ box.snapshot()
---
- ok
...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
---
- true
...
@@ -187,7 +187,7 @@ test_run:cmd("switch replica")
---
- true
...
-test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count()
---
- true
...
@@ -201,11 +201,11 @@ test_run:cmd("switch default")
...
-- Now garbage collection should resume and delete files left
-- from the old checkpoint.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(0) or fio.listdir('./master')
+wait_xlog(0)
---
- true
...
@@ -244,11 +244,11 @@ fiber.sleep(0.1) -- wait for master to relay data
-- Garbage collection must not delete the old xlog file
-- because it is still needed by the replica, but remove
-- the old snapshot.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
---
- true
...
@@ -271,7 +271,7 @@ test_run:cmd("switch replica")
---
- true
...
-test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count()
---
- true
...
@@ -284,11 +284,11 @@ test_run:cmd("switch default")
- true
...
-- Now it's safe to drop the old xlog.
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(1) or fio.listdir('./master')
+wait_xlog(1)
---
- true
...
@@ -320,11 +320,11 @@ box.snapshot()
---
- ok
...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(2) or fio.listdir('./master')
+wait_xlog(2)
---
- true
...
@@ -333,11 +333,11 @@ wait_xlog(2) or fio.listdir('./master')
test_run:cleanup_cluster()
---
...
-wait_gc(1) or box.info.gc()
+wait_gc(1)
---
- true
...
-wait_xlog(1) or fio.listdir('./master')
+wait_xlog(1)
---
- true
...
@@ -438,7 +438,7 @@ box.snapshot()
---
- ok
...
-wait_xlog(0, 10) or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
---
- true
...
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 890fe29ae..017fca9de 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -15,12 +15,12 @@ test_run:cmd("setopt delimiter ';'")
function wait_gc(n)
return test_run:wait_cond(function()
return #box.info.gc().checkpoints == n
- end, 10)
+ end) or box.info.gc()
end;
-function wait_xlog(n, timeout)
+function wait_xlog(n)
return test_run:wait_cond(function()
return #fio.glob('./master/*.xlog') == n
- end, 10)
+ end) or fio.glob('./master/*.xlog')
end;
test_run:cmd("setopt delimiter ''");
@@ -63,14 +63,14 @@ test_run:cmd("start server replica")
-- bootstrapped from, the replica should still receive all
-- data from the master. Check it.
test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 200 end) or box.space.test:count()
box.space.test:count()
test_run:cmd("switch default")
-- Check that garbage collection removed the snapshot once
-- the replica released the corresponding checkpoint.
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+wait_gc(1)
+wait_xlog(1)
-- we test garbage collection.
box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
@@ -86,8 +86,8 @@ box.snapshot()
-- Invoke garbage collection. Check that it doesn't remove
-- xlogs needed by the replica.
box.snapshot()
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
-- Resume replication so that the replica catches
-- up quickly.
@@ -95,14 +95,14 @@ box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
-- Check that the replica received all data from the master.
test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 300 end) or box.space.test:count()
box.space.test:count()
test_run:cmd("switch default")
-- Now garbage collection should resume and delete files left
-- from the old checkpoint.
-wait_gc(1) or box.info.gc()
-wait_xlog(0) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(0)
--
-- Check that the master doesn't delete xlog files sent to the
-- replica until it receives a confirmation that the data has
@@ -120,8 +120,8 @@ fiber.sleep(0.1) -- wait for master to relay data
-- Garbage collection must not delete the old xlog file
-- because it is still needed by the replica, but remove
-- the old snapshot.
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
test_run:cmd("switch replica")
-- Unblock the replica and break replication.
box.error.injection.set("ERRINJ_WAL_DELAY", false)
@@ -130,12 +130,12 @@ box.cfg{replication = {}}
test_run:cmd("restart server replica")
-- Wait for the replica to catch up.
test_run:cmd("switch replica")
-test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
+test_run:wait_cond(function() return box.space.test:count() == 310 end) or box.space.test:count()
box.space.test:count()
test_run:cmd("switch default")
-- Now it's safe to drop the old xlog.
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(1)
-- Stop the replica.
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
@@ -149,14 +149,14 @@ _ = s:auto_increment{}
box.snapshot()
_ = s:auto_increment{}
box.snapshot()
-wait_gc(1) or box.info.gc()
-wait_xlog(2) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(2)
-- The xlog should only be deleted after the replica
-- is unregistered.
test_run:cleanup_cluster()
-wait_gc(1) or box.info.gc()
-wait_xlog(1) or fio.listdir('./master')
+wait_gc(1)
+wait_xlog(1)
--
-- Test that concurrent invocation of the garbage collector works fine.
--
@@ -201,7 +201,7 @@ wait_xlog(3) or fio.listdir('./master')
-- all xlog files are removed.
test_run:cleanup_cluster()
box.snapshot()
-wait_xlog(0, 10) or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
-- Restore the config.
box.cfg{replication = {}}
--
2.17.1
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2019-04-19 7:20 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-16 8:01 [tarantool-patches] [PATCH v1] test: gc.test.lua needs to use default timeout avtikhon
2019-04-19 7:20 ` [tarantool-patches] " Alexander Turenko
-- strict thread matches above, loose matches on Subject: below --
2019-04-16 7:08 [tarantool-patches] " avtikhon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox