[PATCH v3 2/5] test: errinj for pause relay_send

Thu Dec 6 16:38:49 MSK 2018

Instead of relying on a timeout, we just need to pause `relay_send`.
A timeout is unreliable because system load varies in parallel mode.
Add a new errinj that is checked in a loop: while it is set to `true`,
`relay_send` sleeps and does not proceed to the next statement.

To check read-only mode, we need to attempt a tuple modification.
Calling the `replace` method is enough for that, instead of calling
`delete` and then uselessly verifying with the `get` method that the
tuple was not deleted.

Also, look up the xlog files in a loop with a short sleep until the
file count matches the expected value.

Update box/errinj.result because a new errinj was added.

Part of #2436, #3232
---
 src/box/relay.cc                |  7 ++-
 src/errinj.h                    |  1 +
 test/box/errinj.result          | 34 ++++++------
 test/replication/catch.result   | 48 ++++++++---------
 test/replication/catch.test.lua | 41 +++++++--------
 test/replication/gc.result      | 92 +++++++++++++++++++++------------
 test/replication/gc.test.lua    | 83 +++++++++++++++++++----------
 7 files changed, 181 insertions(+), 125 deletions(-)

diff --git a/src/box/relay.cc b/src/box/relay.cc
index 0034f99a0..17daf76bf 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -635,12 +635,17 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
 static void
 relay_send(struct relay *relay, struct xrow_header *packet)
 {
+	struct errinj *inj = errinj(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL);
+	while (inj != NULL && inj->bparam) {
+		fiber_sleep(0.01);
+		inj = errinj(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL);
+	}
 	packet->sync = relay->sync;
 	relay->last_row_tm = ev_monotonic_now(loop());
 	coio_write_xrow(&relay->io, packet);
 	fiber_gc();
 
-	struct errinj *inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE);
+	inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE);
 	if (inj != NULL && inj->dparam > 0)
 		fiber_sleep(inj->dparam);
 }
diff --git a/src/errinj.h b/src/errinj.h
index aed570e79..39de63d19 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -95,6 +95,7 @@ struct errinj {
 	_(ERRINJ_VY_GC, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_VY_LOG_FLUSH, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_VY_LOG_FLUSH_DELAY, ERRINJ_BOOL, {.bparam = false}) \
+	_(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \
 	_(ERRINJ_RELAY_REPORT_INTERVAL, ERRINJ_DOUBLE, {.dparam = 0}) \
 	_(ERRINJ_RELAY_FINAL_SLEEP, ERRINJ_BOOL, {.bparam = false}) \
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 825bb3696..12303670e 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -30,7 +30,7 @@ errinj.info()
     state: false
   ERRINJ_WAL_DELAY:
     state: false
-  ERRINJ_XLOG_READ:
+  ERRINJ_VY_INDEX_DUMP:
     state: -1
   ERRINJ_WAL_WRITE_EOF:
     state: false
@@ -46,6 +46,8 @@ errinj.info()
     state: false
   ERRINJ_WAL_FALLOCATE:
     state: 0
+  ERRINJ_SNAP_COMMIT_DELAY:
+    state: false
   ERRINJ_TUPLE_ALLOC:
     state: false
   ERRINJ_VY_RUN_WRITE_DELAY:
@@ -54,25 +56,25 @@ errinj.info()
     state: false
   ERRINJ_RELAY_REPORT_INTERVAL:
     state: 0
-  ERRINJ_HTTP_RESPONSE_ADD_WAIT:
-    state: false
+  ERRINJ_WAL_BREAK_LSN:
+    state: -1
   ERRINJ_VY_READ_PAGE_TIMEOUT:
     state: 0
   ERRINJ_XLOG_META:
     state: false
-  ERRINJ_WAL_BREAK_LSN:
-    state: -1
   ERRINJ_RELAY_BREAK_LSN:
     state: -1
-  ERRINJ_WAL_WRITE_DISK:
-    state: false
   ERRINJ_VY_INDEX_FILE_RENAME:
     state: false
+  ERRINJ_WAL_WRITE_DISK:
+    state: false
   ERRINJ_VY_RUN_FILE_RENAME:
     state: false
+  ERRINJ_VY_LOG_FILE_RENAME:
+    state: false
   ERRINJ_VY_RUN_WRITE:
     state: false
-  ERRINJ_VY_LOG_FILE_RENAME:
+  ERRINJ_HTTP_RESPONSE_ADD_WAIT:
     state: false
   ERRINJ_VY_LOG_FLUSH_DELAY:
     state: false
@@ -86,18 +88,18 @@ errinj.info()
     state: false
   ERRINJ_WAL_ROTATE:
     state: false
-  ERRINJ_SNAP_COMMIT_DELAY:
-    state: false
   ERRINJ_LOG_ROTATE:
     state: false
+  ERRINJ_VY_POINT_ITER_WAIT:
+    state: false
   ERRINJ_RELAY_EXIT_DELAY:
     state: 0
   ERRINJ_IPROTO_TX_DELAY:
     state: false
-  ERRINJ_VY_POINT_ITER_WAIT:
-    state: false
   ERRINJ_BUILD_INDEX:
     state: -1
+  ERRINJ_XLOG_READ:
+    state: -1
   ERRINJ_XLOG_GARBAGE:
     state: false
   ERRINJ_TUPLE_FIELD:
@@ -106,14 +108,14 @@ errinj.info()
     state: false
   ERRINJ_TESTING:
     state: false
-  ERRINJ_RELAY_TIMEOUT:
-    state: 0
+  ERRINJ_RELAY_SEND_DELAY:
+    state: false
   ERRINJ_VY_SQUASH_TIMEOUT:
     state: 0
   ERRINJ_VY_LOG_FLUSH:
     state: false
-  ERRINJ_VY_INDEX_DUMP:
-    state: -1
+  ERRINJ_RELAY_TIMEOUT:
+    state: 0
 ...
 errinj.set("some-injection", true)
 ---
diff --git a/test/replication/catch.result b/test/replication/catch.result
index 663bdc758..e1b2995ec 100644
--- a/test/replication/catch.result
+++ b/test/replication/catch.result
@@ -35,7 +35,7 @@ test_run:cmd("switch default")
 s = box.schema.space.create('test', {engine = engine});
 ---
 ...
--- vinyl does not support hash index
+-- Vinyl does not support hash index.
 index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') })
 ---
 ...
@@ -57,14 +57,14 @@ test_run:cmd("stop server replica")
 ---
 - true
 ...
--- insert values on the master while replica is stopped and can't fetch them
-for i=1,100 do s:insert{i, 'this is test message12345'} end
+-- Insert values on the master while replica is stopped and can't
+-- fetch them.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', true)
 ---
+- ok
 ...
--- sleep after every tuple
-errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0)
+for i = 1, 100 do s:insert{i, 'this is test message12345'} end
 ---
-- ok
 ...
 test_run:cmd("start server replica with args='0.01'")
 ---
@@ -75,28 +75,26 @@ test_run:cmd("switch replica")
 - true
 ...
 -- Check that replica doesn't enter read-write mode before
--- catching up with the master: to check that we inject sleep into
--- the master relay_send function and attempt a data modifying
--- statement in replica while it's still fetching data from the
--- master.
--- In the next two cases we try to delete a tuple while replica is
--- catching up with the master (local delete, remote delete) case
+-- catching up with the master: to check that we stop sending
+-- rows on the master in relay_send function and attempt a data
+-- modifying statement in replica while it's still fetching data
+-- from the master.
+--
+-- In the next two cases we try to replace a tuple while replica
+-- is catching up with the master (local replace, remote replace)
+-- case.
 --
--- #1: delete tuple on replica
+-- Case #1: replace tuple on replica locally.
 --
 box.space.test ~= nil
 ---
 - true
 ...
-d = box.space.test:delete{1}
+box.space.test:replace{1}
 ---
 - error: Can't modify data because this instance is in read-only mode.
 ...
-box.space.test:get(1) ~= nil
----
-- true
-...
--- case #2: delete tuple by net.box
+-- Case #2: replace tuple on replica by net.box.
 test_run:cmd("switch default")
 ---
 - true
@@ -108,20 +106,16 @@ test_run:cmd("set variable r_uri to 'replica.listen'")
 c = net_box.connect(r_uri)
 ---
 ...
-d = c.space.test:delete{1}
+d = c.space.test:replace{1}
 ---
 - error: Can't modify data because this instance is in read-only mode.
 ...
-c.space.test:get(1) ~= nil
----
-- true
-...
--- check sync
-errinj.set("ERRINJ_RELAY_TIMEOUT", 0)
+-- Resume replication.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', false)
 ---
 - ok
 ...
--- cleanup
+-- Cleanup.
 test_run:cmd("stop server replica")
 ---
 - true
diff --git a/test/replication/catch.test.lua b/test/replication/catch.test.lua
index 6773675d0..d5de88642 100644
--- a/test/replication/catch.test.lua
+++ b/test/replication/catch.test.lua
@@ -13,7 +13,7 @@ test_run:cmd("switch replica")
 
 test_run:cmd("switch default")
 s = box.schema.space.create('test', {engine = engine});
--- vinyl does not support hash index
+-- Vinyl does not support hash index.
 index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') })
 
 test_run:cmd("switch replica")
@@ -22,41 +22,40 @@ while box.space.test == nil do fiber.sleep(0.01) end
 test_run:cmd("switch default")
 test_run:cmd("stop server replica")
 
--- insert values on the master while replica is stopped and can't fetch them
-for i=1,100 do s:insert{i, 'this is test message12345'} end
-
--- sleep after every tuple
-errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0)
+-- Insert values on the master while replica is stopped and can't
+-- fetch them.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', true)
+for i = 1, 100 do s:insert{i, 'this is test message12345'} end
 
 test_run:cmd("start server replica with args='0.01'")
 test_run:cmd("switch replica")
 
 -- Check that replica doesn't enter read-write mode before
--- catching up with the master: to check that we inject sleep into
--- the master relay_send function and attempt a data modifying
--- statement in replica while it's still fetching data from the
--- master.
--- In the next two cases we try to delete a tuple while replica is
--- catching up with the master (local delete, remote delete) case
+-- catching up with the master: to check that we stop sending
+-- rows on the master in relay_send function and attempt a data
+-- modifying statement in replica while it's still fetching data
+-- from the master.
+--
+-- In the next two cases we try to replace a tuple while replica
+-- is catching up with the master (local replace, remote replace)
+-- case.
 --
--- #1: delete tuple on replica
+-- Case #1: replace tuple on replica locally.
 --
 box.space.test ~= nil
-d = box.space.test:delete{1}
-box.space.test:get(1) ~= nil
+box.space.test:replace{1}
 
--- case #2: delete tuple by net.box
+-- Case #2: replace tuple on replica by net.box.
 
 test_run:cmd("switch default")
 test_run:cmd("set variable r_uri to 'replica.listen'")
 c = net_box.connect(r_uri)
-d = c.space.test:delete{1}
-c.space.test:get(1) ~= nil
+d = c.space.test:replace{1}
 
--- check sync
-errinj.set("ERRINJ_RELAY_TIMEOUT", 0)
+-- Resume replication.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', false)
 
--- cleanup
+-- Cleanup.
 test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
 test_run:cmd("delete server replica")
diff --git a/test/replication/gc.result b/test/replication/gc.result
index c73544d95..273b77efc 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -27,6 +27,38 @@ default_checkpoint_count = box.cfg.checkpoint_count
 box.cfg{checkpoint_count = 1}
 ---
 ...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+function wait_gc(n)
+    return test_run:wait_cond(function()
+        return #box.info.gc().checkpoints == n
+    end, 10)
+end
+
+function value_in(val, arr)
+    for _, elem in ipairs(arr) do
+        if val == elem then
+            return true
+        end
+    end
+    return false
+end
+
+function wait_xlog(n, timeout)
+    timeout = timeout or 1.0
+    if type(n) ~= 'table' then
+        n = {n}
+    end
+    return test_run:wait_cond(function()
+        return value_in(#fio.glob('./master/*.xlog'), n)
+    end, timeout)
+end
+
+test_run:cmd("setopt delimiter ''") ;
+---
+...
 -- Grant permissions needed for replication.
 box.schema.user.grant('guest', 'replication')
 ---
@@ -63,14 +95,13 @@ for i = 1, 100 do s:auto_increment{} end
 ...
 -- Make sure replica join will take long enough for us to
 -- invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 ---
 - ok
 ...
 -- While the replica is receiving the initial data set,
 -- make a snapshot and invoke garbage collection, then
--- remove the timeout injection so that we don't have to
--- wait too long for the replica to start.
+-- remove delay to allow replica to start.
 test_run:cmd("setopt delimiter ';'")
 ---
 - true
@@ -78,7 +109,7 @@ test_run:cmd("setopt delimiter ';'")
 fiber.create(function()
     fiber.sleep(0.1)
     box.snapshot()
-    box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+    box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 end)
 test_run:cmd("setopt delimiter ''");
 ---
@@ -110,21 +141,17 @@ test_run:cmd("switch default")
 ...
 -- Check that garbage collection removed the snapshot once
 -- the replica released the corresponding checkpoint.
-test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10)
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_xlog(1) or fio.listdir('./master')
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
----
-- true
-...
--- Make sure the replica will receive data it is subscribed
--- to long enough for us to invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+-- Make sure the replica will not receive data until
+-- we test garbage collection.
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 ---
 - ok
 ...
@@ -152,17 +179,17 @@ box.snapshot()
 ---
 - ok
 ...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_xlog(2) or fio.listdir('./master')
 ---
 - true
 ...
--- Remove the timeout injection so that the replica catches
+-- Resume replication so that the replica catches
 -- up quickly.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 ---
 - ok
 ...
@@ -185,11 +212,11 @@ test_run:cmd("switch default")
 ...
 -- Now garbage collection should resume and delete files left
 -- from the old checkpoint.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
 ---
 - true
 ...
@@ -228,11 +255,11 @@ fiber.sleep(0.1) -- wait for master to relay data
 -- Garbage collection must not delete the old xlog file
 -- because it is still needed by the replica, but remove
 -- the old snapshot.
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_xlog(2) or fio.listdir('./master')
 ---
 - true
 ...
@@ -268,11 +295,11 @@ test_run:cmd("switch default")
 - true
 ...
 -- Now it's safe to drop the old xlog.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_xlog(1) or fio.listdir('./master')
 ---
 - true
 ...
@@ -304,11 +331,14 @@ box.snapshot()
 ---
 - ok
 ...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+-- The replica may have managed to download all data
+-- from xlog #1 before it was stopped, in which case
+-- it's OK to collect xlog #1.
+wait_xlog({2, 3}) or fio.listdir('./master')
 ---
 - true
 ...
@@ -317,11 +347,11 @@ box.snapshot()
 test_run:cleanup_cluster()
 ---
 ...
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_xlog(1) or fio.listdir('./master')
 ---
 - true
 ...
@@ -413,7 +443,7 @@ box.snapshot()
 ---
 - ok
 ...
-#fio.glob('./master/*.xlog') == 3 or fio.listdir('./master')
+wait_xlog(3) or fio.listdir('./master')
 ---
 - true
 ...
@@ -426,11 +456,7 @@ box.snapshot()
 ---
 - ok
 ...
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
----
-- true
-...
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0, 10) or fio.listdir('./master')
 ---
 - true
 ...
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 1e4e02df9..9f79120e9 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -11,6 +11,35 @@ test_run:cmd("create server replica with rpl_master=default, script='replication
 default_checkpoint_count = box.cfg.checkpoint_count
 box.cfg{checkpoint_count = 1}
 
+test_run:cmd("setopt delimiter ';'")
+
+function wait_gc(n)
+    return test_run:wait_cond(function()
+        return #box.info.gc().checkpoints == n
+    end, 10)
+end
+
+function value_in(val, arr)
+    for _, elem in ipairs(arr) do
+        if val == elem then
+            return true
+        end
+    end
+    return false
+end
+
+function wait_xlog(n, timeout)
+    timeout = timeout or 1.0
+    if type(n) ~= 'table' then
+        n = {n}
+    end
+    return test_run:wait_cond(function()
+        return value_in(#fio.glob('./master/*.xlog'), n)
+    end, timeout)
+end
+
+test_run:cmd("setopt delimiter ''") ;
+
 -- Grant permissions needed for replication.
 box.schema.user.grant('guest', 'replication')
 
@@ -29,17 +58,16 @@ for i = 1, 100 do s:auto_increment{} end
 
 -- Make sure replica join will take long enough for us to
 -- invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 
 -- While the replica is receiving the initial data set,
 -- make a snapshot and invoke garbage collection, then
--- remove the timeout injection so that we don't have to
--- wait too long for the replica to start.
+-- remove delay to allow replica to start.
 test_run:cmd("setopt delimiter ';'")
 fiber.create(function()
     fiber.sleep(0.1)
     box.snapshot()
-    box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+    box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 end)
 test_run:cmd("setopt delimiter ''");
 
@@ -57,12 +85,11 @@ test_run:cmd("switch default")
 
 -- Check that garbage collection removed the snapshot once
 -- the replica released the corresponding checkpoint.
-test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10)
-#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
--- Make sure the replica will receive data it is subscribed
--- to long enough for us to invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master')
+-- Make sure the replica will not receive data until
+-- we test garbage collection.
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 
 -- Send more data to the replica.
 -- Need to do 2 snapshots here, otherwise the replica would
@@ -76,12 +103,12 @@ box.snapshot()
 -- Invoke garbage collection. Check that it doesn't remove
 -- xlogs needed by the replica.
 box.snapshot()
-#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(2) or fio.listdir('./master')
 
--- Remove the timeout injection so that the replica catches
+-- Resume replication so that the replica catches
 -- up quickly.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 
 -- Check that the replica received all data from the master.
 test_run:cmd("switch replica")
@@ -91,8 +118,8 @@ test_run:cmd("switch default")
 
 -- Now garbage collection should resume and delete files left
 -- from the old checkpoint.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(0) or fio.listdir('./master')
 --
 -- Check that the master doesn't delete xlog files sent to the
 -- replica until it receives a confirmation that the data has
@@ -110,8 +137,8 @@ fiber.sleep(0.1) -- wait for master to relay data
 -- Garbage collection must not delete the old xlog file
 -- because it is still needed by the replica, but remove
 -- the old snapshot.
-#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(2) or fio.listdir('./master')
 test_run:cmd("switch replica")
 -- Unblock the replica and break replication.
 box.error.injection.set("ERRINJ_WAL_DELAY", false)
@@ -124,8 +151,8 @@ test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
 box.space.test:count()
 test_run:cmd("switch default")
 -- Now it's safe to drop the old xlog.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master')
 -- Stop the replica.
 test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
@@ -139,14 +166,17 @@ _ = s:auto_increment{}
 box.snapshot()
 _ = s:auto_increment{}
 box.snapshot()
-#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+-- The replica may have managed to download all data
+-- from xlog #1 before it was stopped, in which case
+-- it's OK to collect xlog #1.
+wait_xlog({2, 3}) or fio.listdir('./master')
 
 -- The xlog should only be deleted after the replica
 -- is unregistered.
 test_run:cleanup_cluster()
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master')
 --
 -- Test that concurrent invocation of the garbage collector works fine.
 --
@@ -186,14 +216,13 @@ _ = s:auto_increment{}
 box.snapshot()
 _ = s:auto_increment{}
 box.snapshot()
-#fio.glob('./master/*.xlog') == 3 or fio.listdir('./master')
+wait_xlog(3) or fio.listdir('./master')
 
 -- Delete the replica from the cluster table and check that
 -- all xlog files are removed.
 test_run:cleanup_cluster()
 box.snapshot()
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0, 10) or fio.listdir('./master')
 
 -- Restore the config.
 box.cfg{replication = {}}
-- 
2.18.0