[Tarantool-patches] [PATCH v1 12/12] test: errinj for pause relay_send

Alexander V. Tikhonov avtikhon at tarantool.org
Tue Nov 26 09:21:48 MSK 2019


From: Sergei Voronezhskii <sergw at tarantool.org>

This commit contains the remaining part of the changes cherry-picked from
commit 1c34c91fa725ab254619d23c2f1d99f1e8269324. The initial part of the
changes was cherry-picked in commit 8f2bd50105e62b0133032a717cfaa6f8fab26c29.

Also, look up the xlog files in a loop with a short sleep, until the file
count matches the expected value.

Part of #3232

(cherry picked from commit 1c34c91fa725ab254619d23c2f1d99f1e8269324)
---
 test/replication/gc.result   | 86 ++++++++++++++++++++----------------
 test/replication/gc.test.lua | 62 ++++++++++++++------------
 2 files changed, 82 insertions(+), 66 deletions(-)

diff --git a/test/replication/gc.result b/test/replication/gc.result
index 5d55403b0..050a6100c 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -27,6 +27,28 @@ default_checkpoint_count = box.cfg.checkpoint_count
 box.cfg{checkpoint_count = 1}
 ---
 ...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+function wait_gc(n)
+    return test_run:wait_cond(function()
+        return #box.info.gc().checkpoints == n
+    end, 10)
+end;
+---
+...
+function wait_xlog(n, timeout)
+    return test_run:wait_cond(function()
+        return #fio.glob('./master/*.xlog') == n
+    end, 10)
+end;
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
 -- Grant permissions needed for replication.
 box.schema.user.grant('guest', 'replication')
 ---
@@ -63,14 +85,13 @@ for i = 1, 100 do s:auto_increment{} end
 ...
 -- Make sure replica join will take long enough for us to
 -- invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 ---
 - ok
 ...
 -- While the replica is receiving the initial data set,
 -- make a snapshot and invoke garbage collection, then
--- remove the timeout injection so that we don't have to
--- wait too long for the replica to start.
+-- remove delay to allow replica to start.
 test_run:cmd("setopt delimiter ';'")
 ---
 - true
@@ -78,7 +99,7 @@ test_run:cmd("setopt delimiter ';'")
 fiber.create(function()
     fiber.sleep(0.1)
     box.snapshot()
-    box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+    box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 end)
 test_run:cmd("setopt delimiter ''");
 ---
@@ -110,21 +131,16 @@ test_run:cmd("switch default")
 ...
 -- Check that garbage collection removed the snapshot once
 -- the replica released the corresponding checkpoint.
-test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10)
----
-- true
-...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
 ---
 - true
 ...
--- Make sure the replica will receive data it is subscribed
--- to long enough for us to invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+-- we test garbage collection.
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 ---
 - ok
 ...
@@ -152,17 +168,17 @@ box.snapshot()
 ---
 - ok
 ...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_xlog(2) or fio.listdir('./master')
 ---
 - true
 ...
--- Remove the timeout injection so that the replica catches
+-- Resume replication so that the replica catches
 -- up quickly.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 ---
 - ok
 ...
@@ -185,11 +201,11 @@ test_run:cmd("switch default")
 ...
 -- Now garbage collection should resume and delete files left
 -- from the old checkpoint.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
 ---
 - true
 ...
@@ -228,11 +244,11 @@ fiber.sleep(0.1) -- wait for master to relay data
 -- Garbage collection must not delete the old xlog file
 -- because it is still needed by the replica, but remove
 -- the old snapshot.
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_xlog(2) or fio.listdir('./master')
 ---
 - true
 ...
@@ -266,11 +282,11 @@ test_run:cmd("switch default")
 - true
 ...
 -- Now it's safe to drop the old xlog.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_xlog(1) or fio.listdir('./master')
 ---
 - true
 ...
@@ -302,17 +318,11 @@ box.snapshot()
 ---
 - ok
 ...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
 ---
 - true
 ...
-xlog_count = #fio.glob('./master/*.xlog')
----
-...
--- the replica may have managed to download all data
--- from xlog #1 before it was stopped, in which case
--- it's OK to collect xlog #1
-xlog_count == 3 or xlog_count == 2 or fio.listdir('./master')
+wait_xlog(2) or fio.listdir('./master')
 ---
 - true
 ...
@@ -321,7 +331,11 @@ xlog_count == 3 or xlog_count == 2 or fio.listdir('./master')
 test_run:cleanup_cluster()
 ---
 ...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+---
+- true
+...
+wait_xlog(1) or fio.listdir('./master')
 ---
 - true
 ...
@@ -409,7 +423,7 @@ box.snapshot()
 ---
 - ok
 ...
-#fio.glob('./master/*.xlog') == 3 or fio.listdir('./master')
+wait_xlog(3) or fio.listdir('./master')
 ---
 - true
 ...
@@ -422,11 +436,7 @@ box.snapshot()
 ---
 - ok
 ...
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
----
-- true
-...
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0, 10) or fio.listdir('./master')
 ---
 - true
 ...
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 40a349167..7cd18402c 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -11,6 +11,19 @@ test_run:cmd("create server replica with rpl_master=default, script='replication
 default_checkpoint_count = box.cfg.checkpoint_count
 box.cfg{checkpoint_count = 1}
 
+test_run:cmd("setopt delimiter ';'")
+function wait_gc(n)
+    return test_run:wait_cond(function()
+        return #box.info.gc().checkpoints == n
+    end, 10)
+end;
+function wait_xlog(n, timeout)
+    return test_run:wait_cond(function()
+        return #fio.glob('./master/*.xlog') == n
+    end, 10)
+end;
+test_run:cmd("setopt delimiter ''");
+
 -- Grant permissions needed for replication.
 box.schema.user.grant('guest', 'replication')
 
@@ -29,17 +42,16 @@ for i = 1, 100 do s:auto_increment{} end
 
 -- Make sure replica join will take long enough for us to
 -- invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 
 -- While the replica is receiving the initial data set,
 -- make a snapshot and invoke garbage collection, then
--- remove the timeout injection so that we don't have to
--- wait too long for the replica to start.
+-- remove delay to allow replica to start.
 test_run:cmd("setopt delimiter ';'")
 fiber.create(function()
     fiber.sleep(0.1)
     box.snapshot()
-    box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+    box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 end)
 test_run:cmd("setopt delimiter ''");
 
@@ -57,12 +69,10 @@ test_run:cmd("switch default")
 
 -- Check that garbage collection removed the snapshot once
 -- the replica released the corresponding checkpoint.
-test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10)
-#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
--- Make sure the replica will receive data it is subscribed
--- to long enough for us to invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+-- we test garbage collection.
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
 
 -- Send more data to the replica.
 -- Need to do 2 snapshots here, otherwise the replica would
@@ -76,12 +86,12 @@ box.snapshot()
 -- Invoke garbage collection. Check that it doesn't remove
 -- xlogs needed by the replica.
 box.snapshot()
-#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(2) or fio.listdir('./master')
 
--- Remove the timeout injection so that the replica catches
+-- Resume replication so that the replica catches
 -- up quickly.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
 
 -- Check that the replica received all data from the master.
 test_run:cmd("switch replica")
@@ -91,8 +101,8 @@ test_run:cmd("switch default")
 
 -- Now garbage collection should resume and delete files left
 -- from the old checkpoint.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(0) or fio.listdir('./master')
 --
 -- Check that the master doesn't delete xlog files sent to the
 -- replica until it receives a confirmation that the data has
@@ -124,8 +134,8 @@ test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
 box.space.test:count()
 test_run:cmd("switch default")
 -- Now it's safe to drop the old xlog.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master')
 -- Stop the replica.
 test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
@@ -139,17 +149,14 @@ _ = s:auto_increment{}
 box.snapshot()
 _ = s:auto_increment{}
 box.snapshot()
-#box.info.gc().checkpoints == 1 or box.info.gc()
-xlog_count = #fio.glob('./master/*.xlog')
--- the replica may have managed to download all data
--- from xlog #1 before it was stopped, in which case
--- it's OK to collect xlog #1
-xlog_count == 3 or xlog_count == 2 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(2) or fio.listdir('./master')
 
 -- The xlog should only be deleted after the replica
 -- is unregistered.
 test_run:cleanup_cluster()
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master')
 --
 -- Test that concurrent invocation of the garbage collector works fine.
 --
@@ -188,14 +195,13 @@ _ = s:auto_increment{}
 box.snapshot()
 _ = s:auto_increment{}
 box.snapshot()
-#fio.glob('./master/*.xlog') == 3 or fio.listdir('./master')
+wait_xlog(3) or fio.listdir('./master')
 
 -- Delete the replica from the cluster table and check that
 -- all xlog files are removed.
 test_run:cleanup_cluster()
 box.snapshot()
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0, 10) or fio.listdir('./master')
 
 -- Restore the config.
 box.cfg{replication = {}}
-- 
2.17.1



More information about the Tarantool-patches mailing list