[Tarantool-patches] [PATCH v1 12/12] test: errinj for pause relay_send
Alexander V. Tikhonov
avtikhon at tarantool.org
Tue Nov 26 09:21:48 MSK 2019
From: Sergei Voronezhskii <sergw at tarantool.org>
This commit is the remaining part of the changes cherry-picked from commit
1c34c91fa725ab254619d23c2f1d99f1e8269324. The initial part of the changes
was cherry-picked in commit 8f2bd50105e62b0133032a717cfaa6f8fab26c29.
Also, look up the xlog files in a loop with a short sleep until the file
count is as expected.
Part of #3232
(cherry picked from commit 1c34c91fa725ab254619d23c2f1d99f1e8269324)
---
test/replication/gc.result | 86 ++++++++++++++++++++----------------
test/replication/gc.test.lua | 62 ++++++++++++++------------
2 files changed, 82 insertions(+), 66 deletions(-)
diff --git a/test/replication/gc.result b/test/replication/gc.result
index 5d55403b0..050a6100c 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -27,6 +27,28 @@ default_checkpoint_count = box.cfg.checkpoint_count
box.cfg{checkpoint_count = 1}
---
...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+function wait_gc(n)
+ return test_run:wait_cond(function()
+ return #box.info.gc().checkpoints == n
+ end, 10)
+end;
+---
+...
+function wait_xlog(n, timeout)
+ return test_run:wait_cond(function()
+ return #fio.glob('./master/*.xlog') == n
+ end, 10)
+end;
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
-- Grant permissions needed for replication.
box.schema.user.grant('guest', 'replication')
---
@@ -63,14 +85,13 @@ for i = 1, 100 do s:auto_increment{} end
...
-- Make sure replica join will take long enough for us to
-- invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
---
- ok
...
-- While the replica is receiving the initial data set,
-- make a snapshot and invoke garbage collection, then
--- remove the timeout injection so that we don't have to
--- wait too long for the replica to start.
+-- remove delay to allow replica to start.
test_run:cmd("setopt delimiter ';'")
---
- true
@@ -78,7 +99,7 @@ test_run:cmd("setopt delimiter ';'")
fiber.create(function()
fiber.sleep(0.1)
box.snapshot()
- box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+ box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
end)
test_run:cmd("setopt delimiter ''");
---
@@ -110,21 +131,16 @@ test_run:cmd("switch default")
...
-- Check that garbage collection removed the snapshot once
-- the replica released the corresponding checkpoint.
-test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10)
----
-- true
-...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
---
- true
...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
---
- true
...
--- Make sure the replica will receive data it is subscribed
--- to long enough for us to invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+-- we test garbage collection.
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
---
- ok
...
@@ -152,17 +168,17 @@ box.snapshot()
---
- ok
...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
---
- true
...
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_xlog(2) or fio.listdir('./master')
---
- true
...
--- Remove the timeout injection so that the replica catches
+-- Resume replication so that the replica catches
-- up quickly.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
---
- ok
...
@@ -185,11 +201,11 @@ test_run:cmd("switch default")
...
-- Now garbage collection should resume and delete files left
-- from the old checkpoint.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
+wait_gc(1) or box.info.gc()
---
- true
...
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0) or fio.listdir('./master')
---
- true
...
@@ -228,11 +244,11 @@ fiber.sleep(0.1) -- wait for master to relay data
-- Garbage collection must not delete the old xlog file
-- because it is still needed by the replica, but remove
-- the old snapshot.
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
---
- true
...
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_xlog(2) or fio.listdir('./master')
---
- true
...
@@ -266,11 +282,11 @@ test_run:cmd("switch default")
- true
...
-- Now it's safe to drop the old xlog.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
+wait_gc(1) or box.info.gc()
---
- true
...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_xlog(1) or fio.listdir('./master')
---
- true
...
@@ -302,17 +318,11 @@ box.snapshot()
---
- ok
...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+wait_gc(1) or box.info.gc()
---
- true
...
-xlog_count = #fio.glob('./master/*.xlog')
----
-...
--- the replica may have managed to download all data
--- from xlog #1 before it was stopped, in which case
--- it's OK to collect xlog #1
-xlog_count == 3 or xlog_count == 2 or fio.listdir('./master')
+wait_xlog(2) or fio.listdir('./master')
---
- true
...
@@ -321,7 +331,11 @@ xlog_count == 3 or xlog_count == 2 or fio.listdir('./master')
test_run:cleanup_cluster()
---
...
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+---
+- true
+...
+wait_xlog(1) or fio.listdir('./master')
---
- true
...
@@ -409,7 +423,7 @@ box.snapshot()
---
- ok
...
-#fio.glob('./master/*.xlog') == 3 or fio.listdir('./master')
+wait_xlog(3) or fio.listdir('./master')
---
- true
...
@@ -422,11 +436,7 @@ box.snapshot()
---
- ok
...
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
----
-- true
-...
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0, 10) or fio.listdir('./master')
---
- true
...
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index 40a349167..7cd18402c 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -11,6 +11,19 @@ test_run:cmd("create server replica with rpl_master=default, script='replication
default_checkpoint_count = box.cfg.checkpoint_count
box.cfg{checkpoint_count = 1}
+test_run:cmd("setopt delimiter ';'")
+function wait_gc(n)
+ return test_run:wait_cond(function()
+ return #box.info.gc().checkpoints == n
+ end, 10)
+end;
+function wait_xlog(n, timeout)
+ return test_run:wait_cond(function()
+ return #fio.glob('./master/*.xlog') == n
+ end, 10)
+end;
+test_run:cmd("setopt delimiter ''");
+
-- Grant permissions needed for replication.
box.schema.user.grant('guest', 'replication')
@@ -29,17 +42,16 @@ for i = 1, 100 do s:auto_increment{} end
-- Make sure replica join will take long enough for us to
-- invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
-- While the replica is receiving the initial data set,
-- make a snapshot and invoke garbage collection, then
--- remove the timeout injection so that we don't have to
--- wait too long for the replica to start.
+-- remove delay to allow replica to start.
test_run:cmd("setopt delimiter ';'")
fiber.create(function()
fiber.sleep(0.1)
box.snapshot()
- box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+ box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
end)
test_run:cmd("setopt delimiter ''");
@@ -57,12 +69,10 @@ test_run:cmd("switch default")
-- Check that garbage collection removed the snapshot once
-- the replica released the corresponding checkpoint.
-test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10)
-#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
--- Make sure the replica will receive data it is subscribed
--- to long enough for us to invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until
+-- we test garbage collection.
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
-- Send more data to the replica.
-- Need to do 2 snapshots here, otherwise the replica would
@@ -76,12 +86,12 @@ box.snapshot()
-- Invoke garbage collection. Check that it doesn't remove
-- xlogs needed by the replica.
box.snapshot()
-#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(2) or fio.listdir('./master')
--- Remove the timeout injection so that the replica catches
+-- Resume replication so that the replica catches
-- up quickly.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
-- Check that the replica received all data from the master.
test_run:cmd("switch replica")
@@ -91,8 +101,8 @@ test_run:cmd("switch default")
-- Now garbage collection should resume and delete files left
-- from the old checkpoint.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(0) or fio.listdir('./master')
--
-- Check that the master doesn't delete xlog files sent to the
-- replica until it receives a confirmation that the data has
@@ -124,8 +134,8 @@ test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
box.space.test:count()
test_run:cmd("switch default")
-- Now it's safe to drop the old xlog.
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master')
-- Stop the replica.
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
@@ -139,17 +149,14 @@ _ = s:auto_increment{}
box.snapshot()
_ = s:auto_increment{}
box.snapshot()
-#box.info.gc().checkpoints == 1 or box.info.gc()
-xlog_count = #fio.glob('./master/*.xlog')
--- the replica may have managed to download all data
--- from xlog #1 before it was stopped, in which case
--- it's OK to collect xlog #1
-xlog_count == 3 or xlog_count == 2 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(2) or fio.listdir('./master')
-- The xlog should only be deleted after the replica
-- is unregistered.
test_run:cleanup_cluster()
-#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
+wait_gc(1) or box.info.gc()
+wait_xlog(1) or fio.listdir('./master')
--
-- Test that concurrent invocation of the garbage collector works fine.
--
@@ -188,14 +195,13 @@ _ = s:auto_increment{}
box.snapshot()
_ = s:auto_increment{}
box.snapshot()
-#fio.glob('./master/*.xlog') == 3 or fio.listdir('./master')
+wait_xlog(3) or fio.listdir('./master')
-- Delete the replica from the cluster table and check that
-- all xlog files are removed.
test_run:cleanup_cluster()
box.snapshot()
-test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+wait_xlog(0, 10) or fio.listdir('./master')
-- Restore the config.
box.cfg{replication = {}}
--
2.17.1
More information about the Tarantool-patches
mailing list