From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtpng2.m.smailru.net (smtpng2.m.smailru.net [94.100.179.3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 473044696C9 for ; Tue, 26 Nov 2019 09:22:05 +0300 (MSK) From: "Alexander V. Tikhonov" Date: Tue, 26 Nov 2019 09:21:48 +0300 Message-Id: In-Reply-To: <1c42ad20160f47d942cab405ce9896d6d31cc05f.1574749278.git.avtikhon@tarantool.org> References: <1c42ad20160f47d942cab405ce9896d6d31cc05f.1574749278.git.avtikhon@tarantool.org> In-Reply-To: <1c42ad20160f47d942cab405ce9896d6d31cc05f.1574749278.git.avtikhon@tarantool.org> References: <1c42ad20160f47d942cab405ce9896d6d31cc05f.1574749278.git.avtikhon@tarantool.org> Subject: [Tarantool-patches] [PATCH v1 12/12] test: errinj for pause relay_send List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Kirill Yukhin Cc: Sergei Voronezhskii , tarantool-patches@dev.tarantool.org From: Sergei Voronezhskii This commit is the remaining part of the changes cherry-picked from commit 1c34c91fa725ab254619d23c2f1d99f1e8269324. The initial part of the changes was cherry-picked in commit 8f2bd50105e62b0133032a717cfaa6f8fab26c29. Also, look up the xlog files in a loop with a short sleep until the file count is as expected. Part of #3232 (cherry picked from commit 1c34c91fa725ab254619d23c2f1d99f1e8269324) --- test/replication/gc.result | 86 ++++++++++++++++++++---------------- test/replication/gc.test.lua | 62 ++++++++++++++------------ 2 files changed, 82 insertions(+), 66 deletions(-) diff --git a/test/replication/gc.result b/test/replication/gc.result index 5d55403b0..050a6100c 100644 --- a/test/replication/gc.result +++ b/test/replication/gc.result @@ -27,6 +27,28 @@ default_checkpoint_count = box.cfg.checkpoint_count box.cfg{checkpoint_count = 1} --- ... +test_run:cmd("setopt delimiter ';'") +--- +- true +...
+function wait_gc(n) + return test_run:wait_cond(function() + return #box.info.gc().checkpoints == n + end, 10) +end; +--- +... +function wait_xlog(n, timeout) + return test_run:wait_cond(function() + return #fio.glob('./master/*.xlog') == n + end, 10) +end; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... -- Grant permissions needed for replication. box.schema.user.grant('guest', 'replication') --- @@ -63,14 +85,13 @@ for i = 1, 100 do s:auto_increment{} end ... -- Make sure replica join will take long enough for us to -- invoke garbage collection. -box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05) +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true) --- - ok ... -- While the replica is receiving the initial data set, -- make a snapshot and invoke garbage collection, then --- remove the timeout injection so that we don't have to --- wait too long for the replica to start. +-- remove delay to allow replica to start. test_run:cmd("setopt delimiter ';'") --- - true @@ -78,7 +99,7 @@ test_run:cmd("setopt delimiter ';'") fiber.create(function() fiber.sleep(0.1) box.snapshot() - box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0) + box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false) end) test_run:cmd("setopt delimiter ''"); --- @@ -110,21 +131,16 @@ test_run:cmd("switch default") ... -- Check that garbage collection removed the snapshot once -- the replica released the corresponding checkpoint. -test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10) ---- -- true -... -#box.info.gc().checkpoints == 1 or box.info.gc() +wait_gc(1) or box.info.gc() --- - true ... -#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master') +wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until --- - true ... --- Make sure the replica will receive data it is subscribed --- to long enough for us to invoke garbage collection. 
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05) +-- we test garbage collection. +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true) --- - ok ... @@ -152,17 +168,17 @@ box.snapshot() --- - ok ... -#box.info.gc().checkpoints == 1 or box.info.gc() +wait_gc(1) or box.info.gc() --- - true ... -#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master') +wait_xlog(2) or fio.listdir('./master') --- - true ... --- Remove the timeout injection so that the replica catches +-- Resume replication so that the replica catches -- up quickly. -box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0) +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false) --- - ok ... @@ -185,11 +201,11 @@ test_run:cmd("switch default") ... -- Now garbage collection should resume and delete files left -- from the old checkpoint. -test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10) +wait_gc(1) or box.info.gc() --- - true ... -#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master') +wait_xlog(0) or fio.listdir('./master') --- - true ... @@ -228,11 +244,11 @@ fiber.sleep(0.1) -- wait for master to relay data -- Garbage collection must not delete the old xlog file -- because it is still needed by the replica, but remove -- the old snapshot. -#box.info.gc().checkpoints == 1 or box.info.gc() +wait_gc(1) or box.info.gc() --- - true ... -#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master') +wait_xlog(2) or fio.listdir('./master') --- - true ... @@ -266,11 +282,11 @@ test_run:cmd("switch default") - true ... -- Now it's safe to drop the old xlog. -test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10) +wait_gc(1) or box.info.gc() --- - true ... -#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master') +wait_xlog(1) or fio.listdir('./master') --- - true ... @@ -302,17 +318,11 @@ box.snapshot() --- - ok ... -#box.info.gc().checkpoints == 1 or box.info.gc() +wait_gc(1) or box.info.gc() --- - true ... 
-xlog_count = #fio.glob('./master/*.xlog') ---- -... --- the replica may have managed to download all data --- from xlog #1 before it was stopped, in which case --- it's OK to collect xlog #1 -xlog_count == 3 or xlog_count == 2 or fio.listdir('./master') +wait_xlog(2) or fio.listdir('./master') --- - true ... @@ -321,7 +331,11 @@ xlog_count == 3 or xlog_count == 2 or fio.listdir('./master') test_run:cleanup_cluster() --- ... -#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master') +wait_gc(1) or box.info.gc() +--- +- true +... +wait_xlog(1) or fio.listdir('./master') --- - true ... @@ -409,7 +423,7 @@ box.snapshot() --- - ok ... -#fio.glob('./master/*.xlog') == 3 or fio.listdir('./master') +wait_xlog(3) or fio.listdir('./master') --- - true ... @@ -422,11 +436,7 @@ box.snapshot() --- - ok ... -test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10) ---- -- true -... -#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master') +wait_xlog(0, 10) or fio.listdir('./master') --- - true ... diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua index 40a349167..7cd18402c 100644 --- a/test/replication/gc.test.lua +++ b/test/replication/gc.test.lua @@ -11,6 +11,19 @@ test_run:cmd("create server replica with rpl_master=default, script='replication default_checkpoint_count = box.cfg.checkpoint_count box.cfg{checkpoint_count = 1} +test_run:cmd("setopt delimiter ';'") +function wait_gc(n) + return test_run:wait_cond(function() + return #box.info.gc().checkpoints == n + end, 10) +end; +function wait_xlog(n, timeout) + return test_run:wait_cond(function() + return #fio.glob('./master/*.xlog') == n + end, 10) +end; +test_run:cmd("setopt delimiter ''"); + -- Grant permissions needed for replication. box.schema.user.grant('guest', 'replication') @@ -29,17 +42,16 @@ for i = 1, 100 do s:auto_increment{} end -- Make sure replica join will take long enough for us to -- invoke garbage collection. 
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05) +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true) -- While the replica is receiving the initial data set, -- make a snapshot and invoke garbage collection, then --- remove the timeout injection so that we don't have to --- wait too long for the replica to start. +-- remove delay to allow replica to start. test_run:cmd("setopt delimiter ';'") fiber.create(function() fiber.sleep(0.1) box.snapshot() - box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0) + box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false) end) test_run:cmd("setopt delimiter ''"); @@ -57,12 +69,10 @@ test_run:cmd("switch default") -- Check that garbage collection removed the snapshot once -- the replica released the corresponding checkpoint. -test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10) -#box.info.gc().checkpoints == 1 or box.info.gc() -#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master') --- Make sure the replica will receive data it is subscribed --- to long enough for us to invoke garbage collection. -box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05) +wait_gc(1) or box.info.gc() +wait_xlog(1) or fio.listdir('./master') -- Make sure the replica will not receive data until +-- we test garbage collection. +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true) -- Send more data to the replica. -- Need to do 2 snapshots here, otherwise the replica would @@ -76,12 +86,12 @@ box.snapshot() -- Invoke garbage collection. Check that it doesn't remove -- xlogs needed by the replica. box.snapshot() -#box.info.gc().checkpoints == 1 or box.info.gc() -#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master') +wait_gc(1) or box.info.gc() +wait_xlog(2) or fio.listdir('./master') --- Remove the timeout injection so that the replica catches +-- Resume replication so that the replica catches -- up quickly. 
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0) +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false) -- Check that the replica received all data from the master. test_run:cmd("switch replica") @@ -91,8 +101,8 @@ test_run:cmd("switch default") -- Now garbage collection should resume and delete files left -- from the old checkpoint. -test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10) -#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master') +wait_gc(1) or box.info.gc() +wait_xlog(0) or fio.listdir('./master') -- -- Check that the master doesn't delete xlog files sent to the -- replica until it receives a confirmation that the data has @@ -124,8 +134,8 @@ test_run:wait_cond(function() return box.space.test:count() == 310 end, 10) box.space.test:count() test_run:cmd("switch default") -- Now it's safe to drop the old xlog. -test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10) -#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master') +wait_gc(1) or box.info.gc() +wait_xlog(1) or fio.listdir('./master') -- Stop the replica. test_run:cmd("stop server replica") test_run:cmd("cleanup server replica") @@ -139,17 +149,14 @@ _ = s:auto_increment{} box.snapshot() _ = s:auto_increment{} box.snapshot() -#box.info.gc().checkpoints == 1 or box.info.gc() -xlog_count = #fio.glob('./master/*.xlog') --- the replica may have managed to download all data --- from xlog #1 before it was stopped, in which case --- it's OK to collect xlog #1 -xlog_count == 3 or xlog_count == 2 or fio.listdir('./master') +wait_gc(1) or box.info.gc() +wait_xlog(2) or fio.listdir('./master') -- The xlog should only be deleted after the replica -- is unregistered. test_run:cleanup_cluster() -#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master') +wait_gc(1) or box.info.gc() +wait_xlog(1) or fio.listdir('./master') -- -- Test that concurrent invocation of the garbage collector works fine. 
-- @@ -188,14 +195,13 @@ _ = s:auto_increment{} box.snapshot() _ = s:auto_increment{} box.snapshot() -#fio.glob('./master/*.xlog') == 3 or fio.listdir('./master') +wait_xlog(3) or fio.listdir('./master') -- Delete the replica from the cluster table and check that -- all xlog files are removed. test_run:cleanup_cluster() box.snapshot() -test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10) -#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master') +wait_xlog(0, 10) or fio.listdir('./master') -- Restore the config. box.cfg{replication = {}} -- 2.17.1