From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Sergei Voronezhskii Subject: [PATCH 2/4] test: errinj for pause relay_send Date: Fri, 5 Oct 2018 12:02:13 +0300 Message-Id: <20181005090215.6160-3-sergw@tarantool.org> In-Reply-To: <20181005090215.6160-1-sergw@tarantool.org> References: <20181003145057.68820-1-sergw@tarantool.org> <20181005090215.6160-1-sergw@tarantool.org> Reply-To: tarantool-patches@freelists.org To: tarantool-patches@freelists.org Cc: Alexander Turenko , Vladimir Davydov List-ID: Instead of using a timeout, we just need to pause `relay_send`. We can't rely on a timeout because system load varies in parallel mode. Add a new errinj whose boolean is checked in a loop: while it is `true`, `relay_send` does not proceed to the next statement. Also, change `delete` to `replace` here, and poll the xlog files in a loop with a short sleep until the file count matches the expected value. Part of #2436, #3232 --- src/box/relay.cc | 7 +++++- src/errinj.h | 1 + test/replication/catch.result | 44 ++++++++++++++------------------- test/replication/catch.test.lua | 36 +++++++++++++-------------- test/replication/gc.result | 18 ++++++-------- test/replication/gc.test.lua | 16 ++++++------ 6 files changed, 58 insertions(+), 64 deletions(-) diff --git a/src/box/relay.cc b/src/box/relay.cc index c90383d4a..8618fa81a 100644 --- a/src/box/relay.cc +++ b/src/box/relay.cc @@ -622,12 +622,17 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync, static void relay_send(struct relay *relay, struct xrow_header *packet) { + struct errinj *inj = errinj(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL); + while (inj->bparam) { + fiber_sleep(0.01); + inj = errinj(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL); + } packet->sync = relay->sync; relay->last_row_tm = ev_monotonic_now(loop()); coio_write_xrow(&relay->io, packet); fiber_gc(); - struct errinj *inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE); + inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE); if (inj != NULL && inj->dparam > 
0) fiber_sleep(inj->dparam); } diff --git a/src/errinj.h b/src/errinj.h index 84a1fbb5e..bf6c15ba5 100644 --- a/src/errinj.h +++ b/src/errinj.h @@ -94,6 +94,7 @@ struct errinj { _(ERRINJ_VY_GC, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_VY_LOG_FLUSH, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_VY_LOG_FLUSH_DELAY, ERRINJ_BOOL, {.bparam = false}) \ + _(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \ _(ERRINJ_RELAY_REPORT_INTERVAL, ERRINJ_DOUBLE, {.dparam = 0}) \ _(ERRINJ_RELAY_FINAL_SLEEP, ERRINJ_BOOL, {.bparam = false}) \ diff --git a/test/replication/catch.result b/test/replication/catch.result index e23f33cef..b4ddc5d51 100644 --- a/test/replication/catch.result +++ b/test/replication/catch.result @@ -35,7 +35,7 @@ test_run:cmd("switch default") s = box.schema.space.create('test', {engine = engine}); --- ... --- vinyl does not support hash index +-- Vinyl does not support hash index index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') }) --- ... @@ -57,14 +57,13 @@ test_run:cmd("stop server replica") --- - true ... --- insert values on the master while replica is stopped and can't fetch them -for i=1,100 do s:insert{i, 'this is test message12345'} end +-- Insert values on the master while replica is stopped and can't fetch them. +errinj.set('ERRINJ_RELAY_SEND_DELAY', true) --- +- ok ... --- sleep after every tuple -errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0) +for i=1,100 do s:insert{i, 'this is test message12345'} end --- -- ok ... test_run:cmd("start server replica with args='0.01'") --- @@ -75,28 +74,25 @@ test_run:cmd("switch replica") - true ... -- Check that replica doesn't enter read-write mode before --- catching up with the master: to check that we inject sleep into --- the master relay_send function and attempt a data modifying --- statement in replica while it's still fetching data from the --- master. 
--- In the next two cases we try to delete a tuple while replica is +-- catching up with the master: to check that we stop sending +-- rows on the master in relay_send function and attempt a data +-- modifying statement in replica while it's still fetching data +-- from the master. +-- +-- In the next two cases we try to replace a tuple while replica is -- catching up with the master (local delete, remote delete) case -- --- #1: delete tuple on replica +-- Case #1: replace tuple on replica locally. -- box.space.test ~= nil --- - true ... -d = box.space.test:delete{1} +box.space.test:replace{1} --- - error: Can't modify data because this instance is in read-only mode. ... -box.space.test:get(1) ~= nil ---- -- true -... --- case #2: delete tuple by net.box +-- Case #2: replace tuple on replica by net.box. test_run:cmd("switch default") --- - true @@ -108,20 +104,16 @@ test_run:cmd("set variable r_uri to 'replica.listen'") c = net_box.connect(r_uri) --- ... -d = c.space.test:delete{1} +d = c.space.test:replace{1} --- - error: Can't modify data because this instance is in read-only mode. ... -c.space.test:get(1) ~= nil ---- -- true -... --- check sync -errinj.set("ERRINJ_RELAY_TIMEOUT", 0) +-- Resume replicaton +errinj.set('ERRINJ_RELAY_SEND_DELAY', false) --- - ok ... 
--- cleanup +-- Cleanup test_run:cmd("stop server replica") --- - true diff --git a/test/replication/catch.test.lua b/test/replication/catch.test.lua index 217328772..5223e3a24 100644 --- a/test/replication/catch.test.lua +++ b/test/replication/catch.test.lua @@ -13,7 +13,7 @@ test_run:cmd("switch replica") test_run:cmd("switch default") s = box.schema.space.create('test', {engine = engine}); --- vinyl does not support hash index +-- Vinyl does not support hash index index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') }) test_run:cmd("switch replica") @@ -22,41 +22,39 @@ while box.space.test == nil do fiber.sleep(0.01) end test_run:cmd("switch default") test_run:cmd("stop server replica") --- insert values on the master while replica is stopped and can't fetch them +-- Insert values on the master while replica is stopped and can't fetch them. +errinj.set('ERRINJ_RELAY_SEND_DELAY', true) for i=1,100 do s:insert{i, 'this is test message12345'} end --- sleep after every tuple -errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0) - test_run:cmd("start server replica with args='0.01'") test_run:cmd("switch replica") -- Check that replica doesn't enter read-write mode before --- catching up with the master: to check that we inject sleep into --- the master relay_send function and attempt a data modifying --- statement in replica while it's still fetching data from the --- master. --- In the next two cases we try to delete a tuple while replica is +-- catching up with the master: to check that we stop sending +-- rows on the master in relay_send function and attempt a data +-- modifying statement in replica while it's still fetching data +-- from the master. +-- +-- In the next two cases we try to replace a tuple while replica is -- catching up with the master (local delete, remote delete) case -- --- #1: delete tuple on replica +-- Case #1: replace tuple on replica locally. 
-- box.space.test ~= nil -d = box.space.test:delete{1} -box.space.test:get(1) ~= nil +box.space.test:replace{1} --- case #2: delete tuple by net.box +-- Case #2: replace tuple on replica by net.box. test_run:cmd("switch default") test_run:cmd("set variable r_uri to 'replica.listen'") c = net_box.connect(r_uri) -d = c.space.test:delete{1} -c.space.test:get(1) ~= nil +d = c.space.test:replace{1} + +-- Resume replicaton +errinj.set('ERRINJ_RELAY_SEND_DELAY', false) --- check sync -errinj.set("ERRINJ_RELAY_TIMEOUT", 0) --- cleanup +-- Cleanup test_run:cmd("stop server replica") test_run:cmd("cleanup server replica") test_run:cleanup_cluster() diff --git a/test/replication/gc.result b/test/replication/gc.result index 83d0de293..ef6463d87 100644 --- a/test/replication/gc.result +++ b/test/replication/gc.result @@ -95,7 +95,7 @@ test_run:cmd("switch replica") fiber = require('fiber') --- ... -while box.space.test:count() < 200 do fiber.sleep(0.01) end +while box.space.test == nil or box.space.test:count() < 200 do fiber.sleep(0.01) end --- ... box.space.test:count() @@ -119,9 +119,9 @@ wait_gc(1) --- - true ... --- Make sure the replica will receive data it is subscribed --- to long enough for us to invoke garbage collection. -box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05) +-- Make sure the replica will not receive data until +-- we test garbage collection. +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true) --- - ok ... @@ -153,13 +153,12 @@ box.snapshot() --- - true ... -#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master') +while #fio.glob('./master/*.xlog') ~= 2 do fiber.sleep(0.01) end --- -- true ... --- Remove the timeout injection so that the replica catches +-- Resume replicaton so that the replica catches -- up quickly. -box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0) +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false) --- - ok ... @@ -188,9 +187,8 @@ wait_gc(1) --- - true ... 
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master') +while #fio.glob('./master/*.xlog') ~= 0 do fiber.sleep(0.01) end --- -- true ... -- -- Check that the master doesn't delete xlog files sent to the diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua index eed76850c..ec3bf6baa 100644 --- a/test/replication/gc.test.lua +++ b/test/replication/gc.test.lua @@ -52,7 +52,7 @@ test_run:cmd("start server replica") -- data from the master. Check it. test_run:cmd("switch replica") fiber = require('fiber') -while box.space.test:count() < 200 do fiber.sleep(0.01) end +while box.space.test == nil or box.space.test:count() < 200 do fiber.sleep(0.01) end box.space.test:count() test_run:cmd("switch default") @@ -61,9 +61,9 @@ test_run:cmd("switch default") wait_gc(1) #box.info.gc().checkpoints == 1 or box.info.gc() #fio.glob('./master/*.xlog') == 1 or fio.listdir('./master') --- Make sure the replica will receive data it is subscribed --- to long enough for us to invoke garbage collection. -box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05) +-- Make sure the replica will not receive data until +-- we test garbage collection. +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true) -- Send more data to the replica. -- Need to do 2 snapshots here, otherwise the replica would @@ -78,11 +78,11 @@ box.snapshot() -- xlogs needed by the replica. box.snapshot() #box.info.gc().checkpoints == 1 or box.info.gc() -#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master') +while #fio.glob('./master/*.xlog') ~= 2 do fiber.sleep(0.01) end --- Remove the timeout injection so that the replica catches +-- Resume replicaton so that the replica catches -- up quickly. -box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0) +box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false) -- Check that the replica received all data from the master. test_run:cmd("switch replica") @@ -94,7 +94,7 @@ test_run:cmd("switch default") -- from the old checkpoint. 
wait_gc(1) #box.info.gc().checkpoints == 1 or box.info.gc() -#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master') +while #fio.glob('./master/*.xlog') ~= 0 do fiber.sleep(0.01) end -- -- Check that the master doesn't delete xlog files sent to the -- replica until it receives a confirmation that the data has -- 2.18.0