[PATCH 2/4] test: errinj for pause relay_send
Sergei Voronezhskii
sergw at tarantool.org
Fri Oct 5 12:02:13 MSK 2018
Instead of using timeout we need just pause `relay_send`. Can't relay
on timeout because of various system load in parallel mode. Add new
errinj which checks boolean in loop and until it is not `True` do not
pass the method `relay_send` to the next statement.
Also here we change `delete` to `replace`. And lookup the xlog files
in loop with a little sleep, until the file count is not as expected.
Part of #2436, #3232
---
src/box/relay.cc | 7 +++++-
src/errinj.h | 1 +
test/replication/catch.result | 44 ++++++++++++++-------------------
test/replication/catch.test.lua | 36 +++++++++++++--------------
test/replication/gc.result | 18 ++++++--------
test/replication/gc.test.lua | 16 ++++++------
6 files changed, 58 insertions(+), 64 deletions(-)
diff --git a/src/box/relay.cc b/src/box/relay.cc
index c90383d4a..8618fa81a 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -622,12 +622,17 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
static void
relay_send(struct relay *relay, struct xrow_header *packet)
{
+ struct errinj *inj = errinj(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL);
+ while (inj->bparam) {
+ fiber_sleep(0.01);
+ inj = errinj(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL);
+ }
packet->sync = relay->sync;
relay->last_row_tm = ev_monotonic_now(loop());
coio_write_xrow(&relay->io, packet);
fiber_gc();
- struct errinj *inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE);
+ inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE);
if (inj != NULL && inj->dparam > 0)
fiber_sleep(inj->dparam);
}
diff --git a/src/errinj.h b/src/errinj.h
index 84a1fbb5e..bf6c15ba5 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -94,6 +94,7 @@ struct errinj {
_(ERRINJ_VY_GC, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_VY_LOG_FLUSH, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_VY_LOG_FLUSH_DELAY, ERRINJ_BOOL, {.bparam = false}) \
+ _(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \
_(ERRINJ_RELAY_REPORT_INTERVAL, ERRINJ_DOUBLE, {.dparam = 0}) \
_(ERRINJ_RELAY_FINAL_SLEEP, ERRINJ_BOOL, {.bparam = false}) \
diff --git a/test/replication/catch.result b/test/replication/catch.result
index e23f33cef..b4ddc5d51 100644
--- a/test/replication/catch.result
+++ b/test/replication/catch.result
@@ -35,7 +35,7 @@ test_run:cmd("switch default")
s = box.schema.space.create('test', {engine = engine});
---
...
--- vinyl does not support hash index
+-- Vinyl does not support hash index
index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') })
---
...
@@ -57,14 +57,13 @@ test_run:cmd("stop server replica")
---
- true
...
--- insert values on the master while replica is stopped and can't fetch them
-for i=1,100 do s:insert{i, 'this is test message12345'} end
+-- Insert values on the master while replica is stopped and can't fetch them.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', true)
---
+- ok
...
--- sleep after every tuple
-errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0)
+for i=1,100 do s:insert{i, 'this is test message12345'} end
---
-- ok
...
test_run:cmd("start server replica with args='0.01'")
---
@@ -75,28 +74,25 @@ test_run:cmd("switch replica")
- true
...
-- Check that replica doesn't enter read-write mode before
--- catching up with the master: to check that we inject sleep into
--- the master relay_send function and attempt a data modifying
--- statement in replica while it's still fetching data from the
--- master.
--- In the next two cases we try to delete a tuple while replica is
+-- catching up with the master: to check that we stop sending
+-- rows on the master in relay_send function and attempt a data
+-- modifying statement in replica while it's still fetching data
+-- from the master.
+--
+-- In the next two cases we try to replace a tuple while replica is
-- catching up with the master (local delete, remote delete) case
--
--- #1: delete tuple on replica
+-- Case #1: replace tuple on replica locally.
--
box.space.test ~= nil
---
- true
...
-d = box.space.test:delete{1}
+box.space.test:replace{1}
---
- error: Can't modify data because this instance is in read-only mode.
...
-box.space.test:get(1) ~= nil
----
-- true
-...
--- case #2: delete tuple by net.box
+-- Case #2: replace tuple on replica by net.box.
test_run:cmd("switch default")
---
- true
@@ -108,20 +104,16 @@ test_run:cmd("set variable r_uri to 'replica.listen'")
c = net_box.connect(r_uri)
---
...
-d = c.space.test:delete{1}
+d = c.space.test:replace{1}
---
- error: Can't modify data because this instance is in read-only mode.
...
-c.space.test:get(1) ~= nil
----
-- true
-...
--- check sync
-errinj.set("ERRINJ_RELAY_TIMEOUT", 0)
+-- Resume replicaton
+errinj.set('ERRINJ_RELAY_SEND_DELAY', false)
---
- ok
...
--- cleanup
+-- Cleanup
test_run:cmd("stop server replica")
---
- true
diff --git a/test/replication/catch.test.lua b/test/replication/catch.test.lua
index 217328772..5223e3a24 100644
--- a/test/replication/catch.test.lua
+++ b/test/replication/catch.test.lua
@@ -13,7 +13,7 @@ test_run:cmd("switch replica")
test_run:cmd("switch default")
s = box.schema.space.create('test', {engine = engine});
--- vinyl does not support hash index
+-- Vinyl does not support hash index
index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') })
test_run:cmd("switch replica")
@@ -22,41 +22,39 @@ while box.space.test == nil do fiber.sleep(0.01) end
test_run:cmd("switch default")
test_run:cmd("stop server replica")
--- insert values on the master while replica is stopped and can't fetch them
+-- Insert values on the master while replica is stopped and can't fetch them.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', true)
for i=1,100 do s:insert{i, 'this is test message12345'} end
--- sleep after every tuple
-errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0)
-
test_run:cmd("start server replica with args='0.01'")
test_run:cmd("switch replica")
-- Check that replica doesn't enter read-write mode before
--- catching up with the master: to check that we inject sleep into
--- the master relay_send function and attempt a data modifying
--- statement in replica while it's still fetching data from the
--- master.
--- In the next two cases we try to delete a tuple while replica is
+-- catching up with the master: to check that we stop sending
+-- rows on the master in relay_send function and attempt a data
+-- modifying statement in replica while it's still fetching data
+-- from the master.
+--
+-- In the next two cases we try to replace a tuple while replica is
-- catching up with the master (local delete, remote delete) case
--
--- #1: delete tuple on replica
+-- Case #1: replace tuple on replica locally.
--
box.space.test ~= nil
-d = box.space.test:delete{1}
-box.space.test:get(1) ~= nil
+box.space.test:replace{1}
--- case #2: delete tuple by net.box
+-- Case #2: replace tuple on replica by net.box.
test_run:cmd("switch default")
test_run:cmd("set variable r_uri to 'replica.listen'")
c = net_box.connect(r_uri)
-d = c.space.test:delete{1}
-c.space.test:get(1) ~= nil
+d = c.space.test:replace{1}
+
+-- Resume replicaton
+errinj.set('ERRINJ_RELAY_SEND_DELAY', false)
--- check sync
-errinj.set("ERRINJ_RELAY_TIMEOUT", 0)
--- cleanup
+-- Cleanup
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
test_run:cleanup_cluster()
diff --git a/test/replication/gc.result b/test/replication/gc.result
index 83d0de293..ef6463d87 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -95,7 +95,7 @@ test_run:cmd("switch replica")
fiber = require('fiber')
---
...
-while box.space.test:count() < 200 do fiber.sleep(0.01) end
+while box.space.test == nil or box.space.test:count() < 200 do fiber.sleep(0.01) end
---
...
box.space.test:count()
@@ -119,9 +119,9 @@ wait_gc(1)
---
- true
...
--- Make sure the replica will receive data it is subscribed
--- to long enough for us to invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+-- Make sure the replica will not receive data until
+-- we test garbage collection.
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
---
- ok
...
@@ -153,13 +153,12 @@ box.snapshot()
---
- true
...
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+while #fio.glob('./master/*.xlog') ~= 2 do fiber.sleep(0.01) end
---
-- true
...
--- Remove the timeout injection so that the replica catches
+-- Resume replicaton so that the replica catches
-- up quickly.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
---
- ok
...
@@ -188,9 +187,8 @@ wait_gc(1)
---
- true
...
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+while #fio.glob('./master/*.xlog') ~= 0 do fiber.sleep(0.01) end
---
-- true
...
--
-- Check that the master doesn't delete xlog files sent to the
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index eed76850c..ec3bf6baa 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -52,7 +52,7 @@ test_run:cmd("start server replica")
-- data from the master. Check it.
test_run:cmd("switch replica")
fiber = require('fiber')
-while box.space.test:count() < 200 do fiber.sleep(0.01) end
+while box.space.test == nil or box.space.test:count() < 200 do fiber.sleep(0.01) end
box.space.test:count()
test_run:cmd("switch default")
@@ -61,9 +61,9 @@ test_run:cmd("switch default")
wait_gc(1)
#box.info.gc().checkpoints == 1 or box.info.gc()
#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
--- Make sure the replica will receive data it is subscribed
--- to long enough for us to invoke garbage collection.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.05)
+-- Make sure the replica will not receive data until
+-- we test garbage collection.
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", true)
-- Send more data to the replica.
-- Need to do 2 snapshots here, otherwise the replica would
@@ -78,11 +78,11 @@ box.snapshot()
-- xlogs needed by the replica.
box.snapshot()
#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master')
+while #fio.glob('./master/*.xlog') ~= 2 do fiber.sleep(0.01) end
--- Remove the timeout injection so that the replica catches
+-- Resume replicaton so that the replica catches
-- up quickly.
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
+box.error.injection.set("ERRINJ_RELAY_SEND_DELAY", false)
-- Check that the replica received all data from the master.
test_run:cmd("switch replica")
@@ -94,7 +94,7 @@ test_run:cmd("switch default")
-- from the old checkpoint.
wait_gc(1)
#box.info.gc().checkpoints == 1 or box.info.gc()
-#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
+while #fio.glob('./master/*.xlog') ~= 0 do fiber.sleep(0.01) end
--
-- Check that the master doesn't delete xlog files sent to the
-- replica until it receives a confirmation that the data has
--
2.18.0
More information about the Tarantool-patches
mailing list