* [Tarantool-patches] [PATCH v1 1/2] test: cleanup replication/gc
2019-10-29 10:20 [Tarantool-patches] [PATCH v1 0/2] Fix 1.10 tests at replication suite Alexander V. Tikhonov
@ 2019-10-29 10:21 ` Alexander V. Tikhonov
2019-10-29 10:21 ` [Tarantool-patches] [PATCH v1 2/2] test: errinj for pause relay_send Alexander V. Tikhonov
2019-10-30 5:09 ` [Tarantool-patches] [PATCH v1 0/2] Fix 1.10 tests at replication suite Kirill Yukhin
2 siblings, 0 replies; 4+ messages in thread
From: Alexander V. Tikhonov @ 2019-10-29 10:21 UTC (permalink / raw)
To: Kirill Yukhin; +Cc: tarantool-patches, Vladimir Davydov, tarantool-patches
From: Vladimir Davydov <vdavydov.dev@gmail.com>
- Before checking that old WAL files have been removed, wait for the
garbage collection to remove them to avoid a spurious test failure.
Currently, the test waits until old checkpoints are removed, but the
garbage collector can now remove checkpoints while keeping WAL files.
This is a follow-up for commit 9c5d851d7830 ("replication: remove old
snapshot files not needed by replicas").
- Remove a few pointless box.info.gc().checkpoints checks.
- Use test_run.wait_cond for waiting instead of while-do-sleep loops.
(cherry picked from commit 84c7d0f723655b445ac04ed52682270f06a59f50)
---
test/replication/gc.result | 44 ++++++++++--------------------------
test/replication/gc.test.lua | 23 ++++++-------------
2 files changed, 19 insertions(+), 48 deletions(-)
diff --git a/test/replication/gc.result b/test/replication/gc.result
index 5922c7d0e..5b44284bf 100644
--- a/test/replication/gc.result
+++ b/test/replication/gc.result
@@ -27,9 +27,6 @@ default_checkpoint_count = box.cfg.checkpoint_count
box.cfg{checkpoint_count = 1}
---
...
-function wait_gc(n) while #box.info.gc().checkpoints > n do fiber.sleep(0.01) end end
----
-...
-- Grant permissions needed for replication.
box.schema.user.grant('guest', 'replication')
---
@@ -99,11 +96,9 @@ test_run:cmd("switch replica")
---
- true
...
-fiber = require('fiber')
----
-...
-while box.space.test:count() < 200 do fiber.sleep(0.01) end
+test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
---
+- true
...
box.space.test:count()
---
@@ -115,8 +110,9 @@ test_run:cmd("switch default")
...
-- Check that garbage collection removed the snapshot once
-- the replica released the corresponding checkpoint.
-wait_gc(1)
+test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10)
---
+- true
...
#box.info.gc().checkpoints == 1 or box.info.gc()
---
@@ -175,8 +171,9 @@ test_run:cmd("switch replica")
---
- true
...
-while box.space.test:count() < 300 do fiber.sleep(0.01) end
+test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
---
+- true
...
box.space.test:count()
---
@@ -188,10 +185,7 @@ test_run:cmd("switch default")
...
-- Now garbage collection should resume and delete files left
-- from the old checkpoint.
-wait_gc(1)
----
-...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
---
- true
...
@@ -261,11 +255,9 @@ test_run:cmd("switch replica")
---
- true
...
-fiber = require('fiber')
----
-...
-while box.space.test:count() < 310 do fiber.sleep(0.01) end
+test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
---
+- true
...
box.space.test:count()
---
@@ -276,10 +268,7 @@ test_run:cmd("switch default")
- true
...
-- Now it's safe to drop the old xlog.
-wait_gc(1)
----
-...
-#box.info.gc().checkpoints == 1 or box.info.gc()
+test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
---
- true
...
@@ -334,10 +323,6 @@ xlog_count == 3 or xlog_count == 2 or fio.listdir('./master')
test_run:cleanup_cluster()
---
...
-#box.info.gc().checkpoints == 1 or box.info.gc()
----
-- true
-...
#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
---
- true
@@ -381,9 +366,6 @@ replica_set.drop_all(test_run)
fio = require('fio')
---
...
-fiber = require('fiber')
----
-...
-- Start a replica and set it up as a master for this instance.
test_run:cmd("start server replica")
---
@@ -442,11 +424,9 @@ box.snapshot()
---
- ok
...
-t = fiber.time()
----
-...
-while #fio.glob('./master/*xlog') > 0 and fiber.time() - t < 10 do fiber.sleep(0.01) end
+test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
---
+- true
...
#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
---
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua
index f19fd2e7c..fee1fe968 100644
--- a/test/replication/gc.test.lua
+++ b/test/replication/gc.test.lua
@@ -11,8 +11,6 @@ test_run:cmd("create server replica with rpl_master=default, script='replication
default_checkpoint_count = box.cfg.checkpoint_count
box.cfg{checkpoint_count = 1}
-function wait_gc(n) while #box.info.gc().checkpoints > n do fiber.sleep(0.01) end end
-
-- Grant permissions needed for replication.
box.schema.user.grant('guest', 'replication')
@@ -53,14 +51,13 @@ test_run:cmd("start server replica")
-- bootstrapped from, the replica should still receive all
-- data from the master. Check it.
test_run:cmd("switch replica")
-fiber = require('fiber')
-while box.space.test:count() < 200 do fiber.sleep(0.01) end
+test_run:wait_cond(function() return box.space.test:count() == 200 end, 10)
box.space.test:count()
test_run:cmd("switch default")
-- Check that garbage collection removed the snapshot once
-- the replica released the corresponding checkpoint.
-wait_gc(1)
+test_run:wait_cond(function() return #box.info.gc().checkpoints == 1 end, 10)
#box.info.gc().checkpoints == 1 or box.info.gc()
#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
-- Make sure the replica will receive data it is subscribed
@@ -88,14 +85,13 @@ box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0)
-- Check that the replica received all data from the master.
test_run:cmd("switch replica")
-while box.space.test:count() < 300 do fiber.sleep(0.01) end
+test_run:wait_cond(function() return box.space.test:count() == 300 end, 10)
box.space.test:count()
test_run:cmd("switch default")
-- Now garbage collection should resume and delete files left
-- from the old checkpoint.
-wait_gc(1)
-#box.info.gc().checkpoints == 1 or box.info.gc()
+test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
--
-- Check that the master doesn't delete xlog files sent to the
@@ -124,13 +120,11 @@ box.cfg{replication = {}}
test_run:cmd("restart server replica")
-- Wait for the replica to catch up.
test_run:cmd("switch replica")
-fiber = require('fiber')
-while box.space.test:count() < 310 do fiber.sleep(0.01) end
+test_run:wait_cond(function() return box.space.test:count() == 310 end, 10)
box.space.test:count()
test_run:cmd("switch default")
-- Now it's safe to drop the old xlog.
-wait_gc(1)
-#box.info.gc().checkpoints == 1 or box.info.gc()
+test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 1 end, 10)
#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
-- Stop the replica.
test_run:cmd("stop server replica")
@@ -155,7 +149,6 @@ xlog_count == 3 or xlog_count == 2 or fio.listdir('./master')
-- The xlog should only be deleted after the replica
-- is unregistered.
test_run:cleanup_cluster()
-#box.info.gc().checkpoints == 1 or box.info.gc()
#fio.glob('./master/*.xlog') == 1 or fio.listdir('./master')
--
-- Test that concurrent invocation of the garbage collector works fine.
@@ -179,7 +172,6 @@ replica_set.drop_all(test_run)
-- a replication master (gh-3546).
--
fio = require('fio')
-fiber = require('fiber')
-- Start a replica and set it up as a master for this instance.
test_run:cmd("start server replica")
@@ -202,8 +194,7 @@ box.snapshot()
-- all xlog files are removed.
test_run:cleanup_cluster()
box.snapshot()
-t = fiber.time()
-while #fio.glob('./master/*xlog') > 0 and fiber.time() - t < 10 do fiber.sleep(0.01) end
+test_run:wait_cond(function() return #fio.glob('./master/*.xlog') == 0 end, 10)
#fio.glob('./master/*.xlog') == 0 or fio.listdir('./master')
-- Restore the config.
--
2.17.1
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Tarantool-patches] [PATCH v1 2/2] test: errinj for pause relay_send
2019-10-29 10:20 [Tarantool-patches] [PATCH v1 0/2] Fix 1.10 tests at replication suite Alexander V. Tikhonov
2019-10-29 10:21 ` [Tarantool-patches] [PATCH v1 1/2] test: cleanup replication/gc Alexander V. Tikhonov
@ 2019-10-29 10:21 ` Alexander V. Tikhonov
2019-10-30 5:09 ` [Tarantool-patches] [PATCH v1 0/2] Fix 1.10 tests at replication suite Kirill Yukhin
2 siblings, 0 replies; 4+ messages in thread
From: Alexander V. Tikhonov @ 2019-10-29 10:21 UTC (permalink / raw)
To: Kirill Yukhin; +Cc: Sergei Voronezhskii, tarantool-patches, tarantool-patches
From: Sergei Voronezhskii <sergw@tarantool.org>
Instead of using a timeout, we just need to pause `relay_send`. We
can't rely on a timeout because the system load varies in parallel
mode. Add a new errinj that checks a boolean flag in a loop: while the
flag is `true`, `relay_send` does not proceed to the next statement.
To check the read-only mode, we need to attempt a tuple modification.
Calling the `replace` method is enough, instead of calling `delete`
and then uselessly verifying with the `get` method that the tuple was
not deleted.
Also, look up the xlog files in a loop with a short sleep until the
file count is as expected.
Update box/errinj.result because new errinj was added.
Part of #2436, #3232
(cherry picked from commit 1c34c91fa725ab254619d23c2f1d99f1e8269324)
---
src/box/relay.cc | 6 ++++-
src/errinj.h | 1 +
test/box/errinj.result | 38 +++++++++++++-------------
test/replication/catch.result | 48 +++++++++++++++------------------
test/replication/catch.test.lua | 41 ++++++++++++++--------------
5 files changed, 67 insertions(+), 67 deletions(-)
diff --git a/src/box/relay.cc b/src/box/relay.cc
index cbc1a918b..fe9664131 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -693,12 +693,16 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
static void
relay_send(struct relay *relay, struct xrow_header *packet)
{
+ struct errinj *inj = errinj(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL);
+ while (inj != NULL && inj->bparam)
+ fiber_sleep(0.01);
+
packet->sync = relay->sync;
relay->last_row_time = ev_monotonic_now(loop());
coio_write_xrow(&relay->io, packet);
fiber_gc();
- struct errinj *inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE);
+ inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE);
if (inj != NULL && inj->dparam > 0)
fiber_sleep(inj->dparam);
}
diff --git a/src/errinj.h b/src/errinj.h
index 57dbafcb7..3eb86c949 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -96,6 +96,7 @@ struct errinj {
_(ERRINJ_VY_GC, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_VY_LOG_FLUSH, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_VY_LOG_FLUSH_DELAY, ERRINJ_BOOL, {.bparam = false}) \
+ _(ERRINJ_RELAY_SEND_DELAY, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \
_(ERRINJ_RELAY_REPORT_INTERVAL, ERRINJ_DOUBLE, {.dparam = 0}) \
_(ERRINJ_RELAY_FINAL_SLEEP, ERRINJ_BOOL, {.bparam = false}) \
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 9f6aa5760..82a1bb377 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -46,10 +46,12 @@ errinj.info()
state: false
ERRINJ_WAL_FALLOCATE:
state: 0
- ERRINJ_SNAP_COMMIT_DELAY:
+ ERRINJ_LOG_ROTATE:
state: false
ERRINJ_VY_DUMP_DELAY:
state: false
+ ERRINJ_WAL_BREAK_LSN:
+ state: -1
ERRINJ_TUPLE_ALLOC:
state: false
ERRINJ_VY_RUN_WRITE_DELAY:
@@ -58,25 +60,25 @@ errinj.info()
state: false
ERRINJ_RELAY_REPORT_INTERVAL:
state: 0
- ERRINJ_WAL_BREAK_LSN:
+ ERRINJ_RELAY_BREAK_LSN:
state: -1
ERRINJ_VY_READ_PAGE_TIMEOUT:
state: 0
ERRINJ_XLOG_META:
state: false
- ERRINJ_RELAY_BREAK_LSN:
- state: -1
ERRINJ_VY_INDEX_FILE_RENAME:
state: false
- ERRINJ_WAL_WRITE_DISK:
- state: false
ERRINJ_VY_RUN_FILE_RENAME:
state: false
+ ERRINJ_WAL_WRITE_DISK:
+ state: false
ERRINJ_VY_LOG_FILE_RENAME:
state: false
+ ERRINJ_HTTP_RESPONSE_ADD_WAIT:
+ state: false
ERRINJ_VY_RUN_WRITE:
state: false
- ERRINJ_HTTP_RESPONSE_ADD_WAIT:
+ ERRINJ_SNAP_COMMIT_DELAY:
state: false
ERRINJ_VY_LOG_FLUSH_DELAY:
state: false
@@ -90,34 +92,34 @@ errinj.info()
state: false
ERRINJ_WAL_ROTATE:
state: false
- ERRINJ_LOG_ROTATE:
- state: false
- ERRINJ_VY_POINT_ITER_WAIT:
- state: false
+ ERRINJ_BUILD_INDEX:
+ state: -1
ERRINJ_RELAY_EXIT_DELAY:
state: 0
+ ERRINJ_VY_POINT_ITER_WAIT:
+ state: false
ERRINJ_IPROTO_TX_DELAY:
state: false
- ERRINJ_BUILD_INDEX:
- state: -1
ERRINJ_XLOG_READ:
state: -1
+ ERRINJ_TUPLE_FIELD:
+ state: false
ERRINJ_XLOG_GARBAGE:
state: false
- ERRINJ_TUPLE_FIELD:
+ ERRINJ_INDEX_ALLOC:
state: false
ERRINJ_VY_READ_PAGE_DELAY:
state: false
ERRINJ_TESTING:
state: false
- ERRINJ_INDEX_ALLOC:
- state: false
+ ERRINJ_RELAY_TIMEOUT:
+ state: 0
ERRINJ_VY_SQUASH_TIMEOUT:
state: 0
ERRINJ_VY_LOG_FLUSH:
state: false
- ERRINJ_RELAY_TIMEOUT:
- state: 0
+ ERRINJ_RELAY_SEND_DELAY:
+ state: false
...
errinj.set("some-injection", true)
---
diff --git a/test/replication/catch.result b/test/replication/catch.result
index 663bdc758..e1b2995ec 100644
--- a/test/replication/catch.result
+++ b/test/replication/catch.result
@@ -35,7 +35,7 @@ test_run:cmd("switch default")
s = box.schema.space.create('test', {engine = engine});
---
...
--- vinyl does not support hash index
+-- Vinyl does not support hash index.
index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') })
---
...
@@ -57,14 +57,14 @@ test_run:cmd("stop server replica")
---
- true
...
--- insert values on the master while replica is stopped and can't fetch them
-for i=1,100 do s:insert{i, 'this is test message12345'} end
+-- Insert values on the master while replica is stopped and can't
+-- fetch them.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', true)
---
+- ok
...
--- sleep after every tuple
-errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0)
+for i = 1, 100 do s:insert{i, 'this is test message12345'} end
---
-- ok
...
test_run:cmd("start server replica with args='0.01'")
---
@@ -75,28 +75,26 @@ test_run:cmd("switch replica")
- true
...
-- Check that replica doesn't enter read-write mode before
--- catching up with the master: to check that we inject sleep into
--- the master relay_send function and attempt a data modifying
--- statement in replica while it's still fetching data from the
--- master.
--- In the next two cases we try to delete a tuple while replica is
--- catching up with the master (local delete, remote delete) case
+-- catching up with the master: to check that we stop sending
+-- rows on the master in relay_send function and attempt a data
+-- modifying statement in replica while it's still fetching data
+-- from the master.
+--
+-- In the next two cases we try to replace a tuple while replica
+-- is catching up with the master (local replace, remote replace)
+-- case.
--
--- #1: delete tuple on replica
+-- Case #1: replace tuple on replica locally.
--
box.space.test ~= nil
---
- true
...
-d = box.space.test:delete{1}
+box.space.test:replace{1}
---
- error: Can't modify data because this instance is in read-only mode.
...
-box.space.test:get(1) ~= nil
----
-- true
-...
--- case #2: delete tuple by net.box
+-- Case #2: replace tuple on replica by net.box.
test_run:cmd("switch default")
---
- true
@@ -108,20 +106,16 @@ test_run:cmd("set variable r_uri to 'replica.listen'")
c = net_box.connect(r_uri)
---
...
-d = c.space.test:delete{1}
+d = c.space.test:replace{1}
---
- error: Can't modify data because this instance is in read-only mode.
...
-c.space.test:get(1) ~= nil
----
-- true
-...
--- check sync
-errinj.set("ERRINJ_RELAY_TIMEOUT", 0)
+-- Resume replication.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', false)
---
- ok
...
--- cleanup
+-- Cleanup.
test_run:cmd("stop server replica")
---
- true
diff --git a/test/replication/catch.test.lua b/test/replication/catch.test.lua
index 6773675d0..d5de88642 100644
--- a/test/replication/catch.test.lua
+++ b/test/replication/catch.test.lua
@@ -13,7 +13,7 @@ test_run:cmd("switch replica")
test_run:cmd("switch default")
s = box.schema.space.create('test', {engine = engine});
--- vinyl does not support hash index
+-- Vinyl does not support hash index.
index = s:create_index('primary', {type = (engine == 'vinyl' and 'tree' or 'hash') })
test_run:cmd("switch replica")
@@ -22,41 +22,40 @@ while box.space.test == nil do fiber.sleep(0.01) end
test_run:cmd("switch default")
test_run:cmd("stop server replica")
--- insert values on the master while replica is stopped and can't fetch them
-for i=1,100 do s:insert{i, 'this is test message12345'} end
-
--- sleep after every tuple
-errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0)
+-- Insert values on the master while replica is stopped and can't
+-- fetch them.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', true)
+for i = 1, 100 do s:insert{i, 'this is test message12345'} end
test_run:cmd("start server replica with args='0.01'")
test_run:cmd("switch replica")
-- Check that replica doesn't enter read-write mode before
--- catching up with the master: to check that we inject sleep into
--- the master relay_send function and attempt a data modifying
--- statement in replica while it's still fetching data from the
--- master.
--- In the next two cases we try to delete a tuple while replica is
--- catching up with the master (local delete, remote delete) case
+-- catching up with the master: to check that we stop sending
+-- rows on the master in relay_send function and attempt a data
+-- modifying statement in replica while it's still fetching data
+-- from the master.
+--
+-- In the next two cases we try to replace a tuple while replica
+-- is catching up with the master (local replace, remote replace)
+-- case.
--
--- #1: delete tuple on replica
+-- Case #1: replace tuple on replica locally.
--
box.space.test ~= nil
-d = box.space.test:delete{1}
-box.space.test:get(1) ~= nil
+box.space.test:replace{1}
--- case #2: delete tuple by net.box
+-- Case #2: replace tuple on replica by net.box.
test_run:cmd("switch default")
test_run:cmd("set variable r_uri to 'replica.listen'")
c = net_box.connect(r_uri)
-d = c.space.test:delete{1}
-c.space.test:get(1) ~= nil
+d = c.space.test:replace{1}
--- check sync
-errinj.set("ERRINJ_RELAY_TIMEOUT", 0)
+-- Resume replication.
+errinj.set('ERRINJ_RELAY_SEND_DELAY', false)
--- cleanup
+-- Cleanup.
test_run:cmd("stop server replica")
test_run:cmd("cleanup server replica")
test_run:cmd("delete server replica")
--
2.17.1
^ permalink raw reply [flat|nested] 4+ messages in thread