[Tarantool-patches] [PATCH v31 3/3] test: add gh-6036-qsync-order test
Cyrill Gorcunov
gorcunov at gmail.com
Wed Mar 2 23:27:11 MSK 2022
Test that promotion requests are handled only after the corresponding
write to the WAL completes, because we update in-memory data before the
write finishes.
Part-of #6036
Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
.../gh_6036_qsync_order_test.lua | 198 ++++++++++++++++++
test/replication-luatest/suite.ini | 1 +
2 files changed, 199 insertions(+)
create mode 100644 test/replication-luatest/gh_6036_qsync_order_test.lua
diff --git a/test/replication-luatest/gh_6036_qsync_order_test.lua b/test/replication-luatest/gh_6036_qsync_order_test.lua
new file mode 100644
index 000000000..c23c7a3a1
--- /dev/null
+++ b/test/replication-luatest/gh_6036_qsync_order_test.lua
@@ -0,0 +1,198 @@
+local t = require('luatest')
+local cluster = require('test.luatest_helpers.cluster')
+local server = require('test.luatest_helpers.server')
+local fiber = require('fiber')
+
+local g = t.group('gh-6036')
+
+g.before_all(function(cg)
+ cg.cluster = cluster:new({})
+
+ cg.box_cfg = {
+ replication = {
+ server.build_instance_uri('r1'),
+ server.build_instance_uri('r2'),
+ },
+ replication_timeout = 0.1,
+ replication_connect_quorum = 1,
+ election_mode = 'manual',
+ election_timeout = 0.1,
+ replication_synchro_quorum = 1,
+ replication_synchro_timeout = 0.1,
+ log_level = 6,
+ }
+
+ cg.r1 = cg.cluster:build_server({ alias = 'r1', box_cfg = cg.box_cfg })
+ cg.r2 = cg.cluster:build_server({ alias = 'r2', box_cfg = cg.box_cfg })
+
+ cg.cluster:add_server(cg.r1)
+ cg.cluster:add_server(cg.r2)
+ cg.cluster:start()
+end)
+
+g.after_all(function(cg)
+ cg.cluster:drop()
+ cg.cluster.servers = nil
+end)
+
+local function update_replication(...)
+ return (box.cfg{ replication = { ... } })
+end
+
+--
+-- The test requires a 3rd replica to join the cluster.
+g.before_test("test_qsync_order", function(cg)
+ cg.box_cfg.replication[3] = server.build_instance_uri("r3")
+ cg.r3 = cg.cluster:build_server({ alias = 'r3', box_cfg = cg.box_cfg })
+ cg.cluster:add_server(cg.r3)
+ cg.r3:start()
+ cg.r1:exec(update_replication, cg.box_cfg.replication)
+ cg.r2:exec(update_replication, cg.box_cfg.replication)
+end)
+
+g.test_qsync_order = function(cg)
+ cg.cluster:wait_fullmesh()
+
+ --
+ -- Create a synchro space on the r1 node and make
+ -- sure the write is processed just fine.
+ cg.r1:exec(function()
+ box.ctl.promote()
+ box.ctl.wait_rw()
+ local s = box.schema.create_space('test', {is_sync = true})
+ s:create_index('pk')
+ s:insert{1}
+ end)
+
+ local vclock = cg.r1:get_vclock()
+ vclock[0] = nil
+ cg.r2:wait_vclock(vclock)
+ cg.r3:wait_vclock(vclock)
+
+ t.assert_equals(cg.r1:eval("return box.space.test:select()"), {{1}})
+ t.assert_equals(cg.r2:eval("return box.space.test:select()"), {{1}})
+ t.assert_equals(cg.r3:eval("return box.space.test:select()"), {{1}})
+
+ --
+ -- Drop connection between r1 and r2.
+ cg.r1:exec(update_replication, {
+ server.build_instance_uri("r1"),
+ server.build_instance_uri("r3"),
+ })
+
+ --
+ -- Drop connection between r2 and r1.
+ cg.r2:exec(update_replication, {
+ server.build_instance_uri("r2"),
+ server.build_instance_uri("r3"),
+ })
+
+ --
+ -- Here we have the following scheme
+ --
+ -- r3 (WAL delay)
+ -- / \
+ -- r1 r2
+ --
+
+ --
+ -- Initiate a disk delay in a slightly tricky way: the next
+ -- write will fall asleep forever.
+ cg.r3:exec(function()
+ box.error.injection.set('ERRINJ_WAL_DELAY', true)
+ end)
+
+ --
+ -- Make r2 the new leader and start writing data; the PROMOTE
+ -- request gets queued on r3 and is not yet processed, while
+ -- the INSERT won't complete either, waiting for the PROMOTE
+ -- completion first. Note that we query r3 as well just to be
+ -- sure the PROMOTE has reached it, via the queue state test.
+ cg.r2:exec(function()
+ box.ctl.promote()
+ box.ctl.wait_rw()
+ end)
+ t.helpers.retrying({}, function()
+ assert(cg.r3:exec(function()
+ return box.info.synchro.queue.busy == true
+ end))
+ end)
+ cg.r2:exec(function()
+ box.space.test:insert{2}
+ end)
+
+ --
+ -- The r1 node has no clue that there is a new leader and
+ -- continues writing data with an obsolete term. Since r3 is
+ -- delayed now, the INSERT won't proceed yet but gets queued.
+ cg.r1:exec(function()
+ box.space.test:insert{3}
+ end)
+
+ --
+ -- Finally enable r3 back. Make sure the data from the new r2
+ -- leader gets written while the old leader's data is ignored.
+ cg.r3:exec(function()
+ box.error.injection.set('ERRINJ_WAL_DELAY', false)
+ end)
+ t.helpers.retrying({}, function()
+ assert(cg.r3:exec(function()
+ return box.space.test:get{2} ~= nil
+ end))
+ end)
+
+ t.assert_equals(cg.r3:eval("return box.space.test:select()"), {{1},{2}})
+end
+
+--
+-- Drop the r3 replica, since it is no longer needed for this test.
+g.after_test("test_qsync_order", function(cg)
+ cg.box_cfg.replication[3] = nil
+ cg.r1:exec(update_replication, cg.box_cfg.replication)
+ cg.r2:exec(update_replication, cg.box_cfg.replication)
+ cg.r3:stop()
+ cg.r3:cleanup()
+ cg.r3 = nil
+end)
+
+g.test_promote_order = function(cg)
+ --
+ -- Make sure that while we're processing PROMOTE no other
+ -- records sneak in via the applier code from other replicas.
+ -- For this sake, initiate voting and stop inside the WAL
+ -- thread just before the PROMOTE gets written. Another replica
+ -- then sends us a new record and it should be dropped.
+ cg.r1:exec(function()
+ box.ctl.promote()
+ box.ctl.wait_rw()
+ end)
+ local vclock = cg.r1:get_vclock()
+ vclock[0] = nil
+ cg.r2:wait_vclock(vclock)
+
+ --
+ -- Drop connection between r1 and the rest of the cluster.
+ -- Otherwise r1 might become Raft follower before attempting
+ -- insert{4}.
+ cg.r1:exec(function() box.cfg{replication=""} end)
+ cg.r2:exec(function()
+ box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 2)
+ require('fiber').create(function() box.ctl.promote() end)
+ end)
+ t.helpers.retrying({}, function()
+ t.assert(cg.r2:exec(function()
+ return box.info.synchro.queue.busy
+ end))
+ end)
+ t.assert(cg.r1:exec(function() return box.info.ro == false end))
+ cg.r1:exec(function()
+ box.space.test:insert{4}
+ end)
+ cg.r2:exec(function()
+ assert(box.info.synchro.queue.busy == true)
+ box.error.injection.set('ERRINJ_WAL_DELAY', false)
+ box.ctl.wait_rw()
+ end)
+
+ t.assert_equals(cg.r2:eval("return box.space.test:select()"), {{1},{2}})
+end
diff --git a/test/replication-luatest/suite.ini b/test/replication-luatest/suite.ini
index 374f1b87a..07ec93a52 100644
--- a/test/replication-luatest/suite.ini
+++ b/test/replication-luatest/suite.ini
@@ -2,3 +2,4 @@
core = luatest
description = replication luatests
is_parallel = True
+release_disabled = gh_6036_qsync_order_test.lua
--
2.35.1
More information about the Tarantool-patches
mailing list