[Tarantool-patches] [PATCH v17 5/5] test: add gh-6036-term-order test
Cyrill Gorcunov
gorcunov at gmail.com
Wed Sep 22 16:05:35 MSK 2021
Test that promotion requests are applied only after the corresponding
WAL write completes: the in-memory data is updated before the write
finishes, so the terms must stay locked until the write is done.
Part-of #6036
Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
test/replication/gh-6036-order-master.lua | 1 +
test/replication/gh-6036-order-node.lua | 60 ++++++
test/replication/gh-6036-order-replica1.lua | 1 +
test/replication/gh-6036-order-replica2.lua | 1 +
test/replication/gh-6036-term-order.result | 216 +++++++++++++++++++
test/replication/gh-6036-term-order.test.lua | 94 ++++++++
test/replication/suite.cfg | 1 +
test/replication/suite.ini | 2 +-
8 files changed, 375 insertions(+), 1 deletion(-)
create mode 120000 test/replication/gh-6036-order-master.lua
create mode 100644 test/replication/gh-6036-order-node.lua
create mode 120000 test/replication/gh-6036-order-replica1.lua
create mode 120000 test/replication/gh-6036-order-replica2.lua
create mode 100644 test/replication/gh-6036-term-order.result
create mode 100644 test/replication/gh-6036-term-order.test.lua
diff --git a/test/replication/gh-6036-order-master.lua b/test/replication/gh-6036-order-master.lua
new file mode 120000
index 000000000..82a6073a1
--- /dev/null
+++ b/test/replication/gh-6036-order-master.lua
@@ -0,0 +1 @@
+gh-6036-order-node.lua
\ No newline at end of file
diff --git a/test/replication/gh-6036-order-node.lua b/test/replication/gh-6036-order-node.lua
new file mode 100644
index 000000000..b22a7cb4c
--- /dev/null
+++ b/test/replication/gh-6036-order-node.lua
@@ -0,0 +1,60 @@
+local INSTANCE_ID = string.match(arg[0], "gh%-6036%-order%-(.+)%.lua")
+
+local function unix_socket(name)
+ return "unix/:./" .. name .. '.sock';
+end
+
+require('console').listen(os.getenv('ADMIN'))
+
+if INSTANCE_ID == "master" then
+ box.cfg({
+ listen = unix_socket(INSTANCE_ID),
+ replication = {
+ unix_socket(INSTANCE_ID),
+ unix_socket("replica1"),
+ unix_socket("replica2"),
+ },
+ replication_connect_quorum = 1,
+ replication_synchro_quorum = 1,
+ replication_synchro_timeout = 10000,
+ replication_sync_timeout = 5,
+ read_only = false,
+ election_mode = "off",
+ })
+elseif INSTANCE_ID == "replica1" then
+ box.cfg({
+ listen = unix_socket(INSTANCE_ID),
+ replication = {
+ unix_socket("master"),
+ unix_socket(INSTANCE_ID),
+ unix_socket("replica2"),
+ },
+ replication_connect_quorum = 1,
+ replication_synchro_quorum = 1,
+ replication_synchro_timeout = 10000,
+ replication_sync_timeout = 5,
+ read_only = false,
+ election_mode = "off",
+ })
+else
+ assert(INSTANCE_ID == "replica2")
+ box.cfg({
+ listen = unix_socket(INSTANCE_ID),
+ replication = {
+ unix_socket("master"),
+ unix_socket("replica1"),
+ unix_socket(INSTANCE_ID),
+ },
+ replication_connect_quorum = 1,
+ replication_synchro_quorum = 1,
+ replication_synchro_timeout = 10000,
+ replication_sync_timeout = 5,
+ read_only = true,
+ election_mode = "off",
+ })
+end
+
+--box.ctl.wait_rw() -- NOTE(review): dead commented-out call; drop or explain why it is disabled
+box.once("bootstrap", function()
+ box.schema.user.grant('guest', 'super')
+end)
diff --git a/test/replication/gh-6036-order-replica1.lua b/test/replication/gh-6036-order-replica1.lua
new file mode 120000
index 000000000..82a6073a1
--- /dev/null
+++ b/test/replication/gh-6036-order-replica1.lua
@@ -0,0 +1 @@
+gh-6036-order-node.lua
\ No newline at end of file
diff --git a/test/replication/gh-6036-order-replica2.lua b/test/replication/gh-6036-order-replica2.lua
new file mode 120000
index 000000000..82a6073a1
--- /dev/null
+++ b/test/replication/gh-6036-order-replica2.lua
@@ -0,0 +1 @@
+gh-6036-order-node.lua
\ No newline at end of file
diff --git a/test/replication/gh-6036-term-order.result b/test/replication/gh-6036-term-order.result
new file mode 100644
index 000000000..6b19fc2c8
--- /dev/null
+++ b/test/replication/gh-6036-term-order.result
@@ -0,0 +1,216 @@
+-- test-run result file version 2
+--
+-- gh-6036: verify that terms are locked when we're inside journal
+-- write routine, because parallel appliers may ignore the fact that
+-- the term is updated already but not yet written leading to data
+-- inconsistency.
+--
+test_run = require('test_run').new()
+ | ---
+ | ...
+
+test_run:cmd('create server master with script="replication/gh-6036-order-master.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('create server replica1 with script="replication/gh-6036-order-replica1.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('create server replica2 with script="replication/gh-6036-order-replica2.lua"')
+ | ---
+ | - true
+ | ...
+
+test_run:cmd('start server master with wait=False')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica1 with wait=False')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica2 with wait=False')
+ | ---
+ | - true
+ | ...
+
+test_run:wait_fullmesh({"master", "replica1", "replica2"})
+ | ---
+ | ...
+
+test_run:switch("master")
+ | ---
+ | - true
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+
+test_run:switch("replica1")
+ | ---
+ | - true
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+
+test_run:switch("replica2")
+ | ---
+ | - true
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+
+--
+-- Drop connection between master and replica1.
+test_run:switch("master")
+ | ---
+ | - true
+ | ...
+box.cfg({ \
+ replication = { \
+ "unix/:./master.sock", \
+ "unix/:./replica2.sock", \
+ }, \
+})
+ | ---
+ | ...
+test_run:switch("replica1")
+ | ---
+ | - true
+ | ...
+box.cfg({ \
+ replication = { \
+ "unix/:./replica1.sock", \
+ "unix/:./replica2.sock", \
+ }, \
+})
+ | ---
+ | ...
+
+--
+-- Initiate disk delay and remember the max term seen so far.
+test_run:switch("replica2")
+ | ---
+ | - true
+ | ...
+box.error.injection.set('ERRINJ_WAL_DELAY', true)
+ | ---
+ | - ok
+ | ...
+term_max_replica2 = box.info.synchro.promote.term_max
+ | ---
+ | ...
+
+--
+-- Ping-pong the promote action between master and
+-- replica1 nodes, the term updates get queued on
+-- replica2 because of disk being disabled.
+test_run:switch("master")
+ | ---
+ | - true
+ | ...
+box.ctl.promote()
+ | ---
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+
+test_run:switch("replica1")
+ | ---
+ | - true
+ | ...
+box.ctl.promote()
+ | ---
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+
+test_run:switch("master")
+ | ---
+ | - true
+ | ...
+box.ctl.promote()
+ | ---
+ | ...
+
+--
+-- Since we're guarding max promote term make sure that
+-- 1) The max term has not yet been updated because WAL
+-- is in sleeping state.
+-- 2) Max term on master and replica1 nodes are greater
+-- than we have now because terms update is locked.
+-- 3) Once WAL is unlocked we make sure that terms has
+-- reached us.
+test_run:switch("replica2")
+ | ---
+ | - true
+ | ...
+assert(term_max_replica2 == box.info.synchro.promote.term_max)
+ | ---
+ | - true
+ | ...
+
+term_max_master = test_run:eval('master', 'box.info.synchro.promote.term_max')[1]
+ | ---
+ | ...
+term_max_replica1 = test_run:eval('replica1', 'box.info.synchro.promote.term_max')[1]
+ | ---
+ | ...
+assert(term_max_master > term_max_replica2)
+ | ---
+ | - true
+ | ...
+assert(term_max_replica1 > term_max_replica2)
+ | ---
+ | - true
+ | ...
+term_max_wait4 = term_max_master
+ | ---
+ | ...
+if term_max_wait4 < term_max_replica1 then term_max_wait4 = term_max_replica1 end
+ | ---
+ | ...
+
+box.error.injection.set('ERRINJ_WAL_DELAY', false)
+ | ---
+ | - ok
+ | ...
+test_run:wait_cond(function() return box.info.synchro.promote.term_max == term_max_wait4 end)
+ | ---
+ | - true
+ | ...
+
+test_run:switch("default")
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server master')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica1')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica2')
+ | ---
+ | - true
+ | ...
+
+test_run:cmd('delete server master')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica1')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica2')
+ | ---
+ | - true
+ | ...
diff --git a/test/replication/gh-6036-term-order.test.lua b/test/replication/gh-6036-term-order.test.lua
new file mode 100644
index 000000000..01d73ac55
--- /dev/null
+++ b/test/replication/gh-6036-term-order.test.lua
@@ -0,0 +1,94 @@
+--
+-- gh-6036: verify that terms are locked when we're inside journal
+-- write routine, because parallel appliers may ignore the fact that
+-- the term is updated already but not yet written leading to data
+-- inconsistency.
+--
+test_run = require('test_run').new()
+
+test_run:cmd('create server master with script="replication/gh-6036-order-master.lua"')
+test_run:cmd('create server replica1 with script="replication/gh-6036-order-replica1.lua"')
+test_run:cmd('create server replica2 with script="replication/gh-6036-order-replica2.lua"')
+
+test_run:cmd('start server master with wait=False')
+test_run:cmd('start server replica1 with wait=False')
+test_run:cmd('start server replica2 with wait=False')
+
+test_run:wait_fullmesh({"master", "replica1", "replica2"})
+
+test_run:switch("master")
+box.ctl.demote()
+
+test_run:switch("replica1")
+box.ctl.demote()
+
+test_run:switch("replica2")
+box.ctl.demote()
+
+--
+-- Drop connection between master and replica1.
+test_run:switch("master")
+box.cfg({ \
+ replication = { \
+ "unix/:./master.sock", \
+ "unix/:./replica2.sock", \
+ }, \
+})
+test_run:switch("replica1")
+box.cfg({ \
+ replication = { \
+ "unix/:./replica1.sock", \
+ "unix/:./replica2.sock", \
+ }, \
+})
+
+--
+-- Initiate disk delay and remember the max term seen so far.
+test_run:switch("replica2")
+box.error.injection.set('ERRINJ_WAL_DELAY', true)
+term_max_replica2 = box.info.synchro.promote.term_max
+
+--
+-- Ping-pong the promote action between master and
+-- replica1 nodes, the term updates get queued on
+-- replica2 because of disk being disabled.
+test_run:switch("master")
+box.ctl.promote()
+box.ctl.demote()
+
+test_run:switch("replica1")
+box.ctl.promote()
+box.ctl.demote()
+
+test_run:switch("master")
+box.ctl.promote()
+
+--
+-- Since we're guarding max promote term make sure that
+-- 1) The max term has not yet been updated because WAL
+-- is in sleeping state.
+-- 2) Max term on master and replica1 nodes are greater
+-- than we have now because terms update is locked.
+-- 3) Once WAL is unlocked we make sure that terms has
+-- reached us.
+test_run:switch("replica2")
+assert(term_max_replica2 == box.info.synchro.promote.term_max)
+
+term_max_master = test_run:eval('master', 'box.info.synchro.promote.term_max')[1]
+term_max_replica1 = test_run:eval('replica1', 'box.info.synchro.promote.term_max')[1]
+assert(term_max_master > term_max_replica2)
+assert(term_max_replica1 > term_max_replica2)
+term_max_wait4 = term_max_master
+if term_max_wait4 < term_max_replica1 then term_max_wait4 = term_max_replica1 end
+
+box.error.injection.set('ERRINJ_WAL_DELAY', false)
+test_run:wait_cond(function() return box.info.synchro.promote.term_max == term_max_wait4 end)
+
+test_run:switch("default")
+test_run:cmd('stop server master')
+test_run:cmd('stop server replica1')
+test_run:cmd('stop server replica2')
+
+test_run:cmd('delete server master')
+test_run:cmd('delete server replica1')
+test_run:cmd('delete server replica2')
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index 3eee0803c..ac2bedfd9 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -59,6 +59,7 @@
"gh-6094-rs-uuid-mismatch.test.lua": {},
"gh-6127-election-join-new.test.lua": {},
"gh-6035-applier-filter.test.lua": {},
+ "gh-6036-term-order.test.lua": {},
"election-candidate-promote.test.lua": {},
"*": {
"memtx": {"engine": "memtx"},
diff --git a/test/replication/suite.ini b/test/replication/suite.ini
index 77eb95f49..16840e01f 100644
--- a/test/replication/suite.ini
+++ b/test/replication/suite.ini
@@ -3,7 +3,7 @@ core = tarantool
script = master.lua
description = tarantool/box, replication
disabled = consistent.test.lua
-release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5430-qsync-promote-crash.test.lua gh-5430-cluster-mvcc.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua
+release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5430-qsync-promote-crash.test.lua gh-5430-cluster-mvcc.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua gh-6036-term-order.test.lua
config = suite.cfg
lua_libs = lua/fast_replica.lua lua/rlimit.lua
use_unix_sockets = True
--
2.31.1
More information about the Tarantool-patches
mailing list