[Tarantool-patches] [PATCH v9 5/5] test: add replication/gh-6036-rollback-confirm

Cyrill Gorcunov gorcunov at gmail.com
Fri Jul 30 14:35:39 MSK 2021


Follow-up #6036

Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
 test/replication/gh-6036-master.lua           |   1 +
 test/replication/gh-6036-node.lua             |  33 ++++
 test/replication/gh-6036-replica.lua          |   1 +
 .../gh-6036-rollback-confirm.result           | 180 ++++++++++++++++++
 .../gh-6036-rollback-confirm.test.lua         |  92 +++++++++
 5 files changed, 307 insertions(+)
 create mode 120000 test/replication/gh-6036-master.lua
 create mode 100644 test/replication/gh-6036-node.lua
 create mode 120000 test/replication/gh-6036-replica.lua
 create mode 100644 test/replication/gh-6036-rollback-confirm.result
 create mode 100644 test/replication/gh-6036-rollback-confirm.test.lua

diff --git a/test/replication/gh-6036-master.lua b/test/replication/gh-6036-master.lua
new file mode 120000
index 000000000..65baed5de
--- /dev/null
+++ b/test/replication/gh-6036-master.lua
@@ -0,0 +1 @@
+gh-6036-node.lua
\ No newline at end of file
diff --git a/test/replication/gh-6036-node.lua b/test/replication/gh-6036-node.lua
new file mode 100644
index 000000000..ac701b7a2
--- /dev/null
+++ b/test/replication/gh-6036-node.lua
@@ -0,0 +1,33 @@
+local INSTANCE_ID = string.match(arg[0], "gh%-6036%-(.+)%.lua")
+
+local function unix_socket(name)
+    return "unix/:./" .. name .. '.sock';
+end
+
+require('console').listen(os.getenv('ADMIN'))
+
+if INSTANCE_ID == "master" then
+    box.cfg({
+        listen = unix_socket("master"),
+        replication_connect_quorum = 0,
+        election_mode = 'candidate',
+        replication_synchro_quorum = 3,
+        replication_synchro_timeout = 1000,
+    })
+elseif INSTANCE_ID == "replica" then
+    box.cfg({
+        listen = unix_socket("replica"),
+        replication = {
+            unix_socket("master"),
+            unix_socket("replica")
+        },
+        read_only = true,
+        election_mode = 'voter',
+        replication_synchro_quorum = 2,
+        replication_synchro_timeout = 1000,
+    })
+end
+
+box.once("bootstrap", function()
+    box.schema.user.grant('guest', 'super')
+end)
diff --git a/test/replication/gh-6036-replica.lua b/test/replication/gh-6036-replica.lua
new file mode 120000
index 000000000..65baed5de
--- /dev/null
+++ b/test/replication/gh-6036-replica.lua
@@ -0,0 +1 @@
+gh-6036-node.lua
\ No newline at end of file
diff --git a/test/replication/gh-6036-rollback-confirm.result b/test/replication/gh-6036-rollback-confirm.result
new file mode 100644
index 000000000..27419ce21
--- /dev/null
+++ b/test/replication/gh-6036-rollback-confirm.result
@@ -0,0 +1,180 @@
+-- test-run result file version 2
+--
+-- gh-6036: Test for record collision detection. We have a cluster
+-- of two nodes: master and replica. The master initiates syncho write
+-- but fails to gather a quorum. Before it rolls back the record the
+-- network breakage occurs and replica lives with dirty data while
+-- master node goes offline. The replica becomes a new raft leader
+-- and commits the dirty data, same time master node rolls back this
+-- record and tries to connect to the new raft leader back. Such
+-- connection should be refused because old master node is not longer
+-- consistent.
+--
+test_run = require('test_run').new()
+ | ---
+ | ...
+
+test_run:cmd('create server master with script="replication/gh-6036-master.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('create server replica with script="replication/gh-6036-replica.lua"')
+ | ---
+ | - true
+ | ...
+
+test_run:cmd('start server master')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica')
+ | ---
+ | - true
+ | ...
+
+--
+-- Connect master to the replica and write a record. Since the quorum
+-- value is bigger than number of nodes in a cluster it will be rolled
+-- back later.
+test_run:switch('master')
+ | ---
+ | - true
+ | ...
+box.cfg({                                       \
+    replication = {                             \
+            "unix/:./master.sock",              \
+            "unix/:./replica.sock",             \
+    },                                          \
+})
+ | ---
+ | ...
+_ = box.schema.create_space('sync', {is_sync = true})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+
+--
+-- Wait the record to appear on the master.
+f = require('fiber').create(function() box.space.sync:replace{1} end)
+ | ---
+ | ...
+test_run:wait_cond(function() return box.space.sync:get({1}) ~= nil end, 100)
+ | ---
+ | - true
+ | ...
+box.space.sync:select{}
+ | ---
+ | - - [1]
+ | ...
+
+--
+-- Wait the record from master get written and then
+-- drop the replication.
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+test_run:wait_cond(function() return box.space.sync:get({1}) ~= nil end, 100)
+ | ---
+ | - true
+ | ...
+box.space.sync:select{}
+ | ---
+ | - - [1]
+ | ...
+box.cfg{replication = {}}
+ | ---
+ | ...
+
+--
+-- Then we jump back to the master and drop the replication,
+-- thus unconfirmed record get rolled back.
+test_run:switch('master')
+ | ---
+ | - true
+ | ...
+box.cfg({                                       \
+    replication = {},                           \
+    replication_synchro_timeout = 0.001,        \
+    election_mode = 'manual',                   \
+})
+ | ---
+ | ...
+while f:status() ~= 'dead' do require('fiber').sleep(0.1) end
+ | ---
+ | ...
+test_run:wait_cond(function() return box.space.sync:get({1}) == nil end, 100)
+ | ---
+ | - true
+ | ...
+
+--
+-- Force the replica to become a RAFT leader and
+-- commit this new record.
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.cfg({                                       \
+    replication_synchro_quorum = 1,             \
+    election_mode = 'manual'                    \
+})
+ | ---
+ | ...
+box.ctl.promote()
+ | ---
+ | ...
+box.space.sync:select{}
+ | ---
+ | - - [1]
+ | ...
+
+--
+-- Connect master back to the replica, it should
+-- be refused.
+test_run:switch('master')
+ | ---
+ | - true
+ | ...
+box.cfg({                                       \
+    replication = {                             \
+            "unix/:./replica.sock",             \
+    },                                          \
+})
+ | ---
+ | ...
+box.space.sync:select{}
+ | ---
+ | - []
+ | ...
+test_run:wait_cond(function() return            \
+    test_run:grep_log('master',                 \
+        'rejecting PROMOTE') ~= nil end, 100)   \
+test_run:wait_cond(function() return            \
+    test_run:grep_log('master',                 \
+        'ER_UNSUPPORTED') ~= nil end, 100)
+ | ---
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server master')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server master')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
diff --git a/test/replication/gh-6036-rollback-confirm.test.lua b/test/replication/gh-6036-rollback-confirm.test.lua
new file mode 100644
index 000000000..f84f3dd45
--- /dev/null
+++ b/test/replication/gh-6036-rollback-confirm.test.lua
@@ -0,0 +1,92 @@
+--
+-- gh-6036: Test for record collision detection. We have a cluster
+-- of two nodes: master and replica. The master initiates syncho write
+-- but fails to gather a quorum. Before it rolls back the record the
+-- network breakage occurs and replica lives with dirty data while
+-- master node goes offline. The replica becomes a new raft leader
+-- and commits the dirty data, same time master node rolls back this
+-- record and tries to connect to the new raft leader back. Such
+-- connection should be refused because old master node is not longer
+-- consistent.
+--
+test_run = require('test_run').new()
+
+test_run:cmd('create server master with script="replication/gh-6036-master.lua"')
+test_run:cmd('create server replica with script="replication/gh-6036-replica.lua"')
+
+test_run:cmd('start server master')
+test_run:cmd('start server replica')
+
+--
+-- Connect master to the replica and write a record. Since the quorum
+-- value is bigger than number of nodes in a cluster it will be rolled
+-- back later.
+test_run:switch('master')
+box.cfg({                                       \
+    replication = {                             \
+            "unix/:./master.sock",              \
+            "unix/:./replica.sock",             \
+    },                                          \
+})
+_ = box.schema.create_space('sync', {is_sync = true})
+_ = box.space.sync:create_index('pk')
+
+--
+-- Wait the record to appear on the master.
+f = require('fiber').create(function() box.space.sync:replace{1} end)
+test_run:wait_cond(function() return box.space.sync:get({1}) ~= nil end, 100)
+box.space.sync:select{}
+
+--
+-- Wait the record from master get written and then
+-- drop the replication.
+test_run:switch('replica')
+test_run:wait_cond(function() return box.space.sync:get({1}) ~= nil end, 100)
+box.space.sync:select{}
+box.cfg{replication = {}}
+
+--
+-- Then we jump back to the master and drop the replication,
+-- thus unconfirmed record get rolled back.
+test_run:switch('master')
+box.cfg({                                       \
+    replication = {},                           \
+    replication_synchro_timeout = 0.001,        \
+    election_mode = 'manual',                   \
+})
+while f:status() ~= 'dead' do require('fiber').sleep(0.1) end
+test_run:wait_cond(function() return box.space.sync:get({1}) == nil end, 100)
+
+--
+-- Force the replica to become a RAFT leader and
+-- commit this new record.
+test_run:switch('replica')
+box.cfg({                                       \
+    replication_synchro_quorum = 1,             \
+    election_mode = 'manual'                    \
+})
+box.ctl.promote()
+box.space.sync:select{}
+
+--
+-- Connect master back to the replica, it should
+-- be refused.
+test_run:switch('master')
+box.cfg({                                       \
+    replication = {                             \
+            "unix/:./replica.sock",             \
+    },                                          \
+})
+box.space.sync:select{}
+test_run:wait_cond(function() return            \
+    test_run:grep_log('master',                 \
+        'rejecting PROMOTE') ~= nil end, 100)   \
+test_run:wait_cond(function() return            \
+    test_run:grep_log('master',                 \
+        'ER_UNSUPPORTED') ~= nil end, 100)
+
+test_run:switch('default')
+test_run:cmd('stop server master')
+test_run:cmd('delete server master')
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
-- 
2.31.1



More information about the Tarantool-patches mailing list