[Tarantool-patches] [PATCH 7/7] box: make promote/demote always bump the term

Serge Petrenko sergepetrenko at tarantool.org
Thu Jun 10 16:32:57 MSK 2021


When called without elections, promote and demote resulted in multiple
PROMOTE/DEMOTE entries for the same term. This is not right, because all
the promotions for the same term except the first one would be ignored as
already seen.

Follow-up #6034
---
 src/box/box.cc                                |  14 +-
 src/lib/raft/raft.c                           |   3 +-
 .../gh-6034-limbo-ownership.result            | 186 ++++++++++++++++++
 .../gh-6034-limbo-ownership.test.lua          |  68 +++++++
 .../gh-6034-promote-bump-term.result          |  51 +++++
 .../gh-6034-promote-bump-term.test.lua        |  20 ++
 test/replication/suite.cfg                    |   2 +
 7 files changed, 336 insertions(+), 8 deletions(-)
 create mode 100644 test/replication/gh-6034-limbo-ownership.result
 create mode 100644 test/replication/gh-6034-limbo-ownership.test.lua
 create mode 100644 test/replication/gh-6034-promote-bump-term.result
 create mode 100644 test/replication/gh-6034-promote-bump-term.test.lua

diff --git a/src/box/box.cc b/src/box/box.cc
index d3908b4cd..59f8dddae 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1663,14 +1663,16 @@ box_clear_synchro_queue(bool demote)
 			rc = -1;
 		} else {
 promote:
-			/* We cannot possibly get here in a volatile state. */
-			assert(box_raft()->volatile_term == box_raft()->term);
+			if (try_wait)
+				raft_new_term(box_raft());
+
+			uint64_t term = box_raft()->volatile_term;
 			if (demote) {
 				txn_limbo_write_demote(&txn_limbo, wait_lsn,
-						       box_raft()->term);
+						       term);
 			} else {
 				txn_limbo_write_promote(&txn_limbo, wait_lsn,
-							box_raft()->term);
+							term);
 			}
 			uint16_t type = demote ? IPROTO_DEMOTE : IPROTO_PROMOTE;
 			struct synchro_request req = {
@@ -1678,7 +1680,7 @@ promote:
 				.replica_id = former_leader_id,
 				.origin_id = instance_id,
 				.lsn = wait_lsn,
-				.term = box_raft()->term,
+				.term = term,
 			};
 			txn_limbo_process(&txn_limbo, &req);
 			assert(txn_limbo_is_empty(&txn_limbo));
@@ -3511,7 +3513,7 @@ box_cfg_xc(void)
 
 	if (!is_bootstrap_leader) {
 		replicaset_sync();
-	} else {
+	} else if (raft_is_enabled(box_raft())) {
 		/*
 		 * When the cluster is just bootstrapped and this instance is a
 		 * leader, it makes no sense to wait for a leader appearance.
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 247c7a71a..1de05727f 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -983,8 +983,7 @@ raft_cfg_vclock(struct raft *raft, const struct vclock *vclock)
 void
 raft_new_term(struct raft *raft)
 {
-	if (raft->is_enabled)
-		raft_sm_schedule_new_term(raft, raft->volatile_term + 1);
+	raft_sm_schedule_new_term(raft, raft->volatile_term + 1);
 }
 
 static void
diff --git a/test/replication/gh-6034-limbo-ownership.result b/test/replication/gh-6034-limbo-ownership.result
new file mode 100644
index 000000000..ebedc2b17
--- /dev/null
+++ b/test/replication/gh-6034-limbo-ownership.result
@@ -0,0 +1,186 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+fiber = require('fiber')
+ | ---
+ | ...
+
+--
+-- gh-6034: test that transactional limbo isn't accessible without a promotion.
+--
+synchro_quorum = box.cfg.replication_synchro_quorum
+ | ---
+ | ...
+election_mode = box.cfg.election_mode
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum = 1, election_mode='off'}
+ | ---
+ | ...
+
+_ = box.schema.space.create('async'):create_index('pk')
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true}):create_index('pk')
+ | ---
+ | ...
+
+-- Limbo is initially unclaimed, everyone is writeable.
+assert(not box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == 0)
+ | ---
+ | - true
+ | ...
+box.space.async:insert{1} -- success.
+ | ---
+ | - [1]
+ | ...
+-- Synchro spaces aren't writeable
+box.space.sync:insert{1} -- error.
+ | ---
+ | - error: Synchronous transaction limbo doesn't belong to any instance
+ | ...
+
+box.ctl.promote()
+ | ---
+ | ...
+assert(not box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == box.info.id)
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{1} -- success.
+ | ---
+ | - [1]
+ | ...
+
+-- Everyone but the limbo owner is read-only.
+box.schema.user.grant('guest', 'replication')
+ | ---
+ | ...
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=True, wait_load=True')
+ | ---
+ | - true
+ | ...
+test_run:cmd('set variable rpl_listen to "replica.listen"')
+ | ---
+ | - true
+ | ...
+orig_replication = box.cfg.replication
+ | ---
+ | ...
+box.cfg{replication={box.info.listen, rpl_listen}}
+ | ---
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+assert(box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == test_run:eval('default', 'return box.info.id')[1])
+ | ---
+ | - true
+ | ...
+box.space.async:insert{2} -- failure.
+ | ---
+ | - error: Can't modify data because this instance is in read-only mode.
+ | ...
+
+-- Promotion on the other node. Default should become ro.
+box.ctl.promote()
+ | ---
+ | ...
+assert(not box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == box.info.id)
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{2} -- success.
+ | ---
+ | - [2]
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+assert(box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == test_run:eval('replica', 'return box.info.id')[1])
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{3} -- failure.
+ | ---
+ | - error: Can't modify data because this instance is in read-only mode.
+ | ...
+
+box.ctl.demote()
+ | ---
+ | ...
+assert(not box.info.ro)
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{3} -- still fails.
+ | ---
+ | - error: Synchronous transaction limbo doesn't belong to any instance
+ | ...
+assert(box.info.synchro.queue.owner == 0)
+ | ---
+ | - true
+ | ...
+box.space.async:insert{3} -- success.
+ | ---
+ | - [3]
+ | ...
+
+-- Cleanup.
+box.ctl.demote()
+ | ---
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+box.schema.user.revoke('guest', 'replication')
+ | ---
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+box.space.async:drop()
+ | ---
+ | ...
+box.cfg{\
+    replication_synchro_quorum = synchro_quorum,\
+    election_mode = election_mode,\
+    replication = orig_replication,\
+}
+ | ---
+ | ...
diff --git a/test/replication/gh-6034-limbo-ownership.test.lua b/test/replication/gh-6034-limbo-ownership.test.lua
new file mode 100644
index 000000000..0e1586566
--- /dev/null
+++ b/test/replication/gh-6034-limbo-ownership.test.lua
@@ -0,0 +1,68 @@
+test_run = require('test_run').new()
+fiber = require('fiber')
+
+--
+-- gh-6034: test that transactional limbo isn't accessible without a promotion.
+--
+synchro_quorum = box.cfg.replication_synchro_quorum
+election_mode = box.cfg.election_mode
+box.cfg{replication_synchro_quorum = 1, election_mode='off'}
+
+_ = box.schema.space.create('async'):create_index('pk')
+_ = box.schema.space.create('sync', {is_sync=true}):create_index('pk')
+
+-- Limbo is initially unclaimed, everyone is writeable.
+assert(not box.info.ro)
+assert(box.info.synchro.queue.owner == 0)
+box.space.async:insert{1} -- success.
+-- Synchro spaces aren't writeable
+box.space.sync:insert{1} -- error.
+
+box.ctl.promote()
+assert(not box.info.ro)
+assert(box.info.synchro.queue.owner == box.info.id)
+box.space.sync:insert{1} -- success.
+
+-- Everyone but the limbo owner is read-only.
+box.schema.user.grant('guest', 'replication')
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=True, wait_load=True')
+test_run:cmd('set variable rpl_listen to "replica.listen"')
+orig_replication = box.cfg.replication
+box.cfg{replication={box.info.listen, rpl_listen}}
+
+test_run:switch('replica')
+assert(box.info.ro)
+assert(box.info.synchro.queue.owner == test_run:eval('default', 'return box.info.id')[1])
+box.space.async:insert{2} -- failure.
+
+-- Promotion on the other node. Default should become ro.
+box.ctl.promote()
+assert(not box.info.ro)
+assert(box.info.synchro.queue.owner == box.info.id)
+box.space.sync:insert{2} -- success.
+
+test_run:switch('default')
+assert(box.info.ro)
+assert(box.info.synchro.queue.owner == test_run:eval('replica', 'return box.info.id')[1])
+box.space.sync:insert{3} -- failure.
+
+box.ctl.demote()
+assert(not box.info.ro)
+box.space.sync:insert{3} -- still fails.
+assert(box.info.synchro.queue.owner == 0)
+box.space.async:insert{3} -- success.
+
+-- Cleanup.
+box.ctl.demote()
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+box.schema.user.revoke('guest', 'replication')
+box.space.sync:drop()
+box.space.async:drop()
+box.cfg{\
+    replication_synchro_quorum = synchro_quorum,\
+    election_mode = election_mode,\
+    replication = orig_replication,\
+}
diff --git a/test/replication/gh-6034-promote-bump-term.result b/test/replication/gh-6034-promote-bump-term.result
new file mode 100644
index 000000000..d3cbbc1c3
--- /dev/null
+++ b/test/replication/gh-6034-promote-bump-term.result
@@ -0,0 +1,51 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+
+-- gh-6034 follow-up: test that every box.ctl.promote()/box.ctl.demote() bumps
+-- the instance's term. Even when elections are disabled. Even for consequent
+-- promotes/demotes on the same instance.
+election_mode = box.cfg.election_mode
+ | ---
+ | ...
+box.cfg{election_mode='off'}
+ | ---
+ | ...
+
+term = box.info.election.term
+ | ---
+ | ...
+box.ctl.promote()
+ | ---
+ | ...
+assert(box.info.election.term == term + 1)
+ | ---
+ | - true
+ | ...
+box.ctl.promote()
+ | ---
+ | ...
+assert(box.info.election.term == term + 2)
+ | ---
+ | - true
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+assert(box.info.election.term == term + 3)
+ | ---
+ | - true
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+assert(box.info.election.term == term + 4)
+ | ---
+ | - true
+ | ...
+
+-- Cleanup.
+box.cfg{election_mode=election_mode}
+ | ---
+ | ...
diff --git a/test/replication/gh-6034-promote-bump-term.test.lua b/test/replication/gh-6034-promote-bump-term.test.lua
new file mode 100644
index 000000000..a6a082ed5
--- /dev/null
+++ b/test/replication/gh-6034-promote-bump-term.test.lua
@@ -0,0 +1,20 @@
+test_run = require('test_run').new()
+
+-- gh-6034 follow-up: test that every box.ctl.promote()/box.ctl.demote() bumps
+-- the instance's term. Even when elections are disabled. Even for consequent
+-- promotes/demotes on the same instance.
+election_mode = box.cfg.election_mode
+box.cfg{election_mode='off'}
+
+term = box.info.election.term
+box.ctl.promote()
+assert(box.info.election.term == term + 1)
+box.ctl.promote()
+assert(box.info.election.term == term + 2)
+box.ctl.demote()
+assert(box.info.election.term == term + 3)
+box.ctl.demote()
+assert(box.info.election.term == term + 4)
+
+-- Cleanup.
+box.cfg{election_mode=election_mode}
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index 46de2e6c4..9444ec49a 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -46,6 +46,8 @@
     "gh-5536-wal-limit.test.lua": {},
     "gh-5566-final-join-synchro.test.lua": {},
     "gh-6032-promote-wal-write.test.lua": {},
+    "gh-6034-limbo-ownership.test.lua": {},
+    "gh-6034-promote-bump-term.test.lua": {},
     "gh-6057-qsync-confirm-async-no-wal.test.lua": {},
     "gh-6094-rs-uuid-mismatch.test.lua": {},
     "*": {
-- 
2.30.1 (Apple Git-130)



More information about the Tarantool-patches mailing list