[Tarantool-patches] [PATCH 7/7] box: make promote/demote always bump the term
Serge Petrenko
sergepetrenko at tarantool.org
Thu Jun 10 16:32:57 MSK 2021
When called without elections, promote and demote resulted in multiple
PROMOTE/DEMOTE entries for the same term. This is not right, because all
the promotions for the same term except the first one would be ignored as
already seen.
Follow-up #6034
---
src/box/box.cc | 14 +-
src/lib/raft/raft.c | 3 +-
.../gh-6034-limbo-ownership.result | 186 ++++++++++++++++++
.../gh-6034-limbo-ownership.test.lua | 68 +++++++
.../gh-6034-promote-bump-term.result | 51 +++++
.../gh-6034-promote-bump-term.test.lua | 20 ++
test/replication/suite.cfg | 2 +
7 files changed, 336 insertions(+), 8 deletions(-)
create mode 100644 test/replication/gh-6034-limbo-ownership.result
create mode 100644 test/replication/gh-6034-limbo-ownership.test.lua
create mode 100644 test/replication/gh-6034-promote-bump-term.result
create mode 100644 test/replication/gh-6034-promote-bump-term.test.lua
diff --git a/src/box/box.cc b/src/box/box.cc
index d3908b4cd..59f8dddae 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1663,14 +1663,16 @@ box_clear_synchro_queue(bool demote)
rc = -1;
} else {
promote:
- /* We cannot possibly get here in a volatile state. */
- assert(box_raft()->volatile_term == box_raft()->term);
+ if (try_wait)
+ raft_new_term(box_raft());
+
+ uint64_t term = box_raft()->volatile_term;
if (demote) {
txn_limbo_write_demote(&txn_limbo, wait_lsn,
- box_raft()->term);
+ term);
} else {
txn_limbo_write_promote(&txn_limbo, wait_lsn,
- box_raft()->term);
+ term);
}
uint16_t type = demote ? IPROTO_DEMOTE : IPROTO_PROMOTE;
struct synchro_request req = {
@@ -1678,7 +1680,7 @@ promote:
.replica_id = former_leader_id,
.origin_id = instance_id,
.lsn = wait_lsn,
- .term = box_raft()->term,
+ .term = term,
};
txn_limbo_process(&txn_limbo, &req);
assert(txn_limbo_is_empty(&txn_limbo));
@@ -3511,7 +3513,7 @@ box_cfg_xc(void)
if (!is_bootstrap_leader) {
replicaset_sync();
- } else {
+ } else if (raft_is_enabled(box_raft())) {
/*
* When the cluster is just bootstrapped and this instance is a
* leader, it makes no sense to wait for a leader appearance.
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 247c7a71a..1de05727f 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -983,8 +983,7 @@ raft_cfg_vclock(struct raft *raft, const struct vclock *vclock)
void
raft_new_term(struct raft *raft)
{
- if (raft->is_enabled)
- raft_sm_schedule_new_term(raft, raft->volatile_term + 1);
+ raft_sm_schedule_new_term(raft, raft->volatile_term + 1);
}
static void
diff --git a/test/replication/gh-6034-limbo-ownership.result b/test/replication/gh-6034-limbo-ownership.result
new file mode 100644
index 000000000..ebedc2b17
--- /dev/null
+++ b/test/replication/gh-6034-limbo-ownership.result
@@ -0,0 +1,186 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+fiber = require('fiber')
+ | ---
+ | ...
+
+--
+-- gh-6034: test that transactional limbo isn't accessible without a promotion.
+--
+synchro_quorum = box.cfg.replication_synchro_quorum
+ | ---
+ | ...
+election_mode = box.cfg.election_mode
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum = 1, election_mode='off'}
+ | ---
+ | ...
+
+_ = box.schema.space.create('async'):create_index('pk')
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true}):create_index('pk')
+ | ---
+ | ...
+
+-- Limbo is initially unclaimed, everyone is writeable.
+assert(not box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == 0)
+ | ---
+ | - true
+ | ...
+box.space.async:insert{1} -- success.
+ | ---
+ | - [1]
+ | ...
+-- Synchro spaces aren't writeable
+box.space.sync:insert{1} -- error.
+ | ---
+ | - error: Synchronous transaction limbo doesn't belong to any instance
+ | ...
+
+box.ctl.promote()
+ | ---
+ | ...
+assert(not box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == box.info.id)
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{1} -- success.
+ | ---
+ | - [1]
+ | ...
+
+-- Everyone but the limbo owner is read-only.
+box.schema.user.grant('guest', 'replication')
+ | ---
+ | ...
+test_run:cmd('create server replica with rpl_master=default,\
+ script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=True, wait_load=True')
+ | ---
+ | - true
+ | ...
+test_run:cmd('set variable rpl_listen to "replica.listen"')
+ | ---
+ | - true
+ | ...
+orig_replication = box.cfg.replication
+ | ---
+ | ...
+box.cfg{replication={box.info.listen, rpl_listen}}
+ | ---
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+assert(box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == test_run:eval('default', 'return box.info.id')[1])
+ | ---
+ | - true
+ | ...
+box.space.async:insert{2} -- failure.
+ | ---
+ | - error: Can't modify data because this instance is in read-only mode.
+ | ...
+
+-- Promotion on the other node. Default should become ro.
+box.ctl.promote()
+ | ---
+ | ...
+assert(not box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == box.info.id)
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{2} -- success.
+ | ---
+ | - [2]
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+assert(box.info.ro)
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.owner == test_run:eval('replica', 'return box.info.id')[1])
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{3} -- failure.
+ | ---
+ | - error: Can't modify data because this instance is in read-only mode.
+ | ...
+
+box.ctl.demote()
+ | ---
+ | ...
+assert(not box.info.ro)
+ | ---
+ | - true
+ | ...
+box.space.sync:insert{3} -- still fails.
+ | ---
+ | - error: Synchronous transaction limbo doesn't belong to any instance
+ | ...
+assert(box.info.synchro.queue.owner == 0)
+ | ---
+ | - true
+ | ...
+box.space.async:insert{3} -- success.
+ | ---
+ | - [3]
+ | ...
+
+-- Cleanup.
+box.ctl.demote()
+ | ---
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+box.schema.user.revoke('guest', 'replication')
+ | ---
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+box.space.async:drop()
+ | ---
+ | ...
+box.cfg{\
+ replication_synchro_quorum = synchro_quorum,\
+ election_mode = election_mode,\
+ replication = orig_replication,\
+}
+ | ---
+ | ...
diff --git a/test/replication/gh-6034-limbo-ownership.test.lua b/test/replication/gh-6034-limbo-ownership.test.lua
new file mode 100644
index 000000000..0e1586566
--- /dev/null
+++ b/test/replication/gh-6034-limbo-ownership.test.lua
@@ -0,0 +1,68 @@
+test_run = require('test_run').new()
+fiber = require('fiber')
+
+--
+-- gh-6034: test that transactional limbo isn't accessible without a promotion.
+--
+synchro_quorum = box.cfg.replication_synchro_quorum
+election_mode = box.cfg.election_mode
+box.cfg{replication_synchro_quorum = 1, election_mode='off'}
+
+_ = box.schema.space.create('async'):create_index('pk')
+_ = box.schema.space.create('sync', {is_sync=true}):create_index('pk')
+
+-- Limbo is initially unclaimed, everyone is writeable.
+assert(not box.info.ro)
+assert(box.info.synchro.queue.owner == 0)
+box.space.async:insert{1} -- success.
+-- Synchro spaces aren't writeable
+box.space.sync:insert{1} -- error.
+
+box.ctl.promote()
+assert(not box.info.ro)
+assert(box.info.synchro.queue.owner == box.info.id)
+box.space.sync:insert{1} -- success.
+
+-- Everyone but the limbo owner is read-only.
+box.schema.user.grant('guest', 'replication')
+test_run:cmd('create server replica with rpl_master=default,\
+ script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=True, wait_load=True')
+test_run:cmd('set variable rpl_listen to "replica.listen"')
+orig_replication = box.cfg.replication
+box.cfg{replication={box.info.listen, rpl_listen}}
+
+test_run:switch('replica')
+assert(box.info.ro)
+assert(box.info.synchro.queue.owner == test_run:eval('default', 'return box.info.id')[1])
+box.space.async:insert{2} -- failure.
+
+-- Promotion on the other node. Default should become ro.
+box.ctl.promote()
+assert(not box.info.ro)
+assert(box.info.synchro.queue.owner == box.info.id)
+box.space.sync:insert{2} -- success.
+
+test_run:switch('default')
+assert(box.info.ro)
+assert(box.info.synchro.queue.owner == test_run:eval('replica', 'return box.info.id')[1])
+box.space.sync:insert{3} -- failure.
+
+box.ctl.demote()
+assert(not box.info.ro)
+box.space.sync:insert{3} -- still fails.
+assert(box.info.synchro.queue.owner == 0)
+box.space.async:insert{3} -- success.
+
+-- Cleanup.
+box.ctl.demote()
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+box.schema.user.revoke('guest', 'replication')
+box.space.sync:drop()
+box.space.async:drop()
+box.cfg{\
+ replication_synchro_quorum = synchro_quorum,\
+ election_mode = election_mode,\
+ replication = orig_replication,\
+}
diff --git a/test/replication/gh-6034-promote-bump-term.result b/test/replication/gh-6034-promote-bump-term.result
new file mode 100644
index 000000000..d3cbbc1c3
--- /dev/null
+++ b/test/replication/gh-6034-promote-bump-term.result
@@ -0,0 +1,51 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+
+-- gh-6034 follow-up: test that every box.ctl.promote()/box.ctl.demote() bumps
+-- the instance's term. Even when elections are disabled. Even for consequent
+-- promotes/demotes on the same instance.
+election_mode = box.cfg.election_mode
+ | ---
+ | ...
+box.cfg{election_mode='off'}
+ | ---
+ | ...
+
+term = box.info.election.term
+ | ---
+ | ...
+box.ctl.promote()
+ | ---
+ | ...
+assert(box.info.election.term == term + 1)
+ | ---
+ | - true
+ | ...
+box.ctl.promote()
+ | ---
+ | ...
+assert(box.info.election.term == term + 2)
+ | ---
+ | - true
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+assert(box.info.election.term == term + 3)
+ | ---
+ | - true
+ | ...
+box.ctl.demote()
+ | ---
+ | ...
+assert(box.info.election.term == term + 4)
+ | ---
+ | - true
+ | ...
+
+-- Cleanup.
+box.cfg{election_mode=election_mode}
+ | ---
+ | ...
diff --git a/test/replication/gh-6034-promote-bump-term.test.lua b/test/replication/gh-6034-promote-bump-term.test.lua
new file mode 100644
index 000000000..a6a082ed5
--- /dev/null
+++ b/test/replication/gh-6034-promote-bump-term.test.lua
@@ -0,0 +1,20 @@
+test_run = require('test_run').new()
+
+-- gh-6034 follow-up: test that every box.ctl.promote()/box.ctl.demote() bumps
+-- the instance's term. Even when elections are disabled. Even for consequent
+-- promotes/demotes on the same instance.
+election_mode = box.cfg.election_mode
+box.cfg{election_mode='off'}
+
+term = box.info.election.term
+box.ctl.promote()
+assert(box.info.election.term == term + 1)
+box.ctl.promote()
+assert(box.info.election.term == term + 2)
+box.ctl.demote()
+assert(box.info.election.term == term + 3)
+box.ctl.demote()
+assert(box.info.election.term == term + 4)
+
+-- Cleanup.
+box.cfg{election_mode=election_mode}
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index 46de2e6c4..9444ec49a 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -46,6 +46,8 @@
"gh-5536-wal-limit.test.lua": {},
"gh-5566-final-join-synchro.test.lua": {},
"gh-6032-promote-wal-write.test.lua": {},
+ "gh-6034-limbo-ownership.test.lua": {},
+ "gh-6034-promote-bump-term.test.lua": {},
"gh-6057-qsync-confirm-async-no-wal.test.lua": {},
"gh-6094-rs-uuid-mismatch.test.lua": {},
"*": {
--
2.30.1 (Apple Git-130)
More information about the Tarantool-patches
mailing list