[Tarantool-patches] [PATCH] raft: execute triggers exactly on state change
Serge Petrenko
sergepetrenko at tarantool.org
Tue Nov 17 18:16:19 MSK 2020
Raft triggers on state change should be executed right on state change,
in the same fiber and without yields.
This solves two problems:
1) an instance may remain rw for a short period of time after becoming
a follower
2) a new leader may become rw even before clearing the limbo left by
the previous leader
In order to achieve this, introduce a new state, RAFT_STATE_NONE, which
takes over part of the responsibilities RAFT_STATE_FOLLOWER used to have.
The new state indicates that raft is turned off, so that "raft is off"
and "state is follower" are distinct states and both events may be
delivered to the triggers.
Closes #5440
---
sp/gh-5440-on-state-update
https://github.com/tarantool/tarantool/issues/5440
src/box/box.cc | 3 +-
src/box/raft.c | 17 ++++----
src/lib/raft/raft.c | 49 +++++++++++++++---------
src/lib/raft/raft.h | 2 +
test/replication/election_basic.result | 4 +-
test/replication/election_basic.test.lua | 4 +-
6 files changed, 48 insertions(+), 31 deletions(-)
diff --git a/src/box/box.cc b/src/box/box.cc
index 7d23de95c..7c8fba6d3 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -791,8 +791,9 @@ box_set_election_mode(void)
const char *mode = box_check_election_mode();
if (mode == NULL)
return -1;
- raft_cfg_is_candidate(box_raft(), strcmp(mode, "candidate") == 0);
+ /* Order is important for correct state transitions. */
raft_cfg_is_enabled(box_raft(), strcmp(mode, "off") != 0);
+ raft_cfg_is_candidate(box_raft(), strcmp(mode, "candidate") == 0);
return 0;
}
diff --git a/src/box/raft.c b/src/box/raft.c
index db1a3f423..bb26398a3 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -77,16 +77,17 @@ box_raft_on_update_f(struct trigger *trigger, void *event)
(void)trigger;
struct raft *raft = (struct raft *)event;
assert(raft == box_raft());
+ if (raft->state == RAFT_STATE_LEADER) {
+ /*
+ * When the node became a leader, it means it will ignore all
+ * records from all the other nodes, and won't get late CONFIRM
+ * messages anyway. Can clear the queue without waiting for
+ * confirmations.
+ */
+ box_clear_synchro_queue(false);
+ }
/* State or enablence could be changed, affecting read-only state. */
box_update_ro_summary();
- if (raft->state != RAFT_STATE_LEADER)
- return 0;
- /*
- * When the node became a leader, it means it will ignore all records
- * from all the other nodes, and won't get late CONFIRM messages anyway.
- * Can clear the queue without waiting for confirmations.
- */
- box_clear_synchro_queue(false);
return 0;
}
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index b669475f3..80b4d0211 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -52,6 +52,7 @@ raft_state_str(uint32_t state)
[RAFT_STATE_FOLLOWER] = "follower",
[RAFT_STATE_CANDIDATE] = "candidate",
[RAFT_STATE_LEADER] = "leader",
+ [RAFT_STATE_NONE] = "none",
};
if (state < lengthof(str))
@@ -60,6 +61,17 @@ raft_state_str(uint32_t state)
return "invalid (x)";
};
+/** Set raft state to a new value and run triggers on state change. */
+static void
+raft_set_state(struct raft *raft, uint32_t state)
+{
+ assert(state != 0 && state <= RAFT_STATE_NONE);
+ if (state == raft->state)
+ return;
+ raft->state = state;
+ trigger_run(&raft->on_update, raft);
+}
+
/**
* Check if Raft is completely synced with disk. Meaning all its critical values
* are in WAL. Only in that state the node can become a leader or a candidate.
@@ -316,13 +328,12 @@ raft_process_msg(struct raft *raft, const struct raft_msg *req, uint32_t source)
*/
if (req->vote != 0) {
switch (raft->state) {
+ case RAFT_STATE_NONE:
+ say_info("RAFT: vote request is skipped - RAFT "
+ "is disabled");
+ break;
case RAFT_STATE_FOLLOWER:
case RAFT_STATE_LEADER:
- if (!raft->is_enabled) {
- say_info("RAFT: vote request is skipped - RAFT "
- "is disabled");
- break;
- }
if (raft->leader != 0) {
say_info("RAFT: vote request is skipped - the "
"leader is already known - %u",
@@ -467,8 +478,8 @@ static void
raft_worker_handle_io(struct raft *raft)
{
assert(raft->is_write_in_progress);
- /* During write Raft can't be anything but a follower. */
- assert(raft->state == RAFT_STATE_FOLLOWER);
+ /* During write Raft can't be anything but a follower or turned off. */
+ assert(raft->state == RAFT_STATE_FOLLOWER || !raft->is_enabled);
struct raft_msg req;
if (raft_is_fully_on_disk(raft)) {
@@ -548,7 +559,6 @@ raft_worker_handle_broadcast(struct raft *raft)
req.vclock = raft->vclock;
}
raft->vtab->broadcast(raft, &req);
- trigger_run(&raft->on_update, raft);
raft->is_broadcast_scheduled = false;
}
@@ -581,7 +591,7 @@ raft_worker_f(va_list args)
static void
raft_sm_pause_and_dump(struct raft *raft)
{
- assert(raft->state == RAFT_STATE_FOLLOWER);
+ assert(raft->state == RAFT_STATE_FOLLOWER || !raft->is_enabled);
if (raft->is_write_in_progress)
return;
ev_timer_stop(loop(), &raft->timer);
@@ -598,7 +608,7 @@ raft_sm_become_leader(struct raft *raft)
assert(raft->leader == 0);
assert(raft->is_candidate);
assert(!raft->is_write_in_progress);
- raft->state = RAFT_STATE_LEADER;
+ raft_set_state(raft, RAFT_STATE_LEADER);
raft->leader = raft->self;
ev_timer_stop(loop(), &raft->timer);
/* State is visible and it is changed - broadcast. */
@@ -611,7 +621,8 @@ raft_sm_follow_leader(struct raft *raft, uint32_t leader)
say_info("RAFT: leader is %u, follow", leader);
assert(raft->state != RAFT_STATE_LEADER);
assert(raft->leader == 0);
- raft->state = RAFT_STATE_FOLLOWER;
+ if (raft->is_enabled)
+ raft_set_state(raft, RAFT_STATE_FOLLOWER);
raft->leader = leader;
if (!raft->is_write_in_progress && raft->is_candidate) {
ev_timer_stop(loop(), &raft->timer);
@@ -631,7 +642,7 @@ raft_sm_become_candidate(struct raft *raft)
assert(raft->is_candidate);
assert(!raft->is_write_in_progress);
assert(raft->election_quorum > 1);
- raft->state = RAFT_STATE_CANDIDATE;
+ raft_set_state(raft, RAFT_STATE_CANDIDATE);
raft->vote_count = 1;
raft->vote_mask = 0;
bit_set(&raft->vote_mask, raft->self);
@@ -650,7 +661,8 @@ raft_sm_schedule_new_term(struct raft *raft, uint64_t new_term)
/* New terms means completely new Raft state. */
raft->volatile_vote = 0;
raft->leader = 0;
- raft->state = RAFT_STATE_FOLLOWER;
+ if (raft->is_enabled)
+ raft_set_state(raft, RAFT_STATE_FOLLOWER);
raft_sm_pause_and_dump(raft);
/*
* State is visible and it is changed - broadcast. Term is also visible,
@@ -740,9 +752,10 @@ raft_sm_start(struct raft *raft)
say_info("RAFT: start state machine");
assert(!ev_is_active(&raft->timer));
assert(!raft->is_enabled);
- assert(raft->state == RAFT_STATE_FOLLOWER);
+ assert(raft->state == RAFT_STATE_NONE);
raft->is_enabled = true;
raft->is_candidate = raft->is_cfg_candidate;
+ raft_set_state(raft, RAFT_STATE_FOLLOWER);
if (raft->is_write_in_progress) {
/*
* Nop. If write is in progress, the state machine is frozen. It
@@ -784,7 +797,7 @@ raft_sm_stop(struct raft *raft)
raft->is_candidate = false;
if (raft->state == RAFT_STATE_LEADER)
raft->leader = 0;
- raft->state = RAFT_STATE_FOLLOWER;
+ raft_set_state(raft, RAFT_STATE_NONE);
ev_timer_stop(loop(), &raft->timer);
/* State is visible and changed - broadcast. */
raft_schedule_broadcast(raft);
@@ -841,7 +854,7 @@ raft_cfg_is_candidate(struct raft *raft, bool is_candidate)
bool old_is_candidate = raft->is_candidate;
raft->is_cfg_candidate = is_candidate;
raft->is_candidate = is_candidate && raft->is_enabled;
- if (raft->is_candidate == old_is_candidate)
+ if (raft->is_candidate == old_is_candidate || !raft->is_enabled)
return;
if (raft->is_candidate) {
@@ -866,7 +879,7 @@ raft_cfg_is_candidate(struct raft *raft, bool is_candidate)
if (raft->state != RAFT_STATE_FOLLOWER) {
if (raft->state == RAFT_STATE_LEADER)
raft->leader = 0;
- raft->state = RAFT_STATE_FOLLOWER;
+ raft_set_state(raft, RAFT_STATE_FOLLOWER);
/* State is visible and changed - broadcast. */
raft_schedule_broadcast(raft);
}
@@ -982,7 +995,7 @@ void
raft_create(struct raft *raft, const struct raft_vtab *vtab)
{
*raft = (struct raft) {
- .state = RAFT_STATE_FOLLOWER,
+ .state = RAFT_STATE_NONE,
.volatile_term = 1,
.term = 1,
.election_quorum = 1,
diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h
index 4f4d24ca8..0f9d84f27 100644
--- a/src/lib/raft/raft.h
+++ b/src/lib/raft/raft.h
@@ -86,6 +86,8 @@ enum raft_state {
RAFT_STATE_CANDIDATE = 2,
/** Election was successful. The node accepts write requests. */
RAFT_STATE_LEADER = 3,
+ /* Empty state. Indicates that raft state machine is stopped. */
+ RAFT_STATE_NONE = 4,
};
/**
diff --git a/test/replication/election_basic.result b/test/replication/election_basic.result
index 4d7d33f2b..9f46696e7 100644
--- a/test/replication/election_basic.result
+++ b/test/replication/election_basic.result
@@ -36,7 +36,7 @@ box.cfg{election_timeout = 0}
| number'
| ...
--- When election is disabled, the instance is a follower. Does not try to become
+-- When election is disabled, the instance does not try to become
-- a leader, and does not block write operations.
term = box.info.election.term
| ---
@@ -44,7 +44,7 @@ term = box.info.election.term
vote = box.info.election.vote
| ---
| ...
-assert(box.info.election.state == 'follower')
+assert(box.info.election.state == 'none')
| ---
| - true
| ...
diff --git a/test/replication/election_basic.test.lua b/test/replication/election_basic.test.lua
index 821f73cea..21acc0a58 100644
--- a/test/replication/election_basic.test.lua
+++ b/test/replication/election_basic.test.lua
@@ -13,11 +13,11 @@ box.cfg{election_mode = '100'}
box.cfg{election_timeout = -1}
box.cfg{election_timeout = 0}
--- When election is disabled, the instance is a follower. Does not try to become
+-- When election is disabled, the instance does not try to become
-- a leader, and does not block write operations.
term = box.info.election.term
vote = box.info.election.vote
-assert(box.info.election.state == 'follower')
+assert(box.info.election.state == 'none')
assert(box.info.election.leader == 0)
assert(not box.info.ro)
--
2.24.3 (Apple Git-128)
More information about the Tarantool-patches
mailing list