[Tarantool-patches] [PATCH] raft: execute triggers exactly on state change

Serge Petrenko sergepetrenko at tarantool.org
Tue Nov 17 18:16:19 MSK 2020


Raft triggers on state change should be executed right on state change,
in the same fiber and without yields.
This solves two problems:
1) an instance may stay rw for a short period of time after becoming
   a follower
2) a leader may become rw even before clearing the limbo left after
   the previous leader

In order to achieve this, introduce a new state, RAFT_STATE_NONE, which
takes over part of the responsibilities RAFT_STATE_FOLLOWER had.
The new state indicates that raft is turned off, so "raft is off" and
"state is follower" become distinct state changes, and both events may
be delivered to the triggers.

Closes #5440
---
sp/gh-5440-on-state-update
https://github.com/tarantool/tarantool/issues/5440

 src/box/box.cc                           |  3 +-
 src/box/raft.c                           | 17 ++++----
 src/lib/raft/raft.c                      | 49 +++++++++++++++---------
 src/lib/raft/raft.h                      |  2 +
 test/replication/election_basic.result   |  4 +-
 test/replication/election_basic.test.lua |  4 +-
 6 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 7d23de95c..7c8fba6d3 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -791,8 +791,9 @@ box_set_election_mode(void)
 	const char *mode = box_check_election_mode();
 	if (mode == NULL)
 		return -1;
-	raft_cfg_is_candidate(box_raft(), strcmp(mode, "candidate") == 0);
+	/* Order is important for correct state transitions. */
 	raft_cfg_is_enabled(box_raft(), strcmp(mode, "off") != 0);
+	raft_cfg_is_candidate(box_raft(), strcmp(mode, "candidate") == 0);
 	return 0;
 }
 
diff --git a/src/box/raft.c b/src/box/raft.c
index db1a3f423..bb26398a3 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -77,16 +77,17 @@ box_raft_on_update_f(struct trigger *trigger, void *event)
 	(void)trigger;
 	struct raft *raft = (struct raft *)event;
 	assert(raft == box_raft());
+	if (raft->state == RAFT_STATE_LEADER) {
+		/*
+		 * When the node became a leader, it means it will ignore all
+		 * records from all the other nodes, and won't get late CONFIRM
+		 * messages anyway. Can clear the queue without waiting for
+		 * confirmations.
+		 */
+		box_clear_synchro_queue(false);
+	}
 	/* State or enablence could be changed, affecting read-only state. */
 	box_update_ro_summary();
-	if (raft->state != RAFT_STATE_LEADER)
-		return 0;
-	/*
-	 * When the node became a leader, it means it will ignore all records
-	 * from all the other nodes, and won't get late CONFIRM messages anyway.
-	 * Can clear the queue without waiting for confirmations.
-	 */
-	box_clear_synchro_queue(false);
 	return 0;
 }
 
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index b669475f3..80b4d0211 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -52,6 +52,7 @@ raft_state_str(uint32_t state)
 		[RAFT_STATE_FOLLOWER]	= "follower",
 		[RAFT_STATE_CANDIDATE]	= "candidate",
 		[RAFT_STATE_LEADER]	= "leader",
+		[RAFT_STATE_NONE]	= "none",
 	};
 
 	if (state < lengthof(str))
@@ -60,6 +61,17 @@ raft_state_str(uint32_t state)
 	return "invalid (x)";
 };
 
+/** Set raft state to a new value and run triggers on state change. */
+static void
+raft_set_state(struct raft *raft, uint32_t state)
+{
+	assert(state != 0 && state <= RAFT_STATE_NONE);
+	if (state == raft->state)
+		return;
+	raft->state = state;
+	trigger_run(&raft->on_update, raft);
+}
+
 /**
  * Check if Raft is completely synced with disk. Meaning all its critical values
  * are in WAL. Only in that state the node can become a leader or a candidate.
@@ -316,13 +328,12 @@ raft_process_msg(struct raft *raft, const struct raft_msg *req, uint32_t source)
 	 */
 	if (req->vote != 0) {
 		switch (raft->state) {
+		case RAFT_STATE_NONE:
+			say_info("RAFT: vote request is skipped - RAFT "
+				 "is disabled");
+			break;
 		case RAFT_STATE_FOLLOWER:
 		case RAFT_STATE_LEADER:
-			if (!raft->is_enabled) {
-				say_info("RAFT: vote request is skipped - RAFT "
-					 "is disabled");
-				break;
-			}
 			if (raft->leader != 0) {
 				say_info("RAFT: vote request is skipped - the "
 					 "leader is already known - %u",
@@ -467,8 +478,8 @@ static void
 raft_worker_handle_io(struct raft *raft)
 {
 	assert(raft->is_write_in_progress);
-	/* During write Raft can't be anything but a follower. */
-	assert(raft->state == RAFT_STATE_FOLLOWER);
+	/* During write Raft can't be anything but a follower or turned off. */
+	assert(raft->state == RAFT_STATE_FOLLOWER || !raft->is_enabled);
 	struct raft_msg req;
 
 	if (raft_is_fully_on_disk(raft)) {
@@ -548,7 +559,6 @@ raft_worker_handle_broadcast(struct raft *raft)
 		req.vclock = raft->vclock;
 	}
 	raft->vtab->broadcast(raft, &req);
-	trigger_run(&raft->on_update, raft);
 	raft->is_broadcast_scheduled = false;
 }
 
@@ -581,7 +591,7 @@ raft_worker_f(va_list args)
 static void
 raft_sm_pause_and_dump(struct raft *raft)
 {
-	assert(raft->state == RAFT_STATE_FOLLOWER);
+	assert(raft->state == RAFT_STATE_FOLLOWER || !raft->is_enabled);
 	if (raft->is_write_in_progress)
 		return;
 	ev_timer_stop(loop(), &raft->timer);
@@ -598,7 +608,7 @@ raft_sm_become_leader(struct raft *raft)
 	assert(raft->leader == 0);
 	assert(raft->is_candidate);
 	assert(!raft->is_write_in_progress);
-	raft->state = RAFT_STATE_LEADER;
+	raft_set_state(raft, RAFT_STATE_LEADER);
 	raft->leader = raft->self;
 	ev_timer_stop(loop(), &raft->timer);
 	/* State is visible and it is changed - broadcast. */
@@ -611,7 +621,8 @@ raft_sm_follow_leader(struct raft *raft, uint32_t leader)
 	say_info("RAFT: leader is %u, follow", leader);
 	assert(raft->state != RAFT_STATE_LEADER);
 	assert(raft->leader == 0);
-	raft->state = RAFT_STATE_FOLLOWER;
+	if (raft->is_enabled)
+		raft_set_state(raft, RAFT_STATE_FOLLOWER);
 	raft->leader = leader;
 	if (!raft->is_write_in_progress && raft->is_candidate) {
 		ev_timer_stop(loop(), &raft->timer);
@@ -631,7 +642,7 @@ raft_sm_become_candidate(struct raft *raft)
 	assert(raft->is_candidate);
 	assert(!raft->is_write_in_progress);
 	assert(raft->election_quorum > 1);
-	raft->state = RAFT_STATE_CANDIDATE;
+	raft_set_state(raft, RAFT_STATE_CANDIDATE);
 	raft->vote_count = 1;
 	raft->vote_mask = 0;
 	bit_set(&raft->vote_mask, raft->self);
@@ -650,7 +661,8 @@ raft_sm_schedule_new_term(struct raft *raft, uint64_t new_term)
 	/* New terms means completely new Raft state. */
 	raft->volatile_vote = 0;
 	raft->leader = 0;
-	raft->state = RAFT_STATE_FOLLOWER;
+	if (raft->is_enabled)
+		raft_set_state(raft, RAFT_STATE_FOLLOWER);
 	raft_sm_pause_and_dump(raft);
 	/*
 	 * State is visible and it is changed - broadcast. Term is also visible,
@@ -740,9 +752,10 @@ raft_sm_start(struct raft *raft)
 	say_info("RAFT: start state machine");
 	assert(!ev_is_active(&raft->timer));
 	assert(!raft->is_enabled);
-	assert(raft->state == RAFT_STATE_FOLLOWER);
+	assert(raft->state == RAFT_STATE_NONE);
 	raft->is_enabled = true;
 	raft->is_candidate = raft->is_cfg_candidate;
+	raft_set_state(raft, RAFT_STATE_FOLLOWER);
 	if (raft->is_write_in_progress) {
 		/*
 		 * Nop. If write is in progress, the state machine is frozen. It
@@ -784,7 +797,7 @@ raft_sm_stop(struct raft *raft)
 	raft->is_candidate = false;
 	if (raft->state == RAFT_STATE_LEADER)
 		raft->leader = 0;
-	raft->state = RAFT_STATE_FOLLOWER;
+	raft_set_state(raft, RAFT_STATE_NONE);
 	ev_timer_stop(loop(), &raft->timer);
 	/* State is visible and changed - broadcast. */
 	raft_schedule_broadcast(raft);
@@ -841,7 +854,7 @@ raft_cfg_is_candidate(struct raft *raft, bool is_candidate)
 	bool old_is_candidate = raft->is_candidate;
 	raft->is_cfg_candidate = is_candidate;
 	raft->is_candidate = is_candidate && raft->is_enabled;
-	if (raft->is_candidate == old_is_candidate)
+	if (raft->is_candidate == old_is_candidate || !raft->is_enabled)
 		return;
 
 	if (raft->is_candidate) {
@@ -866,7 +879,7 @@ raft_cfg_is_candidate(struct raft *raft, bool is_candidate)
 		if (raft->state != RAFT_STATE_FOLLOWER) {
 			if (raft->state == RAFT_STATE_LEADER)
 				raft->leader = 0;
-			raft->state = RAFT_STATE_FOLLOWER;
+			raft_set_state(raft, RAFT_STATE_FOLLOWER);
 			/* State is visible and changed - broadcast. */
 			raft_schedule_broadcast(raft);
 		}
@@ -982,7 +995,7 @@ void
 raft_create(struct raft *raft, const struct raft_vtab *vtab)
 {
 	*raft = (struct raft) {
-		.state = RAFT_STATE_FOLLOWER,
+		.state = RAFT_STATE_NONE,
 		.volatile_term = 1,
 		.term =	1,
 		.election_quorum = 1,
diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h
index 4f4d24ca8..0f9d84f27 100644
--- a/src/lib/raft/raft.h
+++ b/src/lib/raft/raft.h
@@ -86,6 +86,8 @@ enum raft_state {
 	RAFT_STATE_CANDIDATE = 2,
 	/** Election was successful. The node accepts write requests. */
 	RAFT_STATE_LEADER = 3,
+	/* Empty state. Indicates that raft state machine is stopped. */
+	RAFT_STATE_NONE = 4,
 };
 
 /**
diff --git a/test/replication/election_basic.result b/test/replication/election_basic.result
index 4d7d33f2b..9f46696e7 100644
--- a/test/replication/election_basic.result
+++ b/test/replication/election_basic.result
@@ -36,7 +36,7 @@ box.cfg{election_timeout = 0}
  |     number'
  | ...
 
--- When election is disabled, the instance is a follower. Does not try to become
+-- When election is disabled, the instance does not try to become
 -- a leader, and does not block write operations.
 term = box.info.election.term
  | ---
@@ -44,7 +44,7 @@ term = box.info.election.term
 vote = box.info.election.vote
  | ---
  | ...
-assert(box.info.election.state == 'follower')
+assert(box.info.election.state == 'none')
  | ---
  | - true
  | ...
diff --git a/test/replication/election_basic.test.lua b/test/replication/election_basic.test.lua
index 821f73cea..21acc0a58 100644
--- a/test/replication/election_basic.test.lua
+++ b/test/replication/election_basic.test.lua
@@ -13,11 +13,11 @@ box.cfg{election_mode = '100'}
 box.cfg{election_timeout = -1}
 box.cfg{election_timeout = 0}
 
--- When election is disabled, the instance is a follower. Does not try to become
+-- When election is disabled, the instance does not try to become
 -- a leader, and does not block write operations.
 term = box.info.election.term
 vote = box.info.election.vote
-assert(box.info.election.state == 'follower')
+assert(box.info.election.state == 'none')
 assert(box.info.election.leader == 0)
 assert(not box.info.ro)
 
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list