[Tarantool-patches] [PATCH 3/3] [tosquash] raft: don't start new election immediately after restart

Sat Sep 26 00:29:13 MSK 2020

That creates problems in a cluster when followers restart.
Because if they would bump terms after restart, they would turn
the current leader into follower, therefore interrupting the
cluster progression.

Better let them wait for the leader death timeout if a leader
will appear. Because this is what will happen in 99% cases.
---
 src/box/box.cc | 12 +++++++++++-
 src/box/raft.c | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
 src/box/raft.h |  8 ++++++++
 3 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 1169f2cd5..957bf4d7a 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -2799,8 +2799,18 @@ box_cfg_xc(void)
 	title("running");
 	say_info("ready to accept requests");
 
-	if (!is_bootstrap_leader)
+	if (!is_bootstrap_leader) {
 		replicaset_sync();
+	} else {
+		/*
+		 * When the cluster is just bootstrapped and this instance is a
+		 * leader, it makes no sense to wait for a leader appearance.
+		 * There is no one. Moreover this node *is* a leader, so it
+		 * should take the control over the situation and start a new
+		 * term immediately.
+		 */
+		raft_new_term();
+	}
 
 	/* box.cfg.read_only is not read yet. */
 	assert(box_is_ro());
diff --git a/src/box/raft.c b/src/box/raft.c
index 8e5f7803c..608722824 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -209,6 +209,17 @@ raft_sm_stop(void);
 static void
 raft_sm_wait_leader_dead(void);
 
+/**
+ * Wait for the leader death timeout until a leader lets the node know he is
+ * alive. Otherwise the node will start a new term. Can be useful when it is not
+ * known whether the leader is alive, but it is undesirable to start a new term
+ * immediately. Because in case the leader is alive, a new term would stun him
+ * and therefore would stun DB write requests. Usually happens when a follower
+ * restarts and may need some time to hear something from the leader.
+ */
+static void
+raft_sm_wait_leader_found(void);
+
 /**
  * If election is started by this node, or it voted for some other node started
  * the election, and it can be a leader itself, it will wait until the current
@@ -785,6 +796,19 @@ raft_sm_wait_leader_dead(void)
 	ev_timer_start(loop(), &raft.timer);
 }
 
+static void
+raft_sm_wait_leader_found(void)
+{
+	assert(!ev_is_active(&raft.timer));
+	assert(!raft.is_write_in_progress);
+	assert(raft.is_candidate);
+	assert(raft.state == RAFT_STATE_FOLLOWER);
+	assert(raft.leader == 0);
+	double death_timeout = replication_disconnect_timeout();
+	ev_timer_set(&raft.timer, death_timeout, death_timeout);
+	ev_timer_start(loop(), &raft.timer);
+}
+
 static void
 raft_sm_wait_election_end(void)
 {
@@ -811,12 +835,20 @@ raft_sm_start(void)
 	assert(raft.state == RAFT_STATE_FOLLOWER);
 	raft.is_enabled = true;
 	raft.is_candidate = raft.is_cfg_candidate;
-	if (!raft.is_candidate)
+	if (!raft.is_candidate) {
 		/* Nop. */;
-	else if (raft.leader != 0)
+	} else if (raft.leader != 0) {
 		raft_sm_wait_leader_dead();
-	else
-		raft_sm_schedule_new_election();
+	} else {
+		/*
+		 * Don't start new election. The situation is most likely
+		 * happened because this node was restarted. Instance restarts
+		 * may happen in the cluster, and each restart shouldn't
+		 * disturb the current leader. Give it time to notify this node
+		 * that there is a leader.
+		 */
+		raft_sm_wait_leader_found();
+	}
 	box_update_ro_summary();
 }
 
@@ -892,7 +924,7 @@ raft_cfg_is_candidate(bool is_candidate)
 		 * node who sent newer data to this node.
 		 */
 		if (raft.leader == 0 && raft_is_fully_on_disk())
-			raft_sm_schedule_new_election();
+			raft_sm_wait_leader_found();
 	} else if (raft.state != RAFT_STATE_FOLLOWER) {
 		if (raft.state == RAFT_STATE_LEADER)
 			raft.leader = 0;
@@ -946,6 +978,13 @@ raft_cfg_death_timeout(void)
 	}
 }
 
+void
+raft_new_term(void)
+{
+	if (raft.is_enabled)
+		raft_sm_schedule_new_term(raft.volatile_term + 1);
+}
+
 static void
 raft_schedule_broadcast(void)
 {
diff --git a/src/box/raft.h b/src/box/raft.h
index 8c1cf467f..be77a5473 100644
--- a/src/box/raft.h
+++ b/src/box/raft.h
@@ -229,6 +229,14 @@ raft_cfg_election_quorum(void);
 void
 raft_cfg_death_timeout(void);
 
+/**
+ * Bump the term. When it is persisted, the node checks if there is a leader,
+ * and if there is not, a new election is started. That said, this function can
+ * be used as tool to forcefully start new election, or restart an existing.
+ */
+void
+raft_new_term(void);
+
 /**
  * Save complete Raft state into a request to be sent to other instances of the
  * cluster. It is allowed to save anything here, not only persistent state.
-- 
2.21.1 (Apple Git-122.3)