[Tarantool-patches] [PATCH 3/4] raft: introduce split vote detection

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sat Jan 15 03:48:55 MSK 2022


Split vote is a situation when during election nobody can win and
will not win in this term for sure. Not a single node could get
enough votes. For example, with 4 nodes one could get 2 votes and
other also 2 votes. Nobody will get quorum 3 in this term.

The patch makes raft able to notice that situation and speed up
the term bump. It is not bumped immediately though, because nodes
might do that simultaneously and will get the split vote again
after voting for self. There is a random delay. But it is just max
10% of election timeout, so it should speed up split vote
resolution on 90% at least.

Part of #5285
---
 src/lib/raft/raft.c         | 104 +++++++++++++++++-
 src/lib/raft/raft.h         |   6 ++
 test/unit/raft.c            | 207 +++++++++++++++++++++++++++++++++++-
 test/unit/raft.result       |  41 ++++++-
 test/unit/raft_test_utils.c |  12 +++
 test/unit/raft_test_utils.h |   5 +
 6 files changed, 370 insertions(+), 5 deletions(-)

diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 289d53fd5..5dcbc7821 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -152,20 +152,69 @@ raft_can_vote_for(const struct raft *raft, const struct vclock *v)
 	return cmp == 0 || cmp == 1;
 }
 
-static inline void
+static inline bool
 raft_add_vote(struct raft *raft, int src, int dst)
 {
 	struct raft_vote *v = &raft->votes[src];
 	if (v->did_vote)
-		return;
+		return false;
 	v->did_vote = true;
 	++raft->votes[dst].count;
+	return true;
+}
+
+static bool
+raft_has_split_vote(const struct raft *raft)
+{
+	int max_vote = 0;
+	int vote_vac = raft->cluster_size;
+	int quorum = raft->election_quorum;
+	for (int i = 0; i < VCLOCK_MAX; ++i) {
+		int count = raft->votes[i].count;
+		vote_vac -= count;
+		if (count > max_vote)
+			max_vote = count;
+	}
+	return max_vote < quorum && max_vote + vote_vac < quorum;
+}
+
+static int
+raft_scores_snprintf(const struct raft *raft, char *buf, int size)
+{
+	int total = 0;
+	bool is_empty = true;
+	SNPRINT(total, snprintf, buf, size, "{");
+	for (int i = 0; i < VCLOCK_MAX; ++i) {
+		int count = raft->votes[i].count;
+		if (count == 0)
+			continue;
+		if (!is_empty)
+			SNPRINT(total, snprintf, buf, size, ", ");
+		is_empty = false;
+		SNPRINT(total, snprintf, buf, size, "%d: %d", i, count);
+	}
+	SNPRINT(total, snprintf, buf, size, "}");
+	return total;
+}
+
+static const char *
+raft_scores_str(const struct raft *raft)
+{
+	char *buf = tt_static_buf();
+	int rc = raft_scores_snprintf(raft, buf, TT_STATIC_BUF_LEN);
+	assert(rc >= 0);
+	(void)rc;
+	return buf;
 }
 
 /** Schedule broadcast of the complete Raft state to all the followers. */
 static void
 raft_schedule_broadcast(struct raft *raft);
 
+/** If there is split vote, the node might reduce the next term delay. */
+static void
+raft_check_split_vote(struct raft *raft);
+
 /** Raft state machine methods. 'sm' stands for State Machine. */
 
 /**
@@ -351,7 +400,8 @@ raft_process_msg(struct raft *raft, const struct raft_msg *req, uint32_t source)
 	 * persisted long time ago and still broadcasted. Or a vote response.
 	 */
 	if (req->vote != 0) {
-		raft_add_vote(raft, source, req->vote);
+		if (raft_add_vote(raft, source, req->vote))
+		    raft_check_split_vote(raft);
 
 		switch (raft->state) {
 		case RAFT_STATE_FOLLOWER:
@@ -1019,6 +1069,13 @@ raft_cfg_vclock(struct raft *raft, const struct vclock *vclock)
 	raft->vclock = vclock;
 }
 
+void
+raft_cfg_cluster_size(struct raft *raft, int size)
+{
+	raft->cluster_size = size;
+	raft_check_split_vote(raft);
+}
+
 void
 raft_new_term(struct raft *raft)
 {
@@ -1043,6 +1100,46 @@ raft_schedule_broadcast(struct raft *raft)
 	raft_schedule_async(raft);
 }
 
+static void
+raft_check_split_vote(struct raft *raft)
+{
+	/* When leader is known, there is no election. Thus no vote to split. */
+	if (raft->leader != 0)
+		return;
+	/* Not a candidate = can't trigger term bump anyway. */
+	if (!raft->is_candidate)
+		return;
+	/*
+	 * WAL write in progress means the state is changing. All is rechecked
+	 * when it is done.
+	 */
+	if (raft->is_write_in_progress)
+		return;
+	if (!raft_has_split_vote(raft))
+		return;
+	assert(raft_ev_is_active(&raft->timer));
+	if (raft->timer.at < raft->election_timeout)
+		return;
+
+	assert(raft->state == RAFT_STATE_FOLLOWER ||
+	       raft->state == RAFT_STATE_CANDIDATE);
+	struct ev_loop *loop = raft_loop();
+	struct ev_timer *timer = &raft->timer;
+	double delay = raft_new_random_election_shift(raft);
+	/*
+	 * Could be too late to speed up anything - probably the term is almost
+	 * over anyway.
+	 */
+	double remaining = raft_ev_timer_remaining(loop, timer);
+	if (delay >= remaining)
+		delay = remaining;
+	say_info("RAFT: split vote is discovered - %s, new term in %lf sec",
+		 raft_scores_str(raft), delay);
+	raft_ev_timer_stop(loop, timer);
+	raft_ev_timer_set(timer, delay, delay);
+	raft_ev_timer_start(loop, timer);
+}
+
 void
 raft_create(struct raft *raft, const struct raft_vtab *vtab)
 {
@@ -1053,6 +1150,7 @@ raft_create(struct raft *raft, const struct raft_vtab *vtab)
 		.election_quorum = 1,
 		.election_timeout = 5,
 		.death_timeout = 5,
+		.cluster_size = VCLOCK_MAX,
 		.vtab = vtab,
 	};
 	raft_ev_timer_init(&raft->timer, raft_sm_schedule_new_election_cb,
diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h
index 4fbfaa91a..33597d153 100644
--- a/src/lib/raft/raft.h
+++ b/src/lib/raft/raft.h
@@ -219,6 +219,8 @@ struct raft {
 	 * elections can be started.
 	 */
 	double death_timeout;
+	/** Number of instances registered in the cluster. */
+	int cluster_size;
 	/** Virtual table to perform application-specific actions. */
 	const struct raft_vtab *vtab;
 	/**
@@ -333,6 +335,10 @@ raft_cfg_instance_id(struct raft *raft, uint32_t instance_id);
 void
 raft_cfg_vclock(struct raft *raft, const struct vclock *vclock);
 
+/** Configure number of registered instances in the cluster. */
+void
+raft_cfg_cluster_size(struct raft *raft, int size);
+
 /**
  * Bump the term. When it is persisted, the node checks if there is a leader,
  * and if there is not, a new election is started. That said, this function can
diff --git a/test/unit/raft.c b/test/unit/raft.c
index 9f18f4d8e..9b462f755 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -1497,10 +1497,214 @@ raft_test_promote_restore(void)
 	raft_finish_test();
 }
 
+static void
+raft_test_split_vote(void)
+{
+	raft_start_test(35);
+	struct raft_node node;
+	raft_node_create(&node);
+
+	/* Normal split vote. */
+
+	raft_node_cfg_cluster_size(&node, 4);
+	raft_node_cfg_election_quorum(&node, 3);
+	raft_run_next_event();
+
+	ok(raft_node_check_full_state(&node,
+		RAFT_STATE_CANDIDATE /* State. */,
+		0 /* Leader. */,
+		2 /* Term. */,
+		1 /* Vote. */,
+		2 /* Volatile term. */,
+		1 /* Volatile vote. */,
+		"{0: 1}" /* Vclock. */
+	), "elections with a new term");
+
+	/* Make so node 1 has votes 1 and 2. Node 3 has votes 3 and 4. */
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		1 /* Vote. */,
+		2 /* Source. */
+	), 0, "vote response for 1 from 2");
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		3 /* Vote. */,
+		3 /* Source. */
+	), 0, "vote response for 3 from 3");
+
+	ok(node.raft.timer.at >= node.raft.election_timeout, "term timeout >= "
+	   "election timeout normally");
+
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		3 /* Vote. */,
+		4 /* Source. */
+	), 0, "vote response for 3 from 4");
+
+	ok(node.raft.timer.at < node.raft.election_timeout, "split vote "
+	   "reduced the term timeout");
+
+	raft_run_next_event();
+	ok(raft_node_check_full_state(&node,
+		RAFT_STATE_CANDIDATE /* State. */,
+		0 /* Leader. */,
+		3 /* Term. */,
+		1 /* Vote. */,
+		3 /* Volatile term. */,
+		1 /* Volatile vote. */,
+		"{0: 2}" /* Vclock. */
+	), "a new term");
+
+	ok(node.raft.timer.at >= node.raft.election_timeout, "timeout is "
+	   "normal again");
+
+	/*
+	 * Cluster size change can make split vote.
+	 */
+	raft_node_destroy(&node);
+	raft_node_create(&node);
+	raft_node_cfg_cluster_size(&node, 3);
+	raft_node_cfg_election_quorum(&node, 2);
+	raft_run_next_event();
+	is(node.raft.state, RAFT_STATE_CANDIDATE, "is candidate");
+	is(node.raft.vote, 1, "voted for self");
+
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		2 /* Vote. */,
+		2 /* Source. */
+	), 0, "vote response for 2 from 2");
+	ok(node.raft.timer.at >= node.raft.election_timeout, "the vote is not "
+	   "split yet");
+
+	raft_node_cfg_cluster_size(&node, 2);
+	ok(node.raft.timer.at < node.raft.election_timeout, "cluster size "
+	   "change makes split vote");
+
+	raft_run_next_event();
+	ok(raft_node_check_full_state(&node,
+		RAFT_STATE_CANDIDATE /* State. */,
+		0 /* Leader. */,
+		3 /* Term. */,
+		1 /* Vote. */,
+		3 /* Volatile term. */,
+		1 /* Volatile vote. */,
+		"{0: 2}" /* Vclock. */
+	), "a new term");
+
+	/*
+	 * Split vote can be even when the leader is already known - then
+	 * nothing to do. Votes are just left from the beginning of the term and
+	 * then probably cluster size reduced a bit.
+	 */
+	raft_node_destroy(&node);
+	raft_node_create(&node);
+	raft_node_cfg_cluster_size(&node, 3);
+	raft_node_cfg_election_quorum(&node, 2);
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		2 /* Vote. */,
+		2 /* Source. */
+	), 0, "vote response for 2 from 2");
+	/* There is also a vote from 3 for 2. But it wasn't delivered to 1. */
+	is(raft_node_send_leader(&node,
+		2 /* Term. */,
+		2 /* Source. */
+	), 0, "message is accepted");
+	is(node.raft.leader, 2, "other node's leadership is accepted");
+	is(raft_vote_count(&node.raft), 1, "the only own vote was from self");
+
+	raft_node_cfg_cluster_size(&node, 2);
+	ok(node.raft.timer.at >= node.raft.death_timeout, "cluster change does "
+	   "not affect the leader's death waiting");
+
+	/*
+	 * Non-candidate should ignore split vote.
+	 */
+	raft_node_destroy(&node);
+	raft_node_create(&node);
+	raft_node_cfg_cluster_size(&node, 3);
+	raft_node_cfg_election_quorum(&node, 3);
+	raft_node_cfg_is_candidate(&node, false);
+
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		2 /* Vote. */,
+		2 /* Source. */
+	), 0, "vote response for 2 from 2");
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		3 /* Vote. */,
+		3 /* Source. */
+	), 0, "vote response for 3 from 3");
+
+	ok(!raft_ev_is_active(&node.raft.timer), "voter couldn't schedule "
+	    "new term");
+
+	/*
+	 * Split vote can get worse, but it shouldn't lead to new term delay
+	 * restart.
+	 */
+	raft_node_destroy(&node);
+	raft_node_create(&node);
+	raft_node_cfg_cluster_size(&node, 3);
+	raft_node_cfg_election_quorum(&node, 3);
+
+	raft_run_next_event();
+	is(node.raft.term, 2, "bump term");
+	is(node.raft.vote, 1, "vote for self");
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		2 /* Vote. */,
+		2 /* Source. */
+	), 0, "vote response for 2 from 2");
+
+	double delay = node.raft.timer.at;
+	ok(delay < node.raft.election_timeout, "split vote is already "
+	   "inevitable");
+
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		3 /* Vote. */,
+		3 /* Source. */
+	), 0, "vote response for 3 from 3");
+
+	is(delay, node.raft.timer.at, "split vote got worse, but delay didn't "
+	   "change");
+
+	/*
+	 * Handle split vote when WAL write is in progress.
+	 */
+	raft_node_destroy(&node);
+	raft_node_create(&node);
+	raft_node_cfg_cluster_size(&node, 2);
+	raft_node_cfg_election_quorum(&node, 2);
+
+	raft_node_block(&node);
+	raft_run_next_event();
+	is(node.raft.term, 1, "old term");
+	is(node.raft.vote, 0, "unused vote");
+	is(node.raft.volatile_term, 2, "new volatile term");
+	is(node.raft.volatile_vote, 1, "new volatile vote");
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		2 /* Vote. */,
+		2 /* Source. */
+	), 0, "vote response for 2 from 2");
+
+	raft_node_unblock(&node);
+	raft_run_next_event();
+	is(node.raft.term, 3, "bump term");
+	is(node.raft.vote, 1, "vote for self");
+
+	raft_node_destroy(&node);
+	raft_finish_test();
+}
+
 static int
 main_f(va_list ap)
 {
-	raft_start_test(14);
+	raft_start_test(15);
 
 	(void) ap;
 	fakeev_init();
@@ -1519,6 +1723,7 @@ main_f(va_list ap)
 	raft_test_enable_disable();
 	raft_test_too_long_wal_write();
 	raft_test_promote_restore();
+	raft_test_split_vote();
 
 	fakeev_free();
 
diff --git a/test/unit/raft.result b/test/unit/raft.result
index 3fcdf8180..d12aae710 100644
--- a/test/unit/raft.result
+++ b/test/unit/raft.result
@@ -1,5 +1,5 @@
 	*** main_f ***
-1..14
+1..15
 	*** raft_test_leader_election ***
     1..24
     ok 1 - 1 pending message at start
@@ -262,4 +262,43 @@ ok 13 - subtests
     ok 12 - not a candidate
 ok 14 - subtests
 	*** raft_test_promote_restore: done ***
+	*** raft_test_split_vote ***
+    1..35
+    ok 1 - elections with a new term
+    ok 2 - vote response for 1 from 2
+    ok 3 - vote response for 3 from 3
+    ok 4 - term timeout >= election timeout normally
+    ok 5 - vote response for 3 from 4
+    ok 6 - split vote reduced the term timeout
+    ok 7 - a new term
+    ok 8 - timeout is normal again
+    ok 9 - is candidate
+    ok 10 - voted for self
+    ok 11 - vote response for 2 from 2
+    ok 12 - the vote is not split yet
+    ok 13 - cluster size change makes split vote
+    ok 14 - a new term
+    ok 15 - vote response for 2 from 2
+    ok 16 - message is accepted
+    ok 17 - other node's leadership is accepted
+    ok 18 - the only own vote was from self
+    ok 19 - cluster change does not affect the leader's death waiting
+    ok 20 - vote response for 2 from 2
+    ok 21 - vote response for 3 from 3
+    ok 22 - voter couldn't schedule new term
+    ok 23 - bump term
+    ok 24 - vote for self
+    ok 25 - vote response for 2 from 2
+    ok 26 - split vote is already inevitable
+    ok 27 - vote response for 3 from 3
+    ok 28 - split vote got worse, but delay didn't change
+    ok 29 - old term
+    ok 30 - unused vote
+    ok 31 - new volatile term
+    ok 32 - new volatile vote
+    ok 33 - vote response for 2 from 2
+    ok 34 - bump term
+    ok 35 - vote for self
+ok 15 - subtests
+	*** raft_test_split_vote: done ***
 	*** main_f: done ***
diff --git a/test/unit/raft_test_utils.c b/test/unit/raft_test_utils.c
index b5754cf78..1d7dd01ba 100644
--- a/test/unit/raft_test_utils.c
+++ b/test/unit/raft_test_utils.c
@@ -194,6 +194,7 @@ raft_node_create(struct raft_node *node)
 	node->cfg_election_quorum = 3;
 	node->cfg_death_timeout = 5;
 	node->cfg_instance_id = 1;
+	node->cfg_cluster_size = VCLOCK_MAX;
 	node->cfg_vclock = &node->journal.vclock;
 	raft_journal_create(&node->journal, node->cfg_instance_id);
 	raft_node_start(node);
@@ -365,6 +366,7 @@ raft_node_start(struct raft_node *node)
 	raft_cfg_election_quorum(&node->raft, node->cfg_election_quorum);
 	raft_cfg_death_timeout(&node->raft, node->cfg_death_timeout);
 	raft_cfg_instance_id(&node->raft, node->cfg_instance_id);
+	raft_cfg_cluster_size(&node->raft, node->cfg_cluster_size);
 	raft_cfg_vclock(&node->raft, node->cfg_vclock);
 	raft_run_async_work();
 }
@@ -423,6 +425,16 @@ raft_node_cfg_is_candidate(struct raft_node *node, bool value)
 	}
 }
 
+void
+raft_node_cfg_cluster_size(struct raft_node *node, int value)
+{
+	node->cfg_cluster_size = value;
+	if (raft_node_is_started(node)) {
+		raft_cfg_cluster_size(&node->raft, value);
+		raft_run_async_work();
+	}
+}
+
 void
 raft_node_cfg_election_timeout(struct raft_node *node, double value)
 {
diff --git a/test/unit/raft_test_utils.h b/test/unit/raft_test_utils.h
index c68dc3b22..2138a829e 100644
--- a/test/unit/raft_test_utils.h
+++ b/test/unit/raft_test_utils.h
@@ -32,6 +32,7 @@
 #include "fakesys/fakeev.h"
 #include "fiber.h"
 #include "raft/raft.h"
+#include "raft/raft_ev.h"
 #include "unit.h"
 
 /** WAL simulation. It stores a list of rows which raft wanted to persist. */
@@ -105,6 +106,7 @@ struct raft_node {
 	int cfg_election_quorum;
 	double cfg_death_timeout;
 	uint32_t cfg_instance_id;
+	int cfg_cluster_size;
 	struct vclock *cfg_vclock;
 };
 
@@ -227,6 +229,9 @@ raft_node_cfg_is_enabled(struct raft_node *node, bool value);
 void
 raft_node_cfg_is_candidate(struct raft_node *node, bool value);
 
+void
+raft_node_cfg_cluster_size(struct raft_node *node, int value);
+
 void
 raft_node_cfg_election_timeout(struct raft_node *node, double value);
 
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list