[Tarantool-patches] [PATCH 2/4] raft: track all votes, even not own

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sat Jan 15 03:48:54 MSK 2022


To detect split vote a node needs to see that number of free votes
is not enough for anyone to win even if it gets them all.

Hence every node needs to count votes for all other nodes.

The patch makes raft store votes in a bit more complicated manner
than a simple bitmap for just the current instance.

Part of #5285
---
 src/lib/raft/raft.c   | 41 +++++++++++++++++++++++++++++++++--------
 src/lib/raft/raft.h   | 24 +++++++++++++++++-------
 test/unit/raft.c      | 17 ++++++++++-------
 test/unit/raft.result |  7 ++++---
 4 files changed, 64 insertions(+), 25 deletions(-)

diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 1ae8fe87f..289d53fd5 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -152,6 +152,16 @@ raft_can_vote_for(const struct raft *raft, const struct vclock *v)
 	return cmp == 0 || cmp == 1;
 }
 
+static inline void
+raft_add_vote(struct raft *raft, int src, int dst)
+{
+	struct raft_vote *v = &raft->votes[src];
+	if (v->did_vote)
+		return;
+	v->did_vote = true;
+	++raft->votes[dst].count;
+}
+
 /** Schedule broadcast of the complete Raft state to all the followers. */
 static void
 raft_schedule_broadcast(struct raft *raft);
@@ -279,6 +289,12 @@ void
 raft_process_recovery(struct raft *raft, const struct raft_msg *req)
 {
 	say_verbose("RAFT: recover %s", raft_msg_to_string(req));
+	/*
+	 * Instance ID is not unknown until recovery ends. Because apparently it
+	 * can change during join. In Raft it is set only one time when recovery
+	 * ends for good.
+	 */
+	assert(raft->self == 0);
 	if (req->term != 0) {
 		raft->term = req->term;
 		raft->volatile_term = req->term;
@@ -335,6 +351,8 @@ raft_process_msg(struct raft *raft, const struct raft_msg *req, uint32_t source)
 	 * persisted long time ago and still broadcasted. Or a vote response.
 	 */
 	if (req->vote != 0) {
+		raft_add_vote(raft, source, req->vote);
+
 		switch (raft->state) {
 		case RAFT_STATE_FOLLOWER:
 		case RAFT_STATE_LEADER:
@@ -395,11 +413,10 @@ raft_process_msg(struct raft *raft, const struct raft_msg *req, uint32_t source)
 			 * and now was answered by some other instance.
 			 */
 			assert(raft->volatile_vote == raft->self);
-			bool was_set = bit_set(&raft->vote_mask, source);
-			raft->vote_count += !was_set;
-			if (raft->vote_count < raft->election_quorum) {
+			int vote_count = raft_vote_count(raft);
+			if (vote_count < raft->election_quorum) {
 				say_info("RAFT: accepted vote for self, vote "
-					 "count is %d/%d", raft->vote_count,
+					 "count is %d/%d", vote_count,
 					 raft->election_quorum);
 				break;
 			}
@@ -640,13 +657,11 @@ raft_sm_become_candidate(struct raft *raft)
 	assert(raft->state == RAFT_STATE_FOLLOWER);
 	assert(raft->leader == 0);
 	assert(raft->vote == raft->self);
+	assert(raft_vote_count(raft) >= 1);
 	assert(raft->is_candidate);
 	assert(!raft->is_write_in_progress);
 	assert(raft->election_quorum > 1);
 	raft->state = RAFT_STATE_CANDIDATE;
-	raft->vote_count = 1;
-	raft->vote_mask = 0;
-	bit_set(&raft->vote_mask, raft->self);
 	raft_sm_wait_election_end(raft);
 	/* State is visible and it is changed - broadcast. */
 	raft_schedule_broadcast(raft);
@@ -663,6 +678,7 @@ raft_sm_schedule_new_term(struct raft *raft, uint64_t new_term)
 	raft->volatile_vote = 0;
 	raft->leader = 0;
 	raft->state = RAFT_STATE_FOLLOWER;
+	memset(raft->votes, 0, sizeof(raft->votes));
 	/*
 	 * The instance could be promoted for the previous term. But promotion
 	 * has no effect on following terms.
@@ -685,6 +701,8 @@ raft_sm_schedule_new_vote(struct raft *raft, uint32_t new_vote)
 	assert(raft->leader == 0);
 	assert(raft->state == RAFT_STATE_FOLLOWER);
 	raft->volatile_vote = new_vote;
+	assert(!raft->votes[raft->self].did_vote);
+	raft_add_vote(raft, raft->self, raft->self);
 	raft_sm_pause_and_dump(raft);
 	/* Nothing visible is changed - no broadcast. */
 }
@@ -956,7 +974,7 @@ raft_cfg_election_quorum(struct raft *raft, int election_quorum)
 	assert(election_quorum > 0);
 	raft->election_quorum = election_quorum;
 	if (raft->state == RAFT_STATE_CANDIDATE &&
-	    raft->vote_count >= raft->election_quorum)
+	    raft_vote_count(raft) >= raft->election_quorum)
 		raft_sm_become_leader(raft);
 }
 
@@ -984,6 +1002,13 @@ raft_cfg_instance_id(struct raft *raft, uint32_t instance_id)
 	assert(raft->self == 0);
 	assert(instance_id != 0);
 	raft->self = instance_id;
+	/*
+	 * Couldn't do that reliably during recovery. Instance ID can change
+	 * more than once during join. Here instance ID is configured when it is
+	 * known forever and is safe to use.
+	 */
+	if (raft->vote != 0)
+		raft_add_vote(raft, instance_id, raft->vote);
 }
 
 void
diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h
index 53e2c58a8..4fbfaa91a 100644
--- a/src/lib/raft/raft.h
+++ b/src/lib/raft/raft.h
@@ -138,6 +138,14 @@ struct raft_vtab {
 	raft_schedule_async_f schedule_async;
 };
 
+/** Vote descriptor of a single node. */
+struct raft_vote {
+	/** Whether the node voted for anybody. */
+	bool did_vote;
+	/** How many votes the node got in the current term. */
+	int count;
+};
+
 struct raft {
 	/** Instance ID of this node. */
 	uint32_t self;
@@ -189,13 +197,8 @@ struct raft {
 	 */
 	uint64_t term;
 	uint32_t vote;
-	/**
-	 * Bit 1 on position N means that a vote from instance with ID = N was
-	 * obtained.
-	 */
-	vclock_map_t vote_mask;
-	/** Number of votes for this instance. Valid only in candidate state. */
-	int vote_count;
+	/** Statistics which node voted for who. */
+	struct raft_vote votes[VCLOCK_MAX];
 	/** Number of votes necessary for successful election. */
 	int election_quorum;
 	/**
@@ -243,6 +246,13 @@ raft_is_enabled(const struct raft *raft)
 	return raft->is_enabled;
 }
 
+/** Number of votes for self. */
+static inline int
+raft_vote_count(const struct raft *raft)
+{
+	return raft->votes[raft->self].count;
+}
+
 /** Process a raft entry stored in WAL/snapshot. */
 void
 raft_process_recovery(struct raft *raft, const struct raft_msg *req);
diff --git a/test/unit/raft.c b/test/unit/raft.c
index 520b94466..9f18f4d8e 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -70,7 +70,7 @@ raft_test_leader_election(void)
 		1 /* Volatile vote. */,
 		"{0: 1}" /* Vclock. */
 	), "elections with a new term");
-	is(node.raft.vote_count, 1, "single vote for self");
+	is(raft_vote_count(&node.raft), 1, "single vote for self");
 	ok(node.update_count > 0, "trigger worked");
 	node.update_count = 0;
 
@@ -100,7 +100,7 @@ raft_test_leader_election(void)
 		1 /* Vote. */,
 		2 /* Source. */
 	), 0, "vote response from 2");
-	is(node.raft.vote_count, 2, "2 votes - 1 self and 1 foreign");
+	is(raft_vote_count(&node.raft), 2, "2 votes - 1 self and 1 foreign");
 	ok(!node.has_work, "no work to do - not enough votes yet");
 
 	raft_run_for(node.cfg_election_timeout / 2);
@@ -115,7 +115,7 @@ raft_test_leader_election(void)
 		1 /* Vote. */,
 		3 /* Source. */
 	), 0, "vote response from 3");
-	is(node.raft.vote_count, 3, "2 votes - 1 self and 2 foreign");
+	is(raft_vote_count(&node.raft), 3, "2 votes - 1 self and 2 foreign");
 	is(node.raft.state, RAFT_STATE_LEADER, "became leader");
 	ok(node.update_count > 0, "trigger worked");
 	node.update_count = 0;
@@ -142,7 +142,7 @@ raft_test_leader_election(void)
 static void
 raft_test_recovery(void)
 {
-	raft_start_test(12);
+	raft_start_test(13);
 	struct raft_msg msg;
 	struct raft_node node;
 	raft_node_create(&node);
@@ -224,6 +224,8 @@ raft_test_recovery(void)
 		"{0: 1}" /* Vclock. */
 	), "restart always as a follower");
 
+	is(raft_vote_count(&node.raft), 1, "vote count is restored correctly");
+
 	raft_checkpoint_remote(&node.raft, &msg);
 	ok(raft_msg_check(&msg,
 		RAFT_STATE_FOLLOWER /* State. */,
@@ -383,7 +385,7 @@ raft_test_vote_skip(void)
 		1 /* Vote. */,
 		2 /* Source. */
 	), 0, "message is accepted");
-	is(node.raft.vote_count, 1, "but ignored - too old term");
+	is(raft_vote_count(&node.raft), 1, "but ignored - too old term");
 
 	/* Competing vote requests are skipped. */
 
@@ -392,7 +394,8 @@ raft_test_vote_skip(void)
 		3 /* Vote. */,
 		2 /* Source. */
 	), 0, "message is accepted");
-	is(node.raft.vote_count, 1, "but ignored - vote not for this node");
+	is(raft_vote_count(&node.raft), 1,
+	   "but ignored - vote not for this node");
 	is(node.raft.state, RAFT_STATE_CANDIDATE, "this node does not give up");
 
 	/* Vote requests are ignored when node is disabled. */
@@ -1071,7 +1074,7 @@ raft_test_election_quorum(void)
 		1 /* Vote. */,
 		2 /* Source. */
 	), 0, "send vote response from second node");
-	is(node.raft.vote_count, 2, "vote is accepted");
+	is(raft_vote_count(&node.raft), 2, "vote is accepted");
 	is(node.raft.state, RAFT_STATE_CANDIDATE, "but still candidate");
 
 	raft_node_cfg_election_quorum(&node, 2);
diff --git a/test/unit/raft.result b/test/unit/raft.result
index 764d82969..3fcdf8180 100644
--- a/test/unit/raft.result
+++ b/test/unit/raft.result
@@ -29,7 +29,7 @@
 ok 1 - subtests
 	*** raft_test_leader_election: done ***
 	*** raft_test_recovery ***
-    1..12
+    1..13
     ok 1 - became candidate
     ok 2 - remote checkpoint of a candidate
     ok 3 - local checkpoint of a candidate
@@ -40,8 +40,9 @@ ok 1 - subtests
     ok 8 - remote checkpoint of a leader
     ok 9 - local checkpoint of a leader
     ok 10 - restart always as a follower
-    ok 11 - remote checkpoint of a leader
-    ok 12 - local checkpoint of a leader
+    ok 11 - vote count is restored correctly
+    ok 12 - remote checkpoint of a leader
+    ok 13 - local checkpoint of a leader
 ok 2 - subtests
 	*** raft_test_recovery: done ***
 	*** raft_test_bad_msg ***
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list