[Tarantool-patches] [PATCH 3/4] raft: introduce split vote detection
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Thu Jan 20 03:44:23 MSK 2022
Hi! Thanks for the review!
>> diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
>> index 289d53fd5..5dcbc7821 100644
>> --- a/src/lib/raft/raft.c
>> +++ b/src/lib/raft/raft.c
>> @@ -152,20 +152,69 @@ raft_can_vote_for(const struct raft *raft, const struct vclock *v)
>> return cmp == 0 || cmp == 1;
>> }
>> -static inline void
>> +static inline bool
>> raft_add_vote(struct raft *raft, int src, int dst)
>> {
>> struct raft_vote *v = &raft->votes[src];
>> if (v->did_vote)
>> - return;
>> + return false;
>> v->did_vote = true;
>> ++raft->votes[dst].count;
>> + return true;
>> +}
>> +
>
> You may check split_vote right in raft_add_vote:
> simply track number of votes given in this term and
> max votes given for one instance.
>
> This way you won't have to run over all 32 nodes each time a vote
> is casted.
I did the fullscan intentionally. Otherwise I need to introduce 2
new members to struct raft, keep them up to date, clear on term
bump. Too easy to miss something and introduce a bug. While in
the current version all the split-vote-specific details are in a
single function except for 'raft.votes' member. This thing I couldn't
get rid of.
As for perf, a couple of ifs or a loop over 32 structs - both would
take order of nanoseconds anyway. Here I wouldn't bother. Simplicity
matters most.
I did the proposal to see how it looks but then discarded as more
complex than necessary. However if you think it is still worth doing,
tell me and I will re-apply the diff.
====================
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index c4f5fb059..fcce3126a 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -159,23 +159,20 @@ raft_add_vote(struct raft *raft, int src, int dst)
if (v->did_vote)
return false;
v->did_vote = true;
- ++raft->votes[dst].count;
+ ++raft->voted_count;
+ int count = ++raft->votes[dst].count;
+ if (count > raft->max_vote_count)
+ raft->max_vote_count = count;
return true;
}
static bool
raft_has_split_vote(const struct raft *raft)
{
- int max_vote = 0;
- int vote_vac = raft->cluster_size;
- int quorum = raft->election_quorum;
- for (int i = 0; i < VCLOCK_MAX; ++i) {
- int count = raft->votes[i].count;
- vote_vac -= count;
- if (count > max_vote)
- max_vote = count;
- }
- return max_vote + vote_vac < quorum;
+ int vote_vac = raft->cluster_size - raft->voted_count;
+ if (vote_vac < 0)
+ vote_vac = 0;
+ return raft->max_vote_count + vote_vac < raft->election_quorum;
}
static int
@@ -730,6 +727,8 @@ raft_sm_schedule_new_term(struct raft *raft, uint64_t new_term)
raft->leader = 0;
raft->state = RAFT_STATE_FOLLOWER;
memset(raft->votes, 0, sizeof(raft->votes));
+ raft->max_vote_count = 0;
+ raft->voted_count = 0;
/*
* The instance could be promoted for the previous term. But promotion
* has no effect on following terms.
diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h
index 817148792..7115f658f 100644
--- a/src/lib/raft/raft.h
+++ b/src/lib/raft/raft.h
@@ -199,6 +199,10 @@ struct raft {
uint32_t vote;
/** Statistics which node voted for who. */
struct raft_vote votes[VCLOCK_MAX];
+ /** How many nodes voted in the current term. */
+ int voted_count;
+ /** Max vote count given to any node in the current term. */
+ int max_vote_count;
/** Number of votes necessary for successful election. */
int election_quorum;
/**
====================
>> +static bool
>> +raft_has_split_vote(const struct raft *raft)
>> +{
>> + int max_vote = 0;
>> + int vote_vac = raft->cluster_size;
>> + int quorum = raft->election_quorum;
>> + for (int i = 0; i < VCLOCK_MAX; ++i) {
>> + int count = raft->votes[i].count;
>> + vote_vac -= count;
>> + if (count > max_vote)
>> + max_vote = count;
>> + }
>> + return max_vote < quorum && max_vote + vote_vac < quorum;
>
> This is equal to `return max_vote + vote_vac < quorum`
Ouch, that was stupid indeed. I honestly did multiple self-reviews
and still missed it.
====================
@@ -175,7 +175,7 @@ raft_has_split_vote(const struct raft *raft)
if (count > max_vote)
max_vote = count;
}
- return max_vote < quorum && max_vote + vote_vac < quorum;
+ return max_vote + vote_vac < quorum;
}
====================
>> +}
>> +
>> +static int
>> +raft_scores_snprintf(const struct raft *raft, char *buf, int size)
>> +{
>> + int total = 0;
>> + bool is_empty = true;
>> + SNPRINT(total, snprintf, buf, size, "{");
>> + for (int i = 0; i < VCLOCK_MAX; ++i) {
>> + int count = raft->votes[i].count;
>> + if (count == 0)
>> + continue;
>> + if (!is_empty)
>> + SNPRINT(total, snprintf, buf, size, ", ");
>> + is_empty = false;
>
> Nit: you may move is_empty = false into the 'else' branch.
I put it here to write 1 line less. But I don't mind.
====================
@@ -190,7 +190,8 @@ raft_scores_snprintf(const struct raft *raft, char *buf, int size)
continue;
if (!is_empty)
SNPRINT(total, snprintf, buf, size, ", ");
- is_empty = false;
+ else
+ is_empty = false;
SNPRINT(total, snprintf, buf, size, "%d: %d", i, count);
}
====================
>> + SNPRINT(total, snprintf, buf, size, "%d: %d", i, count);
>> + }
>> + SNPRINT(total, snprintf, buf, size, "}");
>> + return total;
>> +}
>> +
>
> ...
>
>> +static void
>> +raft_check_split_vote(struct raft *raft)
>> +{
>> + /* When leader is known, there is no election. Thus no vote to split. */
>> + if (raft->leader != 0)
>> + return;
>> + /* Not a candidate = can't trigger term bump anyway. */
>> + if (!raft->is_candidate)
>> + return;
>> + /*
>> + * WAL write in progress means the state is changing. All is rechecked
>> + * when it is done.
>> + */
>> + if (raft->is_write_in_progress)
>> + return;
>> + if (!raft_has_split_vote(raft))
>> + return;
>> + assert(raft_ev_is_active(&raft->timer));
>> + if (raft->timer.at < raft->election_timeout)
>> + return;
>
> I don't understand that. timer.at should point at current time, shouldn't it?
Yes, thanks for noticing. This is a bug. And in some other existing places too.
I made a separate commit fixing the existing places, see v2. And I fixed this
place below. Replaced with ev_timer.repeat.
====================
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 630f8c677..c4f5fb059 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -1124,7 +1124,7 @@ raft_check_split_vote(struct raft *raft)
if (!raft_has_split_vote(raft))
return;
assert(raft_ev_is_active(&raft->timer));
- if (raft->timer.at < raft->election_timeout)
+ if (raft->timer.repeat < raft->election_timeout)
return;
assert(raft->state == RAFT_STATE_FOLLOWER ||
diff --git a/test/unit/raft.c b/test/unit/raft.c
index 9b462f755..f3ed453ed 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -1532,8 +1532,8 @@ raft_test_split_vote(void)
3 /* Source. */
), 0, "vote response for 3 from 3");
- ok(node.raft.timer.at >= node.raft.election_timeout, "term timeout >= "
- "election timeout normally");
+ ok(node.raft.timer.repeat >= node.raft.election_timeout, "term "
+ "timeout >= election timeout normally");
is(raft_node_send_vote_response(&node,
2 /* Term. */,
@@ -1541,7 +1541,7 @@ raft_test_split_vote(void)
4 /* Source. */
), 0, "vote response for 3 from 4");
- ok(node.raft.timer.at < node.raft.election_timeout, "split vote "
+ ok(node.raft.timer.repeat < node.raft.election_timeout, "split vote "
"reduced the term timeout");
raft_run_next_event();
@@ -1555,7 +1555,7 @@ raft_test_split_vote(void)
"{0: 2}" /* Vclock. */
), "a new term");
- ok(node.raft.timer.at >= node.raft.election_timeout, "timeout is "
+ ok(node.raft.timer.repeat >= node.raft.election_timeout, "timeout is "
"normal again");
/*
@@ -1574,11 +1574,11 @@ raft_test_split_vote(void)
2 /* Vote. */,
2 /* Source. */
), 0, "vote response for 2 from 2");
- ok(node.raft.timer.at >= node.raft.election_timeout, "the vote is not "
- "split yet");
+ ok(node.raft.timer.repeat >= node.raft.election_timeout, "the vote is "
+ "not split yet");
raft_node_cfg_cluster_size(&node, 2);
- ok(node.raft.timer.at < node.raft.election_timeout, "cluster size "
+ ok(node.raft.timer.repeat < node.raft.election_timeout, "cluster size "
"change makes split vote");
raft_run_next_event();
@@ -1615,8 +1615,8 @@ raft_test_split_vote(void)
is(raft_vote_count(&node.raft), 1, "the only own vote was from self");
raft_node_cfg_cluster_size(&node, 2);
- ok(node.raft.timer.at >= node.raft.death_timeout, "cluster change does "
- "not affect the leader's death waiting");
+ ok(node.raft.timer.repeat >= node.raft.death_timeout, "cluster change "
+ "does not affect the leader's death waiting");
/*
* Non-candidate should ignore split vote.
@@ -1659,7 +1659,7 @@ raft_test_split_vote(void)
2 /* Source. */
), 0, "vote response for 2 from 2");
- double delay = node.raft.timer.at;
+ double delay = node.raft.timer.repeat;
ok(delay < node.raft.election_timeout, "split vote is already "
"inevitable");
@@ -1669,8 +1669,8 @@ raft_test_split_vote(void)
3 /* Source. */
), 0, "vote response for 3 from 3");
- is(delay, node.raft.timer.at, "split vote got worse, but delay didn't "
- "change");
+ is(delay, node.raft.timer.repeat, "split vote got worse, but delay "
+ "didn't change");
/*
* Handle split vote when WAL write is in progress.
====================
Additionally, I disabled split vote detection for cluster < quorum.
It would only increase rate of term bumps and wouldn't solve the problem.
====================
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index c4f5fb059..098f60ed9 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -169,6 +169,12 @@ raft_has_split_vote(const struct raft *raft)
int max_vote = 0;
int vote_vac = raft->cluster_size;
int quorum = raft->election_quorum;
+ /*
+ * Quorum > cluster is either a misconfiguration or some instances
+ * didn't register yet. Anyway, speeding the elections up won't help.
+ */
+ if (vote_vac < quorum)
+ return false;
for (int i = 0; i < VCLOCK_MAX; ++i) {
int count = raft->votes[i].count;
vote_vac -= count;
diff --git a/test/unit/raft.c b/test/unit/raft.c
index f3ed453ed..e88d12a5c 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -1500,7 +1500,7 @@ raft_test_promote_restore(void)
static void
raft_test_split_vote(void)
{
- raft_start_test(35);
+ raft_start_test(39);
struct raft_node node;
raft_node_create(&node);
@@ -1697,6 +1697,27 @@ raft_test_split_vote(void)
is(node.raft.term, 3, "bump term");
is(node.raft.vote, 1, "vote for self");
+ /*
+ * Split vote check is disabled when cluster size < quorum. Makes no
+ * sense to speed the elections up.
+ */
+ raft_node_destroy(&node);
+ raft_node_create(&node);
+ raft_node_cfg_cluster_size(&node, 1);
+ raft_node_cfg_election_quorum(&node, 2);
+
+ raft_run_next_event();
+ is(node.raft.term, 2, "bump term");
+ is(node.raft.vote, 1, "vote for self");
+ is(raft_node_send_vote_response(&node,
+ 2 /* Term. */,
+ 2 /* Vote. */,
+ 2 /* Source. */
+ ), 0, "vote response for 2 from 2");
+
+ ok(node.raft.timer.repeat >= node.raft.election_timeout, "split vote "
+ "is not checked for cluster < quorum");
+
raft_node_destroy(&node);
raft_finish_test();
}
diff --git a/test/unit/raft.result b/test/unit/raft.result
index d12aae710..465bfb97d 100644
--- a/test/unit/raft.result
+++ b/test/unit/raft.result
@@ -263,7 +263,7 @@ ok 13 - subtests
ok 14 - subtests
*** raft_test_promote_restore: done ***
*** raft_test_split_vote ***
- 1..35
+ 1..39
ok 1 - elections with a new term
ok 2 - vote response for 1 from 2
ok 3 - vote response for 3 from 3
@@ -299,6 +299,10 @@ ok 14 - subtests
ok 33 - vote response for 2 from 2
ok 34 - bump term
ok 35 - vote for self
+ ok 36 - bump term
+ ok 37 - vote for self
+ ok 38 - vote response for 2 from 2
+ ok 39 - split vote is not checked for cluster < quorum
ok 15 - subtests
*** raft_test_split_vote: done ***
*** main_f: done ***
====================
Then I fixed a similar issue with voted node count > cluster size.
====================
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 098f60ed9..50e0dfe87 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -181,6 +181,13 @@ raft_has_split_vote(const struct raft *raft)
if (count > max_vote)
max_vote = count;
}
+ /*
+ * More nodes voted than there are nodes configured. The reason is the
+ * the same as with quorum > cluster. The action is also the same -
+ * faster term bumps won't help.
+ */
+ if (vote_vac < 0)
+ return false;
return max_vote + vote_vac < quorum;
}
diff --git a/test/unit/raft.c b/test/unit/raft.c
index e88d12a5c..3d5e2c777 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -1500,7 +1500,7 @@ raft_test_promote_restore(void)
static void
raft_test_split_vote(void)
{
- raft_start_test(39);
+ raft_start_test(46);
struct raft_node node;
raft_node_create(&node);
@@ -1718,6 +1718,43 @@ raft_test_split_vote(void)
ok(node.raft.timer.repeat >= node.raft.election_timeout, "split vote "
"is not checked for cluster < quorum");
+ /*
+ * Split vote check is disabled when vote count > cluster size. The
+ * reason is the same as with quorum > cluster size - something is odd,
+ * more term bumps won't help.
+ */
+ raft_node_destroy(&node);
+ raft_node_create(&node);
+ raft_node_cfg_cluster_size(&node, 3);
+ raft_node_cfg_election_quorum(&node, 2);
+
+ raft_run_next_event();
+ is(node.raft.term, 2, "bump term");
+ is(node.raft.vote, 1, "vote for self");
+ is(raft_node_send_vote_response(&node,
+ 2 /* Term. */,
+ 2 /* Vote. */,
+ 2 /* Source. */
+ ), 0, "vote response for 2 from 2");
+ is(raft_node_send_vote_response(&node,
+ 2 /* Term. */,
+ 2 /* Vote. */,
+ 3 /* Source. */
+ ), 0, "vote response for 2 from 3");
+ is(raft_node_send_vote_response(&node,
+ 2 /* Term. */,
+ 3 /* Vote. */,
+ 4 /* Source. */
+ ), 0, "vote response for 3 from 4");
+ is(raft_node_send_vote_response(&node,
+ 2 /* Term. */,
+ 4 /* Vote. */,
+ 5 /* Source. */
+ ), 0, "vote response for 4 from 5");
+
+ ok(node.raft.timer.repeat >= node.raft.election_timeout, "split vote "
+ "is not checked when vote count > cluster size");
+
raft_node_destroy(&node);
raft_finish_test();
}
diff --git a/test/unit/raft.result b/test/unit/raft.result
index 465bfb97d..cce1891c4 100644
--- a/test/unit/raft.result
+++ b/test/unit/raft.result
@@ -263,7 +263,7 @@ ok 13 - subtests
ok 14 - subtests
*** raft_test_promote_restore: done ***
*** raft_test_split_vote ***
- 1..39
+ 1..46
ok 1 - elections with a new term
ok 2 - vote response for 1 from 2
ok 3 - vote response for 3 from 3
@@ -303,6 +303,13 @@ ok 14 - subtests
ok 37 - vote for self
ok 38 - vote response for 2 from 2
ok 39 - split vote is not checked for cluster < quorum
+ ok 40 - bump term
+ ok 41 - vote for self
+ ok 42 - vote response for 2 from 2
+ ok 43 - vote response for 2 from 3
+ ok 44 - vote response for 3 from 4
+ ok 45 - vote response for 4 from 5
+ ok 46 - split vote is not checked when vote count > cluster size
ok 15 - subtests
*** raft_test_split_vote: done ***
*** main_f: done ***
====================
Then I realized one of my tests wasn't accurate enough - split vote
during WAL write wasn't properly checked.
====================
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index f606f9e79..cd7c91c8b 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -835,6 +835,11 @@ raft_sm_wait_election_end(struct raft *raft)
raft_new_random_election_shift(raft);
raft_ev_timer_set(&raft->timer, election_timeout, election_timeout);
raft_ev_timer_start(raft_loop(), &raft->timer);
+ /*
+ * Could start the waiting after a WAL write during which the split vote
+ * could happen.
+ */
+ raft_check_split_vote(raft);
}
static void
diff --git a/test/unit/raft.c b/test/unit/raft.c
index 3d5e2c777..4aa3f4959 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -1500,7 +1500,7 @@ raft_test_promote_restore(void)
static void
raft_test_split_vote(void)
{
- raft_start_test(46);
+ raft_start_test(51);
struct raft_node node;
raft_node_create(&node);
@@ -1693,6 +1693,13 @@ raft_test_split_vote(void)
), 0, "vote response for 2 from 2");
raft_node_unblock(&node);
+ is(node.raft.term, 2, "new term");
+ is(node.raft.vote, 1, "voted for self");
+ is(node.raft.volatile_term, 2, "volatile term");
+ is(node.raft.volatile_vote, 1, "volatile vote");
+ ok(node.raft.timer.repeat < node.raft.election_timeout, "found split "
+ "vote after WAL write");
+
raft_run_next_event();
is(node.raft.term, 3, "bump term");
is(node.raft.vote, 1, "vote for self");
diff --git a/test/unit/raft.result b/test/unit/raft.result
index cce1891c4..3cebf2513 100644
--- a/test/unit/raft.result
+++ b/test/unit/raft.result
@@ -263,7 +263,7 @@ ok 13 - subtests
ok 14 - subtests
*** raft_test_promote_restore: done ***
*** raft_test_split_vote ***
- 1..46
+ 1..51
ok 1 - elections with a new term
ok 2 - vote response for 1 from 2
ok 3 - vote response for 3 from 3
@@ -297,19 +297,24 @@ ok 14 - subtests
ok 31 - new volatile term
ok 32 - new volatile vote
ok 33 - vote response for 2 from 2
- ok 34 - bump term
- ok 35 - vote for self
- ok 36 - bump term
- ok 37 - vote for self
- ok 38 - vote response for 2 from 2
- ok 39 - split vote is not checked for cluster < quorum
- ok 40 - bump term
- ok 41 - vote for self
- ok 42 - vote response for 2 from 2
- ok 43 - vote response for 2 from 3
- ok 44 - vote response for 3 from 4
- ok 45 - vote response for 4 from 5
- ok 46 - split vote is not checked when vote count > cluster size
+ ok 34 - new term
+ ok 35 - voted for self
+ ok 36 - volatile term
+ ok 37 - volatile vote
+ ok 38 - found split vote after WAL write
+ ok 39 - bump term
+ ok 40 - vote for self
+ ok 41 - bump term
+ ok 42 - vote for self
+ ok 43 - vote response for 2 from 2
+ ok 44 - split vote is not checked for cluster < quorum
+ ok 45 - bump term
+ ok 46 - vote for self
+ ok 47 - vote response for 2 from 2
+ ok 48 - vote response for 2 from 3
+ ok 49 - vote response for 3 from 4
+ ok 50 - vote response for 4 from 5
+ ok 51 - split vote is not checked when vote count > cluster size
ok 15 - subtests
*** raft_test_split_vote: done ***
*** main_f: done ***
====================
Then I realized I didn't check split vote on quorum change. When
it is increased, it can trigger split vote.
====================
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index cd7c91c8b..5dcc5beaf 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -1046,6 +1046,8 @@ raft_cfg_election_quorum(struct raft *raft, int election_quorum)
if (raft->state == RAFT_STATE_CANDIDATE &&
raft_vote_count(raft) >= raft->election_quorum)
raft_sm_become_leader(raft);
+ else
+ raft_check_split_vote(raft);
}
void
diff --git a/test/unit/raft.c b/test/unit/raft.c
index 4aa3f4959..e264e864e 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -1500,7 +1500,7 @@ raft_test_promote_restore(void)
static void
raft_test_split_vote(void)
{
- raft_start_test(51);
+ raft_start_test(58);
struct raft_node node;
raft_node_create(&node);
@@ -1762,6 +1762,34 @@ raft_test_split_vote(void)
ok(node.raft.timer.repeat >= node.raft.election_timeout, "split vote "
"is not checked when vote count > cluster size");
+ /*
+ * Split vote can happen if quorum was suddenly increased.
+ */
+ raft_node_destroy(&node);
+ raft_node_create(&node);
+ raft_node_cfg_cluster_size(&node, 3);
+ raft_node_cfg_election_quorum(&node, 2);
+
+ raft_run_next_event();
+ is(node.raft.term, 2, "bump term");
+ is(node.raft.vote, 1, "vote for self");
+ is(raft_node_send_vote_response(&node,
+ 2 /* Term. */,
+ 2 /* Vote. */,
+ 2 /* Source. */
+ ), 0, "vote response for 2 from 2");
+
+ ok(node.raft.timer.repeat >= node.raft.election_timeout, "not split "
+ "vote yet");
+
+ raft_node_cfg_election_quorum(&node, 3);
+ ok(node.raft.timer.repeat < node.raft.election_timeout, "split vote "
+ "after quorum increase");
+
+ raft_run_next_event();
+ is(node.raft.term, 3, "bump term");
+ is(node.raft.vote, 1, "vote for self");
+
raft_node_destroy(&node);
raft_finish_test();
}
====================
More information about the Tarantool-patches
mailing list