[Tarantool-patches] [PATCH 3/4] raft: introduce split vote detection
Serge Petrenko
sergepetrenko at tarantool.org
Tue Jan 18 16:20:52 MSK 2022
Thanks for the patch!
I don't think this optimisation is "too much of a hassle".
It's quite nice, and it looks like a good share of the SLOC in the patch is
taken up by verbose printing (I mean raft_scores_snprintf).
In other words, I like the idea and I think we should have that on board.
(Just like pre-voting)
Please find my comments below.
> diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
> index 289d53fd5..5dcbc7821 100644
> --- a/src/lib/raft/raft.c
> +++ b/src/lib/raft/raft.c
> @@ -152,20 +152,69 @@ raft_can_vote_for(const struct raft *raft, const struct vclock *v)
> return cmp == 0 || cmp == 1;
> }
>
> -static inline void
> +static inline bool
> raft_add_vote(struct raft *raft, int src, int dst)
> {
> struct raft_vote *v = &raft->votes[src];
> if (v->did_vote)
> - return;
> + return false;
> v->did_vote = true;
> ++raft->votes[dst].count;
> + return true;
> +}
> +
You may check for a split vote right in raft_add_vote:
simply track the number of votes given in this term and
the max number of votes given for one instance.
This way you won't have to run over all 32 nodes each time a vote
is cast.
> +static bool
> +raft_has_split_vote(const struct raft *raft)
> +{
> + int max_vote = 0;
> + int vote_vac = raft->cluster_size;
> + int quorum = raft->election_quorum;
> + for (int i = 0; i < VCLOCK_MAX; ++i) {
> + int count = raft->votes[i].count;
> + vote_vac -= count;
> + if (count > max_vote)
> + max_vote = count;
> + }
> + return max_vote < quorum && max_vote + vote_vac < quorum;
This is equivalent to `return max_vote + vote_vac < quorum`: as long as
vote_vac is never negative, the second condition implies the first.
> +}
> +
> +static int
> +raft_scores_snprintf(const struct raft *raft, char *buf, int size)
> +{
> + int total = 0;
> + bool is_empty = true;
> + SNPRINT(total, snprintf, buf, size, "{");
> + for (int i = 0; i < VCLOCK_MAX; ++i) {
> + int count = raft->votes[i].count;
> + if (count == 0)
> + continue;
> + if (!is_empty)
> + SNPRINT(total, snprintf, buf, size, ", ");
> + is_empty = false;
Nit: you may move is_empty = false into the 'else' branch.
> + SNPRINT(total, snprintf, buf, size, "%d: %d", i, count);
> + }
> + SNPRINT(total, snprintf, buf, size, "}");
> + return total;
> +}
> +
...
>
> +static void
> +raft_check_split_vote(struct raft *raft)
> +{
> + /* When leader is known, there is no election. Thus no vote to split. */
> + if (raft->leader != 0)
> + return;
> + /* Not a candidate = can't trigger term bump anyway. */
> + if (!raft->is_candidate)
> + return;
> + /*
> + * WAL write in progress means the state is changing. All is rechecked
> + * when it is done.
> + */
> + if (raft->is_write_in_progress)
> + return;
> + if (!raft_has_split_vote(raft))
> + return;
> + assert(raft_ev_is_active(&raft->timer));
> + if (raft->timer.at < raft->election_timeout)
> + return;
I don't understand that. timer.at should point at current time,
shouldn't it?
> +
> + assert(raft->state == RAFT_STATE_FOLLOWER ||
> + raft->state == RAFT_STATE_CANDIDATE);
> + struct ev_loop *loop = raft_loop();
> + struct ev_timer *timer = &raft->timer;
> + double delay = raft_new_random_election_shift(raft);
> + /*
> + * Could be too late to speed up anything - probably the term is almost
> + * over anyway.
> + */
> + double remaining = raft_ev_timer_remaining(loop, timer);
> + if (delay >= remaining)
> + delay = remaining;
> + say_info("RAFT: split vote is discovered - %s, new term in %lf sec",
> + raft_scores_str(raft), delay);
> + raft_ev_timer_stop(loop, timer);
> + raft_ev_timer_set(timer, delay, delay);
> + raft_ev_timer_start(loop, timer);
> +}
> +
> void
> raft_create(struct raft *raft, const struct raft_vtab *vtab)
> {
> @@ -1053,6 +1150,7 @@ raft_create(struct raft *raft, const struct raft_vtab *vtab)
> .election_quorum = 1,
> .election_timeout = 5,
> .death_timeout = 5,
> + .cluster_size = VCLOCK_MAX,
> .vtab = vtab,
> };
> raft_ev_timer_init(&raft->timer, raft_sm_schedule_new_election_cb,
...
>
--
Serge Petrenko
More information about the Tarantool-patches
mailing list