[Tarantool-patches] [PATCH 04/12] raft: stop using replication_synchro_quorum
Serge Petrenko
sergepetrenko at tarantool.org
Tue Nov 17 11:17:13 MSK 2020
17.11.2020 03:02, Vladislav Shpilevoy пишет:
> Raft is being moved to a separate library in src/lib. It means,
> it can't depend on anything from box/, including global
> replication parameters such as replication_synchro_quorum.
>
> The patch makes raft stop using replication_synchro_quorum.
>
> Instead, it has a new option 'election_quorum'. Note that this is
> just the Raft API. The box API still uses replication_synchro_quorum,
> but now it is used to calculate the final quorum in src/box/raft,
> not in src/box/raftlib, and the result is passed to the base Raft
> implementation.
>
> Part of #5303
> ---
> src/box/box.cc | 2 +-
> src/box/raft.c | 52 +++++++++++++++++++++++++++++++
> src/box/raft.h | 8 +++++
> src/box/raftlib.c | 70 ++++++++----------------------------------
> src/box/raftlib.h | 10 +++---
> src/box/replication.cc | 3 ++
> 6 files changed, 83 insertions(+), 62 deletions(-)
>
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 25673ed42..cc0d7b81d 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -921,7 +921,7 @@ box_set_replication_synchro_quorum(void)
> return -1;
> replication_synchro_quorum = value;
> txn_limbo_on_parameters_change(&txn_limbo);
> - raft_cfg_election_quorum(box_raft());
> + box_raft_reconsider_election_quorum();
> return 0;
> }
>
> diff --git a/src/box/raft.c b/src/box/raft.c
> index f289a6993..af6e71e0b 100644
> --- a/src/box/raft.c
> +++ b/src/box/raft.c
> @@ -30,6 +30,7 @@
> */
> #include "box.h"
> #include "raft.h"
> +#include "replication.h"
>
> struct raft box_raft_global = {
> /*
> @@ -62,6 +63,57 @@ box_raft_on_update_f(struct trigger *trigger, void *event)
> return 0;
> }
>
> +void
> +box_raft_reconsider_election_quorum(void)
Suggestion: maybe use "rewrite"/"reset" instead of "reconsider"?
Or plain "update"?
Other than that, LGTM.
> +{
> + /*
> + * When the instance is started first time, it does not have an ID, so
> + * the registered count is 0. But the quorum can never be 0. At least
> + * the current instance should participate in the quorum.
> + */
> + int max = MAX(replicaset.registered_count, 1);
> + /**
> + * Election quorum is not strictly equal to synchronous replication
> + * quorum. Sometimes it can be lowered. That is about bootstrap.
> + *
> + * The problem with bootstrap is that when the replicaset boots, all the
> + * instances can't write to WAL and can't recover from their initial
> + * snapshot. They need one node which will boot first, and then they
> + * will replicate from it.
> + *
> + * This one node should boot from its zero snapshot, create replicaset
> + * UUID, register self with ID 1 in _cluster space, and then register
> + * all the other instances here. To do that the node must be writable.
> + * It should have read_only = false, connection quorum satisfied, and be
> + * a Raft leader if Raft is enabled.
> + *
> + * To be elected a Raft leader it needs to perform election. But it
> + * can't be done before at least synchronous quorum of the replicas is
> + * bootstrapped. And they can't be bootstrapped because they wait for a
> + * leader to initialize _cluster. Cyclic dependency.
> + *
> + * This is resolved by truncation of the election quorum to the number
> + * of registered replicas, if their count is less than synchronous
> + * quorum. That helps to elect a first leader.
> + *
> + * It may seem that the first node could just declare itself a leader
> + * and then strictly follow the protocol from now on, but that won't
> + * work, because if the first node restarts after it is booted, but
> + * before a quorum of replicas is booted, the cluster gets stuck again.
> + *
> + * The current solution is totally safe because
> + *
> + * - after all the cluster will have node count >= quorum, if user used
> + * a correct config (God help him if he didn't);
> + *
> + * - synchronous replication quorum is untouched - it is not truncated.
> + * Only leader election quorum is affected. So synchronous data won't
> + * be lost.
> + */
> + int quorum = MIN(replication_synchro_quorum, max);
> + raft_cfg_election_quorum(box_raft(), quorum);
> +}
> +
> void
> box_raft_init(void)
> {
> diff --git a/src/box/raft.h b/src/box/raft.h
> index fe0f073dc..09297273f 100644
> --- a/src/box/raft.h
> +++ b/src/box/raft.h
> @@ -48,6 +48,14 @@ box_raft(void)
> return &box_raft_global;
> }
>
> +/**
> + * Let the global raft know that the election quorum could change. It happens
> + * when configuration is updated, and when new nodes are added or old are
> + * deleted from the cluster.
> + */
> +void
> +box_raft_reconsider_election_quorum(void);
> +
> void
> box_raft_init(void);
>
> diff --git a/src/box/raftlib.c b/src/box/raftlib.c
> index c156d6f46..0657fa85a 100644
> --- a/src/box/raftlib.c
> +++ b/src/box/raftlib.c
> @@ -130,50 +130,6 @@ raft_can_vote_for(const struct raft *raft, const struct vclock *v)
> return cmp == 0 || cmp == 1;
> }
>
> -/**
> - * Election quorum is not strictly equal to synchronous replication quorum.
> - * Sometimes it can be lowered. That is about bootstrap.
> - *
> - * The problem with bootstrap is that when the replicaset boots, all the
> - * instances can't write to WAL and can't recover from their initial snapshot.
> - * They need one node which will boot first, and then they will replicate from
> - * it.
> - *
> - * This one node should boot from its zero snapshot, create replicaset UUID,
> - * register self with ID 1 in _cluster space, and then register all the other
> - * instances here. To do that the node must be writable. It should have
> - * read_only = false, connection quorum satisfied, and be a Raft leader if Raft
> - * is enabled.
> - *
> - * To be elected a Raft leader it needs to perform election. But it can't be
> - * done before at least synchronous quorum of the replicas is bootstrapped. And
> - * they can't be bootstrapped because wait for a leader to initialize _cluster.
> - * Cyclic dependency.
> - *
> - * This is resolved by truncation of the election quorum to the number of
> - * registered replicas, if their count is less than synchronous quorum. That
> - * helps to elect a first leader.
> - *
> - * It may seem that the first node could just declare itself a leader and then
> - * strictly follow the protocol from now on, but that won't work, because if the
> - * first node will restart after it is booted, but before quorum of replicas is
> - * booted, the cluster will stuck again.
> - *
> - * The current solution is totally safe because
> - *
> - * - after all the cluster will have node count >= quorum, if user used a
> - * correct config (God help him if he didn't);
> - *
> - * - synchronous replication quorum is untouched - it is not truncated. Only
> - * leader election quorum is affected. So synchronous data won't be lost.
> - */
> -static inline int
> -raft_election_quorum(const struct raft *raft)
> -{
> - (void)raft;
> - return MIN(replication_synchro_quorum, replicaset.registered_count);
> -}
> -
> /**
> * Wakeup the Raft worker fiber in order to do some async work. If the fiber
> * does not exist yet, it is created.
> @@ -427,13 +383,12 @@ raft_process_msg(struct raft *raft, const struct raft_request *req,
> * and now was answered by some other instance.
> */
> assert(raft->volatile_vote == instance_id);
> - int quorum = raft_election_quorum(raft);
> bool was_set = bit_set(&raft->vote_mask, source);
> raft->vote_count += !was_set;
> - if (raft->vote_count < quorum) {
> + if (raft->vote_count < raft->election_quorum) {
> say_info("RAFT: accepted vote for self, vote "
> "count is %d/%d", raft->vote_count,
> - quorum);
> + raft->election_quorum);
> break;
> }
> raft_sm_become_leader(raft);
> @@ -594,7 +549,7 @@ end_dump:
> raft_sm_wait_leader_dead(raft);
> } else if (raft->vote == instance_id) {
> /* Just wrote own vote. */
> - if (raft_election_quorum(raft) == 1)
> + if (raft->election_quorum == 1)
> raft_sm_become_leader(raft);
> else
> raft_sm_become_candidate(raft);
> @@ -692,7 +647,7 @@ raft_sm_become_leader(struct raft *raft)
> {
> assert(raft->state != RAFT_STATE_LEADER);
> say_info("RAFT: enter leader state with quorum %d",
> - raft_election_quorum(raft));
> + raft->election_quorum);
> assert(raft->leader == 0);
> assert(raft->is_candidate);
> assert(!raft->is_write_in_progress);
> @@ -730,7 +685,7 @@ raft_sm_become_candidate(struct raft *raft)
> assert(raft->vote == instance_id);
> assert(raft->is_candidate);
> assert(!raft->is_write_in_progress);
> - assert(raft_election_quorum(raft) > 1);
> + assert(raft->election_quorum > 1);
> raft->state = RAFT_STATE_CANDIDATE;
> raft->vote_count = 1;
> raft->vote_mask = 0;
> @@ -999,14 +954,14 @@ raft_cfg_election_timeout(struct raft *raft, double timeout)
> }
>
> void
> -raft_cfg_election_quorum(struct raft *raft)
> +raft_cfg_election_quorum(struct raft *raft, int election_quorum)
> {
> - if (raft->state != RAFT_STATE_CANDIDATE ||
> - raft->state == RAFT_STATE_LEADER)
> - return;
> - if (raft->vote_count < raft_election_quorum(raft))
> - return;
> - raft_sm_become_leader(raft);
> + /* At least self is always a part of the quorum. */
> + assert(election_quorum > 0);
> + raft->election_quorum = election_quorum;
> + if (raft->state == RAFT_STATE_CANDIDATE &&
> + raft->vote_count >= raft->election_quorum)
> + raft_sm_become_leader(raft);
> }
>
> void
> @@ -1077,6 +1032,7 @@ raft_create(struct raft *raft)
> .state = RAFT_STATE_FOLLOWER,
> .volatile_term = 1,
> .term = 1,
> + .election_quorum = 1,
> .election_timeout = 5,
> .death_timeout = 5,
> };
> diff --git a/src/box/raftlib.h b/src/box/raftlib.h
> index b33a20326..c9c13136e 100644
> --- a/src/box/raftlib.h
> +++ b/src/box/raftlib.h
> @@ -150,6 +150,8 @@ struct raft {
> vclock_map_t vote_mask;
> /** Number of votes for this instance. Valid only in candidate state. */
> int vote_count;
> + /** Number of votes necessary for successful election. */
> + int election_quorum;
> /** State machine timed event trigger. */
> struct ev_timer timer;
> /** Worker fiber to execute blocking tasks like IO. */
> @@ -225,12 +227,12 @@ void
> raft_cfg_election_timeout(struct raft *raft, double timeout);
>
> /**
> - * Configure Raft leader election quorum. There is no a separate option.
> - * Instead, synchronous replication quorum is used. Since Raft is tightly bound
> - * with synchronous replication.
> + * Configure Raft leader election quorum. That may trigger immediate election,
> + * if the quorum is lowered, and this instance is a candidate having enough
> + * votes for the new quorum.
> */
> void
> -raft_cfg_election_quorum(struct raft *raft);
> +raft_cfg_election_quorum(struct raft *raft, int election_quorum);
>
> /**
> * Configure Raft leader death timeout. I.e. number of seconds without
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 65512cf0f..19d7f6beb 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -39,6 +39,7 @@
> #include "box.h"
> #include "gc.h"
> #include "error.h"
> +#include "raft.h"
> #include "relay.h"
> #include "sio.h"
>
> @@ -250,6 +251,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
> say_info("assigned id %d to replica %s",
> replica->id, tt_uuid_str(&replica->uuid));
> replica->anon = false;
> + box_raft_reconsider_election_quorum();
> }
>
> void
> @@ -298,6 +300,7 @@ replica_clear_id(struct replica *replica)
> assert(!replica->anon);
> replica_delete(replica);
> }
> + box_raft_reconsider_election_quorum();
> }
>
> void
--
Serge Petrenko
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.tarantool.org/pipermail/tarantool-patches/attachments/20201117/60197dc3/attachment.html>
More information about the Tarantool-patches
mailing list