17.11.2020 03:02, Vladislav Shpilevoy пишет: > Raft is being moved to a separate library in src/lib. It means, > it can't depend on anything from box/, including global > replication parameters such as replication_synchro_quorum. > > The patch makes raft stop using replication_synchro_quorum. > > Instead, it has a new option 'election_quorum'. Note, that this is > just Raft API. Box API still uses replication_synchro_quorum. But > it is used to calculate the final quorum in src/box/raft, not > in src/box/raftlib. And to pass it to the base Raft > implementation. > > Part of #5303 > --- > src/box/box.cc | 2 +- > src/box/raft.c | 52 +++++++++++++++++++++++++++++++ > src/box/raft.h | 8 +++++ > src/box/raftlib.c | 70 ++++++++---------------------------------- > src/box/raftlib.h | 10 +++--- > src/box/replication.cc | 3 ++ > 6 files changed, 83 insertions(+), 62 deletions(-) > > diff --git a/src/box/box.cc b/src/box/box.cc > index 25673ed42..cc0d7b81d 100644 > --- a/src/box/box.cc > +++ b/src/box/box.cc > @@ -921,7 +921,7 @@ box_set_replication_synchro_quorum(void) > return -1; > replication_synchro_quorum = value; > txn_limbo_on_parameters_change(&txn_limbo); > - raft_cfg_election_quorum(box_raft()); > + box_raft_reconsider_election_quorum(); > return 0; > } > > diff --git a/src/box/raft.c b/src/box/raft.c > index f289a6993..af6e71e0b 100644 > --- a/src/box/raft.c > +++ b/src/box/raft.c > @@ -30,6 +30,7 @@ > */ > #include "box.h" > #include "raft.h" > +#include "replication.h" > > struct raft box_raft_global = { > /* > @@ -62,6 +63,57 @@ box_raft_on_update_f(struct trigger *trigger, void *event) > return 0; > } > > +void > +box_raft_reconsider_election_quorum(void) Suggestion: maybe use "rewrite"/"reset" instead of "reconsider"? Or plain "update"? Other than that, LGTM. > +{ > + /* > + * When the instance is started first time, it does not have an ID, so > + * the registered count is 0. But the quorum can never be 0. 
At least > + * the current instance should participate in the quorum. > + */ > + int max = MAX(replicaset.registered_count, 1); > + /** > + * Election quorum is not strictly equal to synchronous replication > + * quorum. Sometimes it can be lowered. That is about bootstrap. > + * > + * The problem with bootstrap is that when the replicaset boots, all the > + * instances can't write to WAL and can't recover from their initial > + * snapshot. They need one node which will boot first, and then they > + * will replicate from it. > + * > + * This one node should boot from its zero snapshot, create replicaset > + * UUID, register self with ID 1 in _cluster space, and then register > + * all the other instances here. To do that the node must be writable. > + * It should have read_only = false, connection quorum satisfied, and be > + * a Raft leader if Raft is enabled. > + * > + * To be elected a Raft leader it needs to perform election. But it > + * can't be done before at least synchronous quorum of the replicas is > + * bootstrapped. And they can't be bootstrapped because they wait for a > + * leader to initialize _cluster. Cyclic dependency. > + * > + * This is resolved by truncation of the election quorum to the number > + * of registered replicas, if their count is less than synchronous > + * quorum. That helps to elect a first leader. > + * > + * It may seem that the first node could just declare itself a leader > + * and then strictly follow the protocol from now on, but that won't > + * work, because if the first node restarts after it is booted, but > + * before quorum of replicas is booted, the cluster will get stuck again. > + * > + * The current solution is totally safe because > + * > + * - after all the cluster will have node count >= quorum, if user used > + * a correct config (God help him if he didn't); > + * > + * - synchronous replication quorum is untouched - it is not truncated. > + * Only leader election quorum is affected. 
So synchronous data won't > + * be lost. > + */ > + int quorum = MIN(replication_synchro_quorum, max); > + raft_cfg_election_quorum(box_raft(), quorum); > +} > + > void > box_raft_init(void) > { > diff --git a/src/box/raft.h b/src/box/raft.h > index fe0f073dc..09297273f 100644 > --- a/src/box/raft.h > +++ b/src/box/raft.h > @@ -48,6 +48,14 @@ box_raft(void) > return &box_raft_global; > } > > +/** > + * Let the global raft know that the election quorum could change. It happens > + * when configuration is updated, and when new nodes are added or old are > + * deleted from the cluster. > + */ > +void > +box_raft_reconsider_election_quorum(void); > + > void > box_raft_init(void); > > diff --git a/src/box/raftlib.c b/src/box/raftlib.c > index c156d6f46..0657fa85a 100644 > --- a/src/box/raftlib.c > +++ b/src/box/raftlib.c > @@ -130,50 +130,6 @@ raft_can_vote_for(const struct raft *raft, const struct vclock *v) > return cmp == 0 || cmp == 1; > } > > -/** > - * Election quorum is not strictly equal to synchronous replication quorum. > - * Sometimes it can be lowered. That is about bootstrap. > - * > - * The problem with bootstrap is that when the replicaset boots, all the > - * instances can't write to WAL and can't recover from their initial snapshot. > - * They need one node which will boot first, and then they will replicate from > - * it. > - * > - * This one node should boot from its zero snapshot, create replicaset UUID, > - * register self with ID 1 in _cluster space, and then register all the other > - * instances here. To do that the node must be writable. It should have > - * read_only = false, connection quorum satisfied, and be a Raft leader if Raft > - * is enabled. > - * > - * To be elected a Raft leader it needs to perform election. But it can't be > - * done before at least synchronous quorum of the replicas is bootstrapped. And > - * they can't be bootstrapped because wait for a leader to initialize _cluster. > - * Cyclic dependency. 
> - * > - * This is resolved by truncation of the election quorum to the number of > - * registered replicas, if their count is less than synchronous quorum. That > - * helps to elect a first leader. > - * > - * It may seem that the first node could just declare itself a leader and then > - * strictly follow the protocol from now on, but that won't work, because if the > - * first node will restart after it is booted, but before quorum of replicas is > - * booted, the cluster will stuck again. > - * > - * The current solution is totally safe because > - * > - * - after all the cluster will have node count >= quorum, if user used a > - * correct config (God help him if he didn't); > - * > - * - synchronous replication quorum is untouched - it is not truncated. Only > - * leader election quorum is affected. So synchronous data won't be lost. > - */ > -static inline int > -raft_election_quorum(const struct raft *raft) > -{ > - (void)raft; > - return MIN(replication_synchro_quorum, replicaset.registered_count); > -} > - > /** > * Wakeup the Raft worker fiber in order to do some async work. If the fiber > * does not exist yet, it is created. > @@ -427,13 +383,12 @@ raft_process_msg(struct raft *raft, const struct raft_request *req, > * and now was answered by some other instance. > */ > assert(raft->volatile_vote == instance_id); > - int quorum = raft_election_quorum(raft); > bool was_set = bit_set(&raft->vote_mask, source); > raft->vote_count += !was_set; > - if (raft->vote_count < quorum) { > + if (raft->vote_count < raft->election_quorum) { > say_info("RAFT: accepted vote for self, vote " > "count is %d/%d", raft->vote_count, > - quorum); > + raft->election_quorum); > break; > } > raft_sm_become_leader(raft); > @@ -594,7 +549,7 @@ end_dump: > raft_sm_wait_leader_dead(raft); > } else if (raft->vote == instance_id) { > /* Just wrote own vote. 
*/ > - if (raft_election_quorum(raft) == 1) > + if (raft->election_quorum == 1) > raft_sm_become_leader(raft); > else > raft_sm_become_candidate(raft); > @@ -692,7 +647,7 @@ raft_sm_become_leader(struct raft *raft) > { > assert(raft->state != RAFT_STATE_LEADER); > say_info("RAFT: enter leader state with quorum %d", > - raft_election_quorum(raft)); > + raft->election_quorum); > assert(raft->leader == 0); > assert(raft->is_candidate); > assert(!raft->is_write_in_progress); > @@ -730,7 +685,7 @@ raft_sm_become_candidate(struct raft *raft) > assert(raft->vote == instance_id); > assert(raft->is_candidate); > assert(!raft->is_write_in_progress); > - assert(raft_election_quorum(raft) > 1); > + assert(raft->election_quorum > 1); > raft->state = RAFT_STATE_CANDIDATE; > raft->vote_count = 1; > raft->vote_mask = 0; > @@ -999,14 +954,14 @@ raft_cfg_election_timeout(struct raft *raft, double timeout) > } > > void > -raft_cfg_election_quorum(struct raft *raft) > +raft_cfg_election_quorum(struct raft *raft, int election_quorum) > { > - if (raft->state != RAFT_STATE_CANDIDATE || > - raft->state == RAFT_STATE_LEADER) > - return; > - if (raft->vote_count < raft_election_quorum(raft)) > - return; > - raft_sm_become_leader(raft); > + /* At least self is always a part of the quorum. */ > + assert(election_quorum > 0); > + raft->election_quorum = election_quorum; > + if (raft->state == RAFT_STATE_CANDIDATE && > + raft->vote_count >= raft->election_quorum) > + raft_sm_become_leader(raft); > } > > void > @@ -1077,6 +1032,7 @@ raft_create(struct raft *raft) > .state = RAFT_STATE_FOLLOWER, > .volatile_term = 1, > .term = 1, > + .election_quorum = 1, > .election_timeout = 5, > .death_timeout = 5, > }; > diff --git a/src/box/raftlib.h b/src/box/raftlib.h > index b33a20326..c9c13136e 100644 > --- a/src/box/raftlib.h > +++ b/src/box/raftlib.h > @@ -150,6 +150,8 @@ struct raft { > vclock_map_t vote_mask; > /** Number of votes for this instance. Valid only in candidate state. 
*/ > int vote_count; > + /** Number of votes necessary for successful election. */ > + int election_quorum; > /** State machine timed event trigger. */ > struct ev_timer timer; > /** Worker fiber to execute blocking tasks like IO. */ > @@ -225,12 +227,12 @@ void > raft_cfg_election_timeout(struct raft *raft, double timeout); > > /** > - * Configure Raft leader election quorum. There is no a separate option. > - * Instead, synchronous replication quorum is used. Since Raft is tightly bound > - * with synchronous replication. > + * Configure Raft leader election quorum. That may trigger immediate election, > + * if the quorum is lowered, and this instance is a candidate having enough > + * votes for the new quorum. > */ > void > -raft_cfg_election_quorum(struct raft *raft); > +raft_cfg_election_quorum(struct raft *raft, int election_quorum); > > /** > * Configure Raft leader death timeout. I.e. number of seconds without > diff --git a/src/box/replication.cc b/src/box/replication.cc > index 65512cf0f..19d7f6beb 100644 > --- a/src/box/replication.cc > +++ b/src/box/replication.cc > @@ -39,6 +39,7 @@ > #include "box.h" > #include "gc.h" > #include "error.h" > +#include "raft.h" > #include "relay.h" > #include "sio.h" > > @@ -250,6 +251,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id) > say_info("assigned id %d to replica %s", > replica->id, tt_uuid_str(&replica->uuid)); > replica->anon = false; > + box_raft_reconsider_election_quorum(); > } > > void > @@ -298,6 +300,7 @@ replica_clear_id(struct replica *replica) > assert(!replica->anon); > replica_delete(replica); > } > + box_raft_reconsider_election_quorum(); > } > > void -- Serge Petrenko