<html>

  <head>

    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

  </head>

  <body>

    <p><font face="monospace"><br>

      </font></p>

    <div class="moz-cite-prefix"><font face="monospace">17.11.2020

        03:02, Vladislav Shpilevoy пишет:<br>

      </font></div>

    <blockquote type="cite"

cite="mid:c50ca87693b19490b897d39971fa4730f813f79b.1605570907.git.v.shpilevoy@tarantool.org">

      <pre class="moz-quote-pre" wrap="">Raft is being moved to a separate library in src/lib. It means,

it can't depend on anything from box/, including global

replication parameters such as replication_synchro_quorum.

The patch makes raft stop using replication_synchro_quorum.

Instead, it has a new option 'election_quorum'. Note, that this is

just Raft API. Box API still uses replication_synchro_quorum. But

it is used to calculate the final quorum in src/box/raft, not

in src/box/raftlib. And to pass it to the base Raft

implementation.

Part of #5303

---

 src/box/box.cc         |  2 +-

 src/box/raft.c         | 52 +++++++++++++++++++++++++++++++

 src/box/raft.h         |  8 +++++

 src/box/raftlib.c      | 70 ++++++++----------------------------------

 src/box/raftlib.h      | 10 +++---

 src/box/replication.cc |  3 ++

 6 files changed, 83 insertions(+), 62 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc

index 25673ed42..cc0d7b81d 100644

--- a/src/box/box.cc

+++ b/src/box/box.cc

@@ -921,7 +921,7 @@ box_set_replication_synchro_quorum(void)

                return -1;

        replication_synchro_quorum = value;

        txn_limbo_on_parameters_change(&txn_limbo);

-       raft_cfg_election_quorum(box_raft());

+       box_raft_reconsider_election_quorum();

        return 0;

 }

diff --git a/src/box/raft.c b/src/box/raft.c

index f289a6993..af6e71e0b 100644

--- a/src/box/raft.c

+++ b/src/box/raft.c

@@ -30,6 +30,7 @@

  */

 #include "box.h"

 #include "raft.h"

+#include "replication.h"

 struct raft box_raft_global = {

        /*

@@ -62,6 +63,57 @@ box_raft_on_update_f(struct trigger *trigger, void *event)

        return 0;

 }

+void

+box_raft_reconsider_election_quorum(void)</pre>

    </blockquote>

    <p><font face="monospace">Suggestion: maybe use "rewrite"/"reset"

        instead of "reconsider" in the function name?<br>

        Or plain "update", i.e. box_raft_update_election_quorum()?</font></p>

    <p><font face="monospace">Other than that, LGTM.<br>

      </font></p>

    <blockquote type="cite"

cite="mid:c50ca87693b19490b897d39971fa4730f813f79b.1605570907.git.v.shpilevoy@tarantool.org">

      <pre class="moz-quote-pre" wrap="">

+{

+       /*

+        * When the instance is started first time, it does not have an ID, so

+        * the registered count is 0. But the quorum can never be 0. At least

+        * the current instance should participate in the quorum.

+        */

+       int max = MAX(replicaset.registered_count, 1);

+       /**

+        * Election quorum is not strictly equal to synchronous replication

+        * quorum. Sometimes it can be lowered. That is about bootstrap.

+        *

+        * The problem with bootstrap is that when the replicaset boots, all the

+        * instances can't write to WAL and can't recover from their initial

+        * snapshot. They need one node which will boot first, and then they

+        * will replicate from it.

+        *

+        * This one node should boot from its zero snapshot, create replicaset

+        * UUID, register self with ID 1 in _cluster space, and then register

+        * all the other instances here. To do that the node must be writable.

+        * It should have read_only = false, connection quorum satisfied, and be

+        * a Raft leader if Raft is enabled.

+        *

+        * To be elected a Raft leader it needs to perform election. But it

+        * can't be done before at least synchronous quorum of the replicas is

+        * bootstrapped. And they can't be bootstrapped because wait for a

+        * leader to initialize _cluster. Cyclic dependency.

+        *

+        * This is resolved by truncation of the election quorum to the number

+        * of registered replicas, if their count is less than synchronous

+        * quorum. That helps to elect a first leader.

+        *

+        * It may seem that the first node could just declare itself a leader

+        * and then strictly follow the protocol from now on, but that won't

+        * work, because if the first node will restart after it is booted, but

+        * before quorum of replicas is booted, the cluster will stuck again.

+        *

+        * The current solution is totally safe because

+        *

+        * - after all the cluster will have node count >= quorum, if user used

+        *   a correct config (God help him if he didn't);

+        *

+        * - synchronous replication quorum is untouched - it is not truncated.

+        *   Only leader election quorum is affected. So synchronous data won't

+        *   be lost.

+        */

+       int quorum = MIN(replication_synchro_quorum, max);

+       raft_cfg_election_quorum(box_raft(), quorum);

+}

+

 void

 box_raft_init(void)

 {

diff --git a/src/box/raft.h b/src/box/raft.h

index fe0f073dc..09297273f 100644

--- a/src/box/raft.h

+++ b/src/box/raft.h

@@ -48,6 +48,14 @@ box_raft(void)

        return &box_raft_global;

 }

+/**

+ * Let the global raft know that the election quorum could change. It happens

+ * when configuration is updated, and when new nodes are added or old are

+ * deleted from the cluster.

+ */

+void

+box_raft_reconsider_election_quorum(void);

+

 void

 box_raft_init(void);

diff --git a/src/box/raftlib.c b/src/box/raftlib.c

index c156d6f46..0657fa85a 100644

--- a/src/box/raftlib.c

+++ b/src/box/raftlib.c

@@ -130,50 +130,6 @@ raft_can_vote_for(const struct raft *raft, const struct vclock *v)

        return cmp == 0 || cmp == 1;

 }

-/**

- * Election quorum is not strictly equal to synchronous replication quorum.

- * Sometimes it can be lowered. That is about bootstrap.

- *

- * The problem with bootstrap is that when the replicaset boots, all the

- * instances can't write to WAL and can't recover from their initial snapshot.

- * They need one node which will boot first, and then they will replicate from

- * it.

- *

- * This one node should boot from its zero snapshot, create replicaset UUID,

- * register self with ID 1 in _cluster space, and then register all the other

- * instances here. To do that the node must be writable. It should have

- * read_only = false, connection quorum satisfied, and be a Raft leader if Raft

- * is enabled.

- *

- * To be elected a Raft leader it needs to perform election. But it can't be

- * done before at least synchronous quorum of the replicas is bootstrapped. And

- * they can't be bootstrapped because wait for a leader to initialize _cluster.

- * Cyclic dependency.

- *

- * This is resolved by truncation of the election quorum to the number of

- * registered replicas, if their count is less than synchronous quorum. That

- * helps to elect a first leader.

- *

- * It may seem that the first node could just declare itself a leader and then

- * strictly follow the protocol from now on, but that won't work, because if the

- * first node will restart after it is booted, but before quorum of replicas is

- * booted, the cluster will stuck again.

- *

- * The current solution is totally safe because

- *

- * - after all the cluster will have node count >= quorum, if user used a

- *   correct config (God help him if he didn't);

- *

- * - synchronous replication quorum is untouched - it is not truncated. Only

- *   leader election quorum is affected. So synchronous data won't be lost.

- */

-static inline int

-raft_election_quorum(const struct raft *raft)

-{

-       (void)raft;

-       return MIN(replication_synchro_quorum, replicaset.registered_count);

-}

-

 /**

  * Wakeup the Raft worker fiber in order to do some async work. If the fiber

  * does not exist yet, it is created.

@@ -427,13 +383,12 @@ raft_process_msg(struct raft *raft, const struct raft_request *req,

                         * and now was answered by some other instance.

                         */

                        assert(raft->volatile_vote == instance_id);

-                       int quorum = raft_election_quorum(raft);

                        bool was_set = bit_set(&raft->vote_mask, source);

                        raft->vote_count += !was_set;

-                       if (raft->vote_count < quorum) {

+                       if (raft->vote_count < raft->election_quorum) {

                                say_info("RAFT: accepted vote for self, vote "

                                         "count is %d/%d", raft->vote_count,

-                                        quorum);

+                                        raft->election_quorum);

                                break;

                        }

                        raft_sm_become_leader(raft);

@@ -594,7 +549,7 @@ end_dump:

                        raft_sm_wait_leader_dead(raft);

                } else if (raft->vote == instance_id) {

                        /* Just wrote own vote. */

-                       if (raft_election_quorum(raft) == 1)

+                       if (raft->election_quorum == 1)

                                raft_sm_become_leader(raft);

                        else

                                raft_sm_become_candidate(raft);

@@ -692,7 +647,7 @@ raft_sm_become_leader(struct raft *raft)

 {

        assert(raft->state != RAFT_STATE_LEADER);

        say_info("RAFT: enter leader state with quorum %d",

-                raft_election_quorum(raft));

+                raft->election_quorum);

        assert(raft->leader == 0);

        assert(raft->is_candidate);

        assert(!raft->is_write_in_progress);

@@ -730,7 +685,7 @@ raft_sm_become_candidate(struct raft *raft)

        assert(raft->vote == instance_id);

        assert(raft->is_candidate);

        assert(!raft->is_write_in_progress);

-       assert(raft_election_quorum(raft) > 1);

+       assert(raft->election_quorum > 1);

        raft->state = RAFT_STATE_CANDIDATE;

        raft->vote_count = 1;

        raft->vote_mask = 0;

@@ -999,14 +954,14 @@ raft_cfg_election_timeout(struct raft *raft, double timeout)

 }

 void

-raft_cfg_election_quorum(struct raft *raft)

+raft_cfg_election_quorum(struct raft *raft, int election_quorum)

 {

-       if (raft->state != RAFT_STATE_CANDIDATE ||

-           raft->state == RAFT_STATE_LEADER)

-               return;

-       if (raft->vote_count < raft_election_quorum(raft))

-               return;

-       raft_sm_become_leader(raft);

+       /* At least self is always a part of the quorum. */

+       assert(election_quorum > 0);

+       raft->election_quorum = election_quorum;

+       if (raft->state == RAFT_STATE_CANDIDATE &&

+           raft->vote_count >= raft->election_quorum)

+               raft_sm_become_leader(raft);

 }

 void

@@ -1077,6 +1032,7 @@ raft_create(struct raft *raft)

                .state = RAFT_STATE_FOLLOWER,

                .volatile_term = 1,

                .term = 1,

+               .election_quorum = 1,

                .election_timeout = 5,

                .death_timeout = 5,

        };

diff --git a/src/box/raftlib.h b/src/box/raftlib.h

index b33a20326..c9c13136e 100644

--- a/src/box/raftlib.h

+++ b/src/box/raftlib.h

@@ -150,6 +150,8 @@ struct raft {

        vclock_map_t vote_mask;

        /** Number of votes for this instance. Valid only in candidate state. */

        int vote_count;

+       /** Number of votes necessary for successful election. */

+       int election_quorum;

        /** State machine timed event trigger. */

        struct ev_timer timer;

        /** Worker fiber to execute blocking tasks like IO. */

@@ -225,12 +227,12 @@ void

 raft_cfg_election_timeout(struct raft *raft, double timeout);

 /**

- * Configure Raft leader election quorum. There is no a separate option.

- * Instead, synchronous replication quorum is used. Since Raft is tightly bound

- * with synchronous replication.

+ * Configure Raft leader election quorum. That may trigger immediate election,

+ * if the quorum is lowered, and this instance is a candidate having enough

+ * votes for the new quorum.

  */

 void

-raft_cfg_election_quorum(struct raft *raft);

+raft_cfg_election_quorum(struct raft *raft, int election_quorum);

 /**

  * Configure Raft leader death timeout. I.e. number of seconds without

diff --git a/src/box/replication.cc b/src/box/replication.cc

index 65512cf0f..19d7f6beb 100644

--- a/src/box/replication.cc

+++ b/src/box/replication.cc

@@ -39,6 +39,7 @@

 #include "box.h"

 #include "gc.h"

 #include "error.h"

+#include "raft.h"

 #include "relay.h"

 #include "sio.h"

@@ -250,6 +251,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id)

        say_info("assigned id %d to replica %s",

                 replica->id, tt_uuid_str(&replica->uuid));

        replica->anon = false;

+       box_raft_reconsider_election_quorum();

 }

 void

@@ -298,6 +300,7 @@ replica_clear_id(struct replica *replica)

                assert(!replica->anon);

                replica_delete(replica);

        }

+       box_raft_reconsider_election_quorum();

 }

 void

</pre>

    </blockquote>

    <pre class="moz-signature" cols="72">-- 

Serge Petrenko</pre>

  </body>

</html>