<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
<p><font face="monospace"><br>
</font></p>
<div class="moz-cite-prefix"><font face="monospace">17.11.2020
03:02, Vladislav Shpilevoy пишет:<br>
</font></div>
<blockquote type="cite"
cite="mid:c50ca87693b19490b897d39971fa4730f813f79b.1605570907.git.v.shpilevoy@tarantool.org">
<pre class="moz-quote-pre" wrap="">Raft is being moved to a separate library in src/lib. It means,
it can't depend on anything from box/, including global
replication parameters such as replication_synchro_quorum.
The patch makes raft stop using replication_synchro_quorum.
Instead, it has a new option 'election_quorum'. Note, that this is
just Raft API. Box API still uses replication_synchro_quorum. But
it is used to calculate the final quorum in src/box/raft, not
in src/box/raftlib. And to pass it to the base Raft
implementation.
Part of #5303
---
src/box/box.cc | 2 +-
src/box/raft.c | 52 +++++++++++++++++++++++++++++++
src/box/raft.h | 8 +++++
src/box/raftlib.c | 70 ++++++++----------------------------------
src/box/raftlib.h | 10 +++---
src/box/replication.cc | 3 ++
6 files changed, 83 insertions(+), 62 deletions(-)
diff --git a/src/box/box.cc b/src/box/box.cc
index 25673ed42..cc0d7b81d 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -921,7 +921,7 @@ box_set_replication_synchro_quorum(void)
return -1;
replication_synchro_quorum = value;
txn_limbo_on_parameters_change(&txn_limbo);
- raft_cfg_election_quorum(box_raft());
+ box_raft_reconsider_election_quorum();
return 0;
}
diff --git a/src/box/raft.c b/src/box/raft.c
index f289a6993..af6e71e0b 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -30,6 +30,7 @@
*/
#include "box.h"
#include "raft.h"
+#include "replication.h"
struct raft box_raft_global = {
/*
@@ -62,6 +63,57 @@ box_raft_on_update_f(struct trigger *trigger, void *event)
return 0;
}
+void
+box_raft_reconsider_election_quorum(void)</pre>
</blockquote>
<p><font face="monospace">Suggestion: maybe use "rewrite" or "reset"
instead of "reconsider" in the function name?<br>
Or simply "update"?</font></p>
<p><font face="monospace">Other than that, LGTM.<br>
</font></p>
<blockquote type="cite"
cite="mid:c50ca87693b19490b897d39971fa4730f813f79b.1605570907.git.v.shpilevoy@tarantool.org">
<pre class="moz-quote-pre" wrap="">
+{
+ /*
+ * When the instance is started first time, it does not have an ID, so
+ * the registered count is 0. But the quorum can never be 0. At least
+ * the current instance should participate in the quorum.
+ */
+ int max = MAX(replicaset.registered_count, 1);
+ /**
+ * Election quorum is not strictly equal to synchronous replication
+ * quorum. Sometimes it can be lowered. That is about bootstrap.
+ *
+ * The problem with bootstrap is that when the replicaset boots, all the
+ * instances can't write to WAL and can't recover from their initial
+ * snapshot. They need one node which will boot first, and then they
+ * will replicate from it.
+ *
+ * This one node should boot from its zero snapshot, create replicaset
+ * UUID, register self with ID 1 in _cluster space, and then register
+ * all the other instances here. To do that the node must be writable.
+ * It should have read_only = false, connection quorum satisfied, and be
+ * a Raft leader if Raft is enabled.
+ *
+ * To be elected a Raft leader it needs to perform election. But it
+ * can't be done before at least synchronous quorum of the replicas is
+ * bootstrapped. And they can't be bootstrapped because wait for a
+ * leader to initialize _cluster. Cyclic dependency.
+ *
+ * This is resolved by truncation of the election quorum to the number
+ * of registered replicas, if their count is less than synchronous
+ * quorum. That helps to elect a first leader.
+ *
+ * It may seem that the first node could just declare itself a leader
+ * and then strictly follow the protocol from now on, but that won't
+ * work, because if the first node will restart after it is booted, but
+ * before quorum of replicas is booted, the cluster will stuck again.
+ *
+ * The current solution is totally safe because
+ *
+ * - after all the cluster will have node count >= quorum, if user used
+ * a correct config (God help him if he didn't);
+ *
+ * - synchronous replication quorum is untouched - it is not truncated.
+ * Only leader election quorum is affected. So synchronous data won't
+ * be lost.
+ */
+ int quorum = MIN(replication_synchro_quorum, max);
+ raft_cfg_election_quorum(box_raft(), quorum);
+}
+
void
box_raft_init(void)
{
diff --git a/src/box/raft.h b/src/box/raft.h
index fe0f073dc..09297273f 100644
--- a/src/box/raft.h
+++ b/src/box/raft.h
@@ -48,6 +48,14 @@ box_raft(void)
return &box_raft_global;
}
+/**
+ * Let the global raft know that the election quorum could change. It happens
+ * when configuration is updated, and when new nodes are added or old are
+ * deleted from the cluster.
+ */
+void
+box_raft_reconsider_election_quorum(void);
+
void
box_raft_init(void);
diff --git a/src/box/raftlib.c b/src/box/raftlib.c
index c156d6f46..0657fa85a 100644
--- a/src/box/raftlib.c
+++ b/src/box/raftlib.c
@@ -130,50 +130,6 @@ raft_can_vote_for(const struct raft *raft, const struct vclock *v)
return cmp == 0 || cmp == 1;
}
-/**
- * Election quorum is not strictly equal to synchronous replication quorum.
- * Sometimes it can be lowered. That is about bootstrap.
- *
- * The problem with bootstrap is that when the replicaset boots, all the
- * instances can't write to WAL and can't recover from their initial snapshot.
- * They need one node which will boot first, and then they will replicate from
- * it.
- *
- * This one node should boot from its zero snapshot, create replicaset UUID,
- * register self with ID 1 in _cluster space, and then register all the other
- * instances here. To do that the node must be writable. It should have
- * read_only = false, connection quorum satisfied, and be a Raft leader if Raft
- * is enabled.
- *
- * To be elected a Raft leader it needs to perform election. But it can't be
- * done before at least synchronous quorum of the replicas is bootstrapped. And
- * they can't be bootstrapped because wait for a leader to initialize _cluster.
- * Cyclic dependency.
- *
- * This is resolved by truncation of the election quorum to the number of
- * registered replicas, if their count is less than synchronous quorum. That
- * helps to elect a first leader.
- *
- * It may seem that the first node could just declare itself a leader and then
- * strictly follow the protocol from now on, but that won't work, because if the
- * first node will restart after it is booted, but before quorum of replicas is
- * booted, the cluster will stuck again.
- *
- * The current solution is totally safe because
- *
- * - after all the cluster will have node count >= quorum, if user used a
- * correct config (God help him if he didn't);
- *
- * - synchronous replication quorum is untouched - it is not truncated. Only
- * leader election quorum is affected. So synchronous data won't be lost.
- */
-static inline int
-raft_election_quorum(const struct raft *raft)
-{
- (void)raft;
- return MIN(replication_synchro_quorum, replicaset.registered_count);
-}
-
/**
* Wakeup the Raft worker fiber in order to do some async work. If the fiber
* does not exist yet, it is created.
@@ -427,13 +383,12 @@ raft_process_msg(struct raft *raft, const struct raft_request *req,
* and now was answered by some other instance.
*/
assert(raft->volatile_vote == instance_id);
- int quorum = raft_election_quorum(raft);
bool was_set = bit_set(&raft->vote_mask, source);
raft->vote_count += !was_set;
- if (raft->vote_count < quorum) {
+ if (raft->vote_count < raft->election_quorum) {
say_info("RAFT: accepted vote for self, vote "
"count is %d/%d", raft->vote_count,
- quorum);
+ raft->election_quorum);
break;
}
raft_sm_become_leader(raft);
@@ -594,7 +549,7 @@ end_dump:
raft_sm_wait_leader_dead(raft);
} else if (raft->vote == instance_id) {
/* Just wrote own vote. */
- if (raft_election_quorum(raft) == 1)
+ if (raft->election_quorum == 1)
raft_sm_become_leader(raft);
else
raft_sm_become_candidate(raft);
@@ -692,7 +647,7 @@ raft_sm_become_leader(struct raft *raft)
{
assert(raft->state != RAFT_STATE_LEADER);
say_info("RAFT: enter leader state with quorum %d",
- raft_election_quorum(raft));
+ raft->election_quorum);
assert(raft->leader == 0);
assert(raft->is_candidate);
assert(!raft->is_write_in_progress);
@@ -730,7 +685,7 @@ raft_sm_become_candidate(struct raft *raft)
assert(raft->vote == instance_id);
assert(raft->is_candidate);
assert(!raft->is_write_in_progress);
- assert(raft_election_quorum(raft) > 1);
+ assert(raft->election_quorum > 1);
raft->state = RAFT_STATE_CANDIDATE;
raft->vote_count = 1;
raft->vote_mask = 0;
@@ -999,14 +954,14 @@ raft_cfg_election_timeout(struct raft *raft, double timeout)
}
void
-raft_cfg_election_quorum(struct raft *raft)
+raft_cfg_election_quorum(struct raft *raft, int election_quorum)
{
- if (raft->state != RAFT_STATE_CANDIDATE ||
- raft->state == RAFT_STATE_LEADER)
- return;
- if (raft->vote_count < raft_election_quorum(raft))
- return;
- raft_sm_become_leader(raft);
+ /* At least self is always a part of the quorum. */
+ assert(election_quorum > 0);
+ raft->election_quorum = election_quorum;
+ if (raft->state == RAFT_STATE_CANDIDATE &&
+ raft->vote_count >= raft->election_quorum)
+ raft_sm_become_leader(raft);
}
void
@@ -1077,6 +1032,7 @@ raft_create(struct raft *raft)
.state = RAFT_STATE_FOLLOWER,
.volatile_term = 1,
.term = 1,
+ .election_quorum = 1,
.election_timeout = 5,
.death_timeout = 5,
};
diff --git a/src/box/raftlib.h b/src/box/raftlib.h
index b33a20326..c9c13136e 100644
--- a/src/box/raftlib.h
+++ b/src/box/raftlib.h
@@ -150,6 +150,8 @@ struct raft {
vclock_map_t vote_mask;
/** Number of votes for this instance. Valid only in candidate state. */
int vote_count;
+ /** Number of votes necessary for successful election. */
+ int election_quorum;
/** State machine timed event trigger. */
struct ev_timer timer;
/** Worker fiber to execute blocking tasks like IO. */
@@ -225,12 +227,12 @@ void
raft_cfg_election_timeout(struct raft *raft, double timeout);
/**
- * Configure Raft leader election quorum. There is no a separate option.
- * Instead, synchronous replication quorum is used. Since Raft is tightly bound
- * with synchronous replication.
+ * Configure Raft leader election quorum. That may trigger immediate election,
+ * if the quorum is lowered, and this instance is a candidate having enough
+ * votes for the new quorum.
*/
void
-raft_cfg_election_quorum(struct raft *raft);
+raft_cfg_election_quorum(struct raft *raft, int election_quorum);
/**
* Configure Raft leader death timeout. I.e. number of seconds without
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 65512cf0f..19d7f6beb 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -39,6 +39,7 @@
#include "box.h"
#include "gc.h"
#include "error.h"
+#include "raft.h"
#include "relay.h"
#include "sio.h"
@@ -250,6 +251,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
say_info("assigned id %d to replica %s",
replica->id, tt_uuid_str(&replica->uuid));
replica->anon = false;
+ box_raft_reconsider_election_quorum();
}
void
@@ -298,6 +300,7 @@ replica_clear_id(struct replica *replica)
assert(!replica->anon);
replica_delete(replica);
}
+ box_raft_reconsider_election_quorum();
}
void
</pre>
</blockquote>
<pre class="moz-signature" cols="72">--
Serge Petrenko</pre>
</body>
</html>