[Tarantool-patches] [PATCH 0/8] dRaft
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Fri Sep 4 01:51:38 MSK 2020
I made a few changes to the existing commits and added 2 new ones.
Here is the complete diff. The patchset is resent in the v2 thread.
See the diff below with my comments inlined.
====================
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 7486d9929..c7c486ee4 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -1277,9 +1277,8 @@ applier_subscribe(struct applier *applier)
if (applier_handle_raft(applier,
first_row) != 0)
diag_raise();
- } else {
- applier_signal_ack(applier);
}
+ applier_signal_ack(applier);
==================================================
I made the ACK always be sent, because a Raft message might not change the state
of the instance, then wouldn't cause a WAL write, and so would never generate an
ACK at all.
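After the change, the beginning of the loop iteration roughly looks like this
(a simplified sketch; the exact surrounding branch structure is my assumption,
see applier_subscribe() for the real code):

	if (first_row->lsn == 0) {
		/* A Raft message or a heartbeat - may not touch WAL. */
		if (iproto_type_is_raft_request(first_row->type)) {
			if (applier_handle_raft(applier, first_row) != 0)
				diag_raise();
		}
		/* ACK is sent even when nothing was written. */
		applier_signal_ack(applier);
	} else if (applier_apply_tx(applier, &rows) != 0) {
		diag_raise();
	}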
==================================================
} else if (applier_apply_tx(applier, &rows) != 0) {
diag_raise();
}
diff --git a/src/box/box.cc b/src/box/box.cc
index 2e9c90310..9d0782fff 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -171,6 +171,10 @@ static int
box_check_writable(void)
{
if (is_ro_summary) {
+ /*
+ * XXX: return a special error when the node is not a leader to
+ * reroute to the leader node.
+ */
diag_set(ClientError, ER_READONLY);
diag_log();
return -1;
@@ -2143,10 +2147,10 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header)
*/
struct raft_request req;
/*
- * Omit the candidate vclock, since we've just
- * sent it in subscribe response.
+ * Omit the candidate vclock, since we've just sent it in
+ * subscribe response.
==================================================
Many comments were realigned from 66 to 80 columns.
==================================================
*/
- raft_serialize(&req, NULL);
+ raft_serialize_for_network(&req, NULL);
xrow_encode_raft(&row, &fiber()->gc, &req);
coio_write_xrow(io, &row);
}
@@ -2795,8 +2799,6 @@ box_cfg_xc(void)
if (!is_bootstrap_leader)
replicaset_sync();
- else if (raft_is_enabled())
- raft_bootstrap_leader();
/* box.cfg.read_only is not read yet. */
assert(box_is_ro());
diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 1c131caec..8e1dbd497 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -49,6 +49,7 @@
#include "main.h"
#include "version.h"
#include "box/box.h"
+#include "box/raft.h"
#include "lua/utils.h"
#include "fiber.h"
#include "tt_static.h"
@@ -577,6 +578,21 @@ lbox_info_listen(struct lua_State *L)
return 1;
}
+static int
+lbox_info_raft(struct lua_State *L)
+{
+ lua_createtable(L, 0, 4);
+ lua_pushstring(L, raft_state_strs[raft.state]);
+ lua_setfield(L, -2, "state");
+ luaL_pushuint64(L, raft.volatile_term);
+ lua_setfield(L, -2, "term");
+ lua_pushinteger(L, raft.volatile_vote);
+ lua_setfield(L, -2, "vote");
+ lua_pushinteger(L, raft.leader);
+ lua_setfield(L, -2, "leader");
+ return 1;
+}
+
static const struct luaL_Reg lbox_info_dynamic_meta[] = {
{"id", lbox_info_id},
{"uuid", lbox_info_uuid},
@@ -595,6 +611,7 @@ static const struct luaL_Reg lbox_info_dynamic_meta[] = {
{"vinyl", lbox_info_vinyl},
{"sql", lbox_info_sql},
{"listen", lbox_info_listen},
+ {"raft", lbox_info_raft},
{NULL, NULL}
};
diff --git a/src/box/lua/misc.cc b/src/box/lua/misc.cc
index efbbcfd1f..e356f2d4b 100644
--- a/src/box/lua/misc.cc
+++ b/src/box/lua/misc.cc
@@ -40,7 +40,6 @@
#include "box/tuple.h"
#include "box/tuple_format.h"
#include "box/lua/tuple.h"
-#include "box/raft.h"
#include "box/xrow.h"
#include "mpstream/mpstream.h"
@@ -248,46 +247,12 @@ lbox_tuple_format_new(struct lua_State *L)
/* }}} */
-static int
-lbox_raft_new_term(struct lua_State *L)
-{
- uint64_t min_term = luaL_checkuint64(L, 1);
- raft_new_term(min_term);
- return 0;
-}
-
-static int
-lbox_raft_get(struct lua_State *L)
-{
- lua_createtable(L, 0, 8);
- luaL_pushuint64(L, raft.term);
- lua_setfield(L, -2, "term");
- luaL_pushuint64(L, raft.volatile_term);
- lua_setfield(L, -2, "volatile_term");
- luaL_pushuint64(L, raft.vote);
- lua_setfield(L, -2, "vote");
- luaL_pushuint64(L, raft.volatile_vote);
- lua_setfield(L, -2, "volatile_vote");
- lua_pushstring(L, raft_state_strs[raft.state]);
- lua_setfield(L, -2, "state");
- lua_pushinteger(L, raft.vote_count);
- lua_setfield(L, -2, "vote_count");
- lua_pushboolean(L, raft.is_write_in_progress);
- lua_setfield(L, -2, "is_write_in_progress");
- lua_pushboolean(L, raft.is_candidate);
- lua_setfield(L, -2, "is_candidate");
- return 1;
-}
==================================================
The helper functions for the new vote and the new term were removed. Perhaps the
term bump will return as something like box.internal.raft_new_term() for the
tests, but the vote should be controlled by Raft completely.
The information part is moved into box.info.raft in info.c.
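box.info.raft returns a table of 4 fields taken from the volatile state (see
lbox_info_raft() in info.c above):

	state  = raft_state_strs[raft.state]
	term   = raft.volatile_term
	vote   = raft.volatile_vote
	leader = raft.leader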
==================================================
-
void
box_lua_misc_init(struct lua_State *L)
{
static const struct luaL_Reg boxlib_internal[] = {
{"select", lbox_select},
{"new_tuple_format", lbox_tuple_format_new},
- /* Temporary helpers to sanity test raft persistency. */
- {"raft_new_term", lbox_raft_new_term},
- {"raft_get", lbox_raft_get},
{NULL, NULL}
};
diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c
index b0b744db8..fe7ae9f63 100644
--- a/src/box/memtx_engine.c
+++ b/src/box/memtx_engine.c
@@ -517,11 +517,7 @@ checkpoint_new(const char *snap_dirname, uint64_t snap_io_rate_limit)
opts.free_cache = true;
xdir_create(&ckpt->dir, snap_dirname, SNAP, &INSTANCE_UUID, &opts);
vclock_create(&ckpt->vclock);
- /*
- * Don't encode vclock, because it is stored in the snapshot header
- * anyway.
- */
- raft_serialize(&ckpt->raft, NULL);
+ raft_serialize_for_disk(&ckpt->raft);
ckpt->touch = false;
return ckpt;
}
diff --git a/src/box/raft.c b/src/box/raft.c
index 6f2891291..1c4275cd5 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -79,7 +79,7 @@ struct raft raft = {
* will turn into a candidate when the flush is done.
*
* In case of a new not flushed vote it means either this node voted for some
- * other node, and must be a follower; or it voted or self, and also must be a
+ * other node, and must be a follower; or it voted for self, and also must be a
* follower, but will become a candidate when the flush is done.
*
* In total - when something is not synced with disk, the instance is a follower
@@ -96,14 +96,14 @@ raft_is_fully_on_disk(void)
* Raft protocol says that election timeout should be a bit randomized so as
* the nodes wouldn't start election at the same time and end up with not having
* a quorum for anybody. This implementation randomizes the election timeout by
- * adding {election timeout * random factor} value, where the factor is a
- * constant floating point value > 0.
+ * adding {election timeout * random factor} value, where max value of the
+ * factor is a constant floating point value > 0.
*/
static inline double
raft_new_random_election_shift(void)
{
double timeout = raft.election_timeout;
- /* Translate to ms. */
+ /* Translate to ms. Integer is needed to be able to use mod below. */
uint32_t rand_part =
(uint32_t)(timeout * RAFT_RANDOM_ELECTION_FACTOR * 1000);
if (rand_part == 0)
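==================================================
A quick arithmetic illustration of the shift (the values are made up): with
election_timeout = 5 seconds and a max random factor of 0.1, rand_part is
computed from 5 * 0.1 * 1000 = 500 ms, so the election timeout is extended by
a random value in [0, 0.5] seconds.
==================================================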
@@ -134,6 +134,49 @@ raft_can_vote_for(const struct vclock *v)
return cmp == 0 || cmp == 1;
}
+/**
+ * Election quorum is not strictly equal to synchronous replication quorum.
+ * Sometimes it can be lowered. That is about bootstrap.
+ *
+ * The problem with bootstrap is that when the replicaset boots, all the
+ * instances can't write to WAL and can't recover from their initial snapshot.
+ * They need one node which will boot first, and then they will replicate from
+ * it.
+ *
+ * This one node should boot from its zero snapshot, create replicaset UUID,
+ * register self with ID 1 in _cluster space, and then register all the other
+ * instances here. To do that the node must be writable. It should have
+ * read_only = false, connection quorum satisfied, and be a Raft leader if Raft
+ * is enabled.
+ *
+ * To be elected a Raft leader it needs to perform election. But it can't be
+ * done before at least synchronous quorum of the replicas is bootstrapped. And
+ * they can't be bootstrapped because wait for a leader to initialize _cluster.
+ * Cyclic dependency.
+ *
+ * This is resolved by truncation of the election quorum to the number of
+ * registered replicas, if their count is less than synchronous quorum. That
+ * helps to elect a first leader.
+ *
+ * It may seem that the first node could just declare itself a leader and then
+ * strictly follow the protocol from now on, but that won't work, because if the
+ * first node will restart after it is booted, but before quorum of replicas is
+ * booted, the cluster will stuck again.
+ *
+ * The current solution is totally safe because
+ *
+ * - after all the cluster will have node count >= quorum, if user used a
+ * correct config (God help him if he didn't);
+ *
+ * - synchronous replication quorum is untouched - it is not truncated. Only
+ * leader election quorum is affected. So synchronous data won't be lost.
+ */
+static inline int
+raft_election_quorum(void)
+{
+ return MIN(replication_synchro_quorum, replicaset.size);
+}
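==================================================
A small illustration of the truncated quorum (the numbers are made up):

	/* Bootstrap of a 3-node cluster, replication_synchro_quorum = 2. */
	replicaset.size = 1;	/* Only the first node is in _cluster. */
	raft_election_quorum();	/* MIN(2, 1) == 1 - can elect self. */
	replicaset.size = 3;	/* All the nodes are registered. */
	raft_election_quorum();	/* MIN(2, 3) == 2 - normal quorum. */

The synchronous replication quorum itself stays equal to
replication_synchro_quorum all the time.
==================================================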
+
/** Broadcast an event about this node changed its state to all relays. */
static inline void
raft_broadcast_new_state(void)
@@ -157,9 +200,9 @@ static void
raft_sm_start(void);
/**
- * Stop the state machine.
+ * Stop the state machine. Now until Raft is re-enabled,
* - Raft stops affecting the instance operation;
- * - this node can't become a leader anymore;
+ * - this node can't become a leader;
* - this node can't vote.
*/
static void
@@ -188,7 +231,10 @@ raft_sm_schedule_new_term(uint64_t new_term);
static void
raft_sm_schedule_new_vote(uint32_t new_vote);
-/** Bump volatile term, vote for self, and schedule their flush to disk. */
+/**
+ * Bump term and vote for self immediately. After that is persisted, the
+ * election timeout will be activated. Unless during that nothing newer happens.
+ */
static void
raft_sm_schedule_new_election(void);
@@ -200,6 +246,17 @@ static void
raft_sm_schedule_new_election_cb(struct ev_loop *loop, struct ev_timer *timer,
int events);
+/** Start Raft state flush to disk. */
+static void
+raft_sm_pause_and_dump(void);
+
+/**
+ * Flush Raft state changes to WAL. The callback resets itself, if during the
+ * write more changes appear.
+ */
+static void
+raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events);
==================================================
I added forward declarations for these functions even though it wasn't
necessary, to keep all raft_sm_* methods declared in one place.
==================================================
+
void
raft_process_recovery(const struct raft_request *req)
{
@@ -261,6 +318,14 @@ raft_process_msg(const struct raft_request *req, uint32_t source)
/* Can't vote for too old or incomparable nodes. */
if (!raft_can_vote_for(req->vclock))
break;
+ /*
+ * Check if somebody is asking to vote for a third
+ * node - nope. Make votes only when asked directly by
+ * the new candidate. However that restriction may be
+ * relaxed in future, if can be proven to be safe.
+ */
+ if (req->vote != source)
+ break;
==================================================
I decided not to modify the original Raft here, and not to spread a vote wave.
A node can vote only for a candidate asking for a vote directly, not via a third
node.
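For example, when instance 2 relays instance 1's request for votes, the
receiver sees req->vote == 1 while source == 2, so the new check

	if (req->vote != source)
		break;

drops the relayed request.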
==================================================
/*
* Either the term is new, or didn't vote in the current
* term yet. Anyway can vote now.
@@ -279,7 +344,7 @@ raft_process_msg(const struct raft_request *req, uint32_t source)
assert(raft.volatile_vote == instance_id);
bool was_set = bit_set(&raft.vote_mask, source);
raft.vote_count += !was_set;
- if (raft.vote_count < replication_synchro_quorum)
+ if (raft.vote_count < raft_election_quorum())
break;
raft.state = RAFT_STATE_LEADER;
raft.leader = instance_id;
@@ -330,8 +395,8 @@ void
raft_process_heartbeat(uint32_t source)
{
/*
- * When not a candidate - don't wait for anything. Therefore does not
- * care about the leader being dead.
+ * When not a candidate - don't wait for anything. Therefore do not care
+ * about the leader being dead.
*/
if (!raft.is_candidate)
return;
@@ -352,23 +417,6 @@ raft_process_heartbeat(uint32_t source)
raft_sm_wait_leader_dead();
}
-void
-raft_serialize(struct raft_request *req, struct vclock *vclock)
-{
- memset(req, 0, sizeof(*req));
- /*
- * Volatile state is never used for any communications.
- * Use only persisted state.
- */
- req->term = raft.term;
- req->vote = raft.vote;
- req->state = raft.state;
- /*
- * Raft does not own vclock, so it always expects it passed externally.
- */
- req->vclock = vclock;
-}
-
/** Wakeup Raft state writer fiber waiting for WAL write end. */
static void
raft_write_cb(struct journal_entry *entry)
@@ -386,6 +434,13 @@ raft_write_request(const struct raft_request *req)
* be sent to network when vote for self.
*/
assert(req->vclock == NULL);
+ /*
+ * State is not persisted. That would be strictly against Raft protocol.
+ * The reason is that it does not make much sense - even if the node is
+ * a leader now, after the node is restarted, there will be another
+ * leader elected by that time likely.
+ */
+ assert(req->state == 0);
struct region *region = &fiber()->gc;
uint32_t svp = region_used(region);
struct xrow_header row;
@@ -417,12 +472,8 @@ fail:
panic("Could not write a raft request to WAL\n");
}
-/**
- * Flush Raft state changes to WAL. The callback resets itself, if during the
- * write more changes appear.
- */
static void
-raft_sm_dump_step(ev_loop *loop, ev_check *watcher, int events)
+raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events)
{
assert(watcher == &raft.io);
(void) events;
@@ -453,7 +504,7 @@ end_dump:
raft_sm_wait_leader_dead();
} else if (raft.vote == instance_id) {
/* Just wrote own vote. */
- if (replication_synchro_quorum == 1) {
+ if (raft_election_quorum() == 1) {
raft.state = RAFT_STATE_LEADER;
raft.leader = instance_id;
/*
@@ -463,8 +514,10 @@ end_dump:
box_update_ro_summary();
} else {
raft.state = RAFT_STATE_CANDIDATE;
+ /* First vote for self. */
raft.vote_count = 1;
raft.vote_mask = 0;
+ bit_set(&raft.vote_mask, instance_id);
raft_sm_wait_election_end();
}
} else if (raft.vote != 0) {
@@ -530,7 +583,6 @@ end_dump:
raft_broadcast(&req);
}
-/** Start Raft state flush to disk. */
static void
raft_sm_pause_and_dump(void)
{
@@ -542,7 +594,6 @@ raft_sm_pause_and_dump(void)
raft.is_write_in_progress = true;
}
-/** Bump term, reset Raft state, and persist that fact. */
static void
raft_sm_schedule_new_term(uint64_t new_term)
{
@@ -556,7 +607,6 @@ raft_sm_schedule_new_term(uint64_t new_term)
raft_sm_pause_and_dump();
}
-/** Vote in the current term, and persist that fact. */
static void
raft_sm_schedule_new_vote(uint32_t new_vote)
{
@@ -565,10 +615,6 @@ raft_sm_schedule_new_vote(uint32_t new_vote)
raft_sm_pause_and_dump();
}
-/**
- * Bump term and vote for self immediately. After that is persisted, the
- * election timeout will be activated. Unless during that nothing newer happens.
- */
static void
raft_sm_schedule_new_election(void)
{
@@ -581,21 +627,6 @@ raft_sm_schedule_new_election(void)
box_update_ro_summary();
}
-void
-raft_new_term(uint64_t min_new_term)
-{
- uint64_t new_term;
- if (raft.term < min_new_term)
- new_term = min_new_term + 1;
- else
- new_term = raft.term + 1;
- enum raft_state old_state = raft.state;
- raft_sm_schedule_new_term(new_term);
- if (raft.state != old_state)
- raft_broadcast_new_state();
- box_update_ro_summary();
-}
-
static void
raft_sm_schedule_new_election_cb(struct ev_loop *loop, struct ev_timer *timer,
int events)
@@ -657,7 +688,7 @@ raft_sm_start(void)
* it wasn't sent in disabled state.
*/
struct raft_request req;
- raft_serialize(&req, NULL);
+ raft_serialize_for_network(&req, NULL);
raft_broadcast(&req);
}
@@ -670,6 +701,31 @@ raft_sm_stop(void)
box_update_ro_summary();
}
+void
+raft_serialize_for_network(struct raft_request *req, struct vclock *vclock)
+{
+ memset(req, 0, sizeof(*req));
+ /*
+ * Volatile state is never used for any communications.
+ * Use only persisted state.
+ */
+ req->term = raft.term;
+ req->vote = raft.vote;
+ req->state = raft.state;
+ /*
+ * Raft does not own vclock, so it always expects it passed externally.
+ */
+ req->vclock = vclock;
+}
+
+void
+raft_serialize_for_disk(struct raft_request *req)
+{
+ memset(req, 0, sizeof(*req));
+ req->term = raft.term;
+ req->vote = raft.vote;
+}
+
void
raft_cfg_is_enabled(bool is_enabled)
{
@@ -694,9 +750,8 @@ raft_cfg_is_candidate(bool is_candidate)
if (raft.is_candidate) {
assert(raft.state == RAFT_STATE_FOLLOWER);
/*
- * If there is an on-going WAL write, it means
- * there was some node who sent newer data to this
- * node.
+ * If there is an on-going WAL write, it means there was some
+ * node who sent newer data to this node.
*/
if (raft.leader == 0 && raft_is_fully_on_disk())
raft_sm_schedule_new_election();
@@ -729,7 +784,7 @@ raft_cfg_election_quorum(void)
if (raft.state != RAFT_STATE_CANDIDATE ||
raft.state == RAFT_STATE_LEADER)
return;
- if (raft.vote_count < replication_synchro_quorum)
+ if (raft.vote_count < raft_election_quorum())
return;
/*
* The node is a candidate. It means its state if fully synced with
@@ -767,18 +822,6 @@ raft_broadcast(const struct raft_request *req)
}
}
-void
-raft_bootstrap_leader(void)
-{
- assert(raft.is_enabled);
- assert(raft.volatile_term == 0);
- assert(raft.volatile_vote == 0);
- assert(raft.state == RAFT_STATE_FOLLOWER);
- raft.state = RAFT_STATE_LEADER;
- raft_broadcast_new_state();
- box_update_ro_summary();
-}
-
void
raft_init(void)
{
diff --git a/src/box/raft.h b/src/box/raft.h
index 57584bc1b..111a9c16e 100644
--- a/src/box/raft.h
+++ b/src/box/raft.h
@@ -37,12 +37,51 @@
extern "C" {
#endif
+/**
+ * This is an implementation of Raft leader election protocol, separated from
+ * synchronous replication part.
+ *
+ * The protocol describes an algorithm which helps to elect a single leader in
+ * the cluster, which is supposed to handle write requests. And re-elect a new
+ * leader, when the current leader dies.
+ *
+ * The implementation follows the protocol to the letter except a few important
+ * details.
+ *
+ * Firstly, the original Raft assumes, that all nodes share the same log record
+ * numbers. In Tarantool they are called LSNs. But in case of Tarantool each
+ * node has its own LSN in its own component of vclock. That makes the election
+ * messages a bit heavier, because the nodes need to send and compare complete
+ * vclocks of each other instead of a single number like in the original Raft.
+ * But logic becomes simpler. Because in the original Raft there is a problem of
+ * uncertainty about what to do with records of an old leader right after a new
+ * leader is elected. They could be rolled back or confirmed depending on
+ * circumstances. The issue disappears when vclock is used.
+ *
+ * Secondly, leader election works differently during cluster bootstrap, until
+ * number of bootstrapped replicas becomes >= election quorum. That arises from
+ * specifics of replicas bootstrap and order of systems initialization. In
+ * short: during bootstrap a leader election may use a smaller election quorum
+ * than the configured one. See more details in the code.
+ */
==================================================
It feels like the comment is still not detailed enough. It looks suspiciously
small for such a big feature. It would be nice if anyone could add anything. I
will add more info later if something comes to mind.
==================================================
+
struct raft_request;
struct vclock;
enum raft_state {
+ /**
+ * Can't write. Can only accept data from a leader. Node in this state
+ * either monitors an existing leader, or there is an on-going election
+ * and the node voted for another node, or it can't be a candidate and
+ * does not do anything.
+ */
RAFT_STATE_FOLLOWER = 1,
+ /**
+ * The node can't write. There is an active election, in which the node
+ * voted for self. Now it waits for election outcome.
+ */
RAFT_STATE_CANDIDATE = 2,
+ /** Election was successful. The node accepts write requests. */
RAFT_STATE_LEADER = 3,
};
@@ -54,31 +93,27 @@ struct raft {
/** State of the instance. */
enum raft_state state;
/**
- * Volatile part of the Raft state, whose WAL write may be
- * still in-progress, and yet the state may be already
- * used. Volatile state is never sent to anywhere, but the
- * state machine makes decisions based on it. That is
- * vital.
- * As an example, volatile vote needs to be used to reject
- * votes inside a term, where the instance already voted
- * (even if the vote WAL write is not finished yet).
- * Otherwise the instance would try to write several votes
- * inside one term.
+ * Volatile part of the Raft state, whose WAL write may be still
+ * in-progress, and yet the state may be already used. Volatile state is
+ * never sent to anywhere, but the state machine makes decisions based
+ * on it. That is vital.
+ * As an example, volatile vote needs to be used to reject votes inside
+ * a term, where the instance already voted (even if the vote WAL write
+ * is not finished yet). Otherwise the instance would try to write
+ * several votes inside one term.
*/
uint64_t volatile_term;
uint32_t volatile_vote;
/**
- * Flag whether Raft is enabled. When disabled, it still
- * persists terms so as to quickly enroll into the cluster
- * when (if) it is enabled. In everything else disabled
- * Raft does not affect instance work.
+ * Flag whether Raft is enabled. When disabled, it still persists terms
+ * so as to quickly enroll into the cluster when (if) it is enabled. In
+ * everything else disabled Raft does not affect instance work.
*/
bool is_enabled;
/**
- * Flag whether the node can become a leader. It is an
- * accumulated value of configuration options Raft enabled
- * Raft candidate. If at least one is false - the instance
- * is not a candidate.
+ * Flag whether the node can become a leader. It is an accumulated value
+ * of configuration options Raft enabled and Raft candidate. If at least
+ * one is false - the instance is not a candidate.
*/
bool is_candidate;
/** Flag whether the instance is allowed to be a leader. */
@@ -94,7 +129,10 @@ struct raft {
*/
uint64_t term;
uint32_t vote;
- /** Bit 1 means that a vote from that instance was obtained. */
+ /**
+ * Bit 1 on position N means that a vote from instance with ID = N was
+ * obtained.
+ */
uint32_t vote_mask;
/** Number of votes for this instance. Valid only in candidate state. */
int vote_count;
@@ -110,24 +148,25 @@ struct raft {
extern struct raft raft;
-void
-raft_new_term(uint64_t min_new_term);
-
-void
-raft_vote(uint32_t vote_for);
-
+/**
+ * A flag whether the instance is read-only according to Raft. Even if Raft
+ * allows writes though, it does not mean the instance is writable. It can be
+ * affected by box.cfg.read_only, connection quorum.
+ */
static inline bool
raft_is_ro(void)
{
return raft.is_enabled && raft.state != RAFT_STATE_LEADER;
}
+/** See if the instance can accept rows from an instance with the given ID. */
static inline bool
raft_is_source_allowed(uint32_t source_id)
{
return !raft.is_enabled || raft.leader == source_id;
}
+/** Check if Raft is enabled. */
static inline bool
raft_is_enabled(void)
{
@@ -146,59 +185,66 @@ raft_process_recovery(const struct raft_request *req);
void
raft_process_msg(const struct raft_request *req, uint32_t source);
+/**
+ * Process a heartbeat message from an instance with the given ID. It is used to
+ * watch leader's health and start election when necessary.
+ */
void
raft_process_heartbeat(uint32_t source);
-/**
- * Broadcast the changes in this instance's raft status to all
- * the followers.
- */
+/** Configure whether Raft is enabled. */
void
raft_cfg_is_enabled(bool is_enabled);
+/**
+ * Configure whether the instance can be elected as Raft leader. Even if false,
+ * the node still can vote, when Raft is enabled.
+ */
void
raft_cfg_is_candidate(bool is_candidate);
+/** Configure Raft leader election timeout. */
void
raft_cfg_election_timeout(double timeout);
+/**
+ * Configure Raft leader election quorum. There is no a separate option.
+ * Instead, synchronous replication quorum is used. Since Raft is tightly bound
+ * with synchronous replication.
+ */
void
raft_cfg_election_quorum(void);
+/**
+ * Configure Raft leader death timeout. I.e. number of seconds without
+ * heartbeats from the leader to consider it dead. There is no a separate
+ * option. Raft uses replication timeout for that.
+ */
void
raft_cfg_death_timeout(void);
-/** Save complete Raft state into the request. */
+/**
+ * Save complete Raft state into a request to be sent to other instances of the
+ * cluster. It is allowed to save anything here, not only persistent state.
+ */
void
-raft_serialize(struct raft_request *req, struct vclock *vclock);
+raft_serialize_for_network(struct raft_request *req, struct vclock *vclock);
==================================================
Serialization was split into 2 calls - for disk and for network - because for
disk the state and vclock are never needed. It looked ugly to pass a NULL vclock
and manually nullify the request's state after a serialize() call.
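The call sites now look like this (a sketch assembled from the hunks above):

	struct raft_request req;
	/* Relay/subscribe - full state plus an externally owned vclock. */
	raft_serialize_for_network(&req, vclock);
	/* Snapshot - only term and vote, no state, no vclock. */
	raft_serialize_for_disk(&req);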
==================================================
/**
- * Broadcast the changes in this instance's raft status to all
- * the followers.
+ * Save complete Raft state into a request to be persisted on disk. Only term
+ * and vote are being persisted.
*/
void
-raft_broadcast(const struct raft_request *req);
+raft_serialize_for_disk(struct raft_request *req);
/**
- * Bootstrap the current instance as the first leader of the cluster. That is
- * done bypassing the Raft election protocol, by just assigning this node a
- * leader role. That is needed, because when the cluster is not bootstrapped, it
- * is necessary to find a node, which will generate a replicaset UUID, write it
- * into _cluster space, and register all the other nodes in _cluster.
- * Until it is done, all nodes but one won't boot. Their WALs won't work. And
- * therefore they won't be able to participate in leader election. That
- * effectively makes the cluster dead from the beginning unless the first
- * bootstrapped node won't declare itself a leader without elections.
- *
- * XXX: That does not solve the problem, when the first node boots, creates a
- * snapshot, and then immediately dies. After recovery it won't declare itself a
- * leader. Therefore if quorum > 1, the cluster won't proceed to registering
- * any replicas and becomes completely dead. Perhaps that must be solved by
- * truncating quorum down to number of records in _cluster.
+ * Broadcast the changes in this instance's raft status to all
+ * the followers.
*/
void
-raft_bootstrap_leader(void);
+raft_broadcast(const struct raft_request *req);
+/** Initialize Raft global data structures. */
void
raft_init(void);
diff --git a/src/box/replication.cc b/src/box/replication.cc
index ef0e2411d..20f16206a 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -247,6 +247,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
tt_uuid_str(&replica->uuid));
}
replicaset.replica_by_id[replica_id] = replica;
+ ++replicaset.size;
say_info("assigned id %d to replica %s",
replica->id, tt_uuid_str(&replica->uuid));
@@ -267,6 +268,8 @@ replica_clear_id(struct replica *replica)
* replication.
*/
replicaset.replica_by_id[replica->id] = NULL;
+ assert(replicaset.size > 0);
+ --replicaset.size;
if (replica->id == instance_id) {
/* See replica_check_id(). */
assert(replicaset.is_joining);
diff --git a/src/box/replication.h b/src/box/replication.h
index ddc2bddf4..69cc820c9 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -217,6 +217,13 @@ struct replicaset {
bool is_joining;
/* A number of anonymous replicas following this instance. */
int anon_count;
+ /**
+ * Number of registered replicas. That includes all of them - connected,
+ * disconnected, connected not directly, just present in _cluster. If an
+ * instance has an ID, has the same replicaset UUID, then it is
+ * accounted here.
+ */
+ int size;
/** Applier state. */
struct {
/**
diff --git a/test/box/info.result b/test/box/info.result
index 40eeae069..d0abb634a 100644
--- a/test/box/info.result
+++ b/test/box/info.result
@@ -82,6 +82,7 @@ t
- memory
- package
- pid
+ - raft
- replication
- replication_anon
- ro