[Tarantool-patches] [PATCH 0/8] dRaft

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Fri Sep 4 01:51:38 MSK 2020


I made a few changes to the existing commits and added 2 new ones.
Here is the complete diff. The patchset is resent in the v2 thread.

See the diff below with my comments inlined.

====================
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 7486d9929..c7c486ee4 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -1277,9 +1277,8 @@ applier_subscribe(struct applier *applier)
 				if (applier_handle_raft(applier,
 							first_row) != 0)
 					diag_raise();
-			} else {
-				applier_signal_ack(applier);
 			}
+			applier_signal_ack(applier);
==================================================

I made the ACK always be sent, because a Raft message might not change the
state of the instance, would not cause a WAL write, and therefore would never
generate an ACK at all.
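
For clarity, the resulting flow is roughly the following (a condensed sketch,
not the literal applier code; row_is_raft_request is a placeholder for the
real type check, which is outside the diff context):

	/* Inside the applier_subscribe() row processing loop. */
	if (row_is_raft_request) {
		if (applier_handle_raft(applier, first_row) != 0)
			diag_raise();
		/*
		 * No 'else' anymore: a Raft row that changes nothing
		 * locally produces no WAL write and hence no
		 * write-driven ACK, so the unconditional ACK below is
		 * the only one the sender would get.
		 */
	}
	applier_signal_ack(applier);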

==================================================
 		} else if (applier_apply_tx(applier, &rows) != 0) {
 			diag_raise();
 		}
diff --git a/src/box/box.cc b/src/box/box.cc
index 2e9c90310..9d0782fff 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -171,6 +171,10 @@ static int
 box_check_writable(void)
 {
 	if (is_ro_summary) {
+		/*
+		 * XXX: return a special error when the node is not a leader to
+		 * reroute to the leader node.
+		 */
 		diag_set(ClientError, ER_READONLY);
 		diag_log();
 		return -1;
@@ -2143,10 +2147,10 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header)
 		 */
 		struct raft_request req;
 		/*
-		 * Omit the candidate vclock, since we've just
-		 * sent it in subscribe response.
+		 * Omit the candidate vclock, since we've just sent it in
+		 * subscribe response.
==================================================

Many comments were realigned from 66 to 80 columns.

==================================================
 		 */
-		raft_serialize(&req, NULL);
+		raft_serialize_for_network(&req, NULL);
 		xrow_encode_raft(&row, &fiber()->gc, &req);
 		coio_write_xrow(io, &row);
 	}
@@ -2795,8 +2799,6 @@ box_cfg_xc(void)
 
 	if (!is_bootstrap_leader)
 		replicaset_sync();
-	else if (raft_is_enabled())
-		raft_bootstrap_leader();
 
 	/* box.cfg.read_only is not read yet. */
 	assert(box_is_ro());
diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 1c131caec..8e1dbd497 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -49,6 +49,7 @@
 #include "main.h"
 #include "version.h"
 #include "box/box.h"
+#include "box/raft.h"
 #include "lua/utils.h"
 #include "fiber.h"
 #include "tt_static.h"
@@ -577,6 +578,21 @@ lbox_info_listen(struct lua_State *L)
 	return 1;
 }
 
+static int
+lbox_info_raft(struct lua_State *L)
+{
+	lua_createtable(L, 0, 4);
+	lua_pushstring(L, raft_state_strs[raft.state]);
+	lua_setfield(L, -2, "state");
+	luaL_pushuint64(L, raft.volatile_term);
+	lua_setfield(L, -2, "term");
+	lua_pushinteger(L, raft.volatile_vote);
+	lua_setfield(L, -2, "vote");
+	lua_pushinteger(L, raft.leader);
+	lua_setfield(L, -2, "leader");
+	return 1;
+}
+
 static const struct luaL_Reg lbox_info_dynamic_meta[] = {
 	{"id", lbox_info_id},
 	{"uuid", lbox_info_uuid},
@@ -595,6 +611,7 @@ static const struct luaL_Reg lbox_info_dynamic_meta[] = {
 	{"vinyl", lbox_info_vinyl},
 	{"sql", lbox_info_sql},
 	{"listen", lbox_info_listen},
+	{"raft", lbox_info_raft},
 	{NULL, NULL}
 };
 
diff --git a/src/box/lua/misc.cc b/src/box/lua/misc.cc
index efbbcfd1f..e356f2d4b 100644
--- a/src/box/lua/misc.cc
+++ b/src/box/lua/misc.cc
@@ -40,7 +40,6 @@
 #include "box/tuple.h"
 #include "box/tuple_format.h"
 #include "box/lua/tuple.h"
-#include "box/raft.h"
 #include "box/xrow.h"
 #include "mpstream/mpstream.h"
 
@@ -248,46 +247,12 @@ lbox_tuple_format_new(struct lua_State *L)
 
 /* }}} */
 
-static int
-lbox_raft_new_term(struct lua_State *L)
-{
-	uint64_t min_term = luaL_checkuint64(L, 1);
-	raft_new_term(min_term);
-	return 0;
-}
-
-static int
-lbox_raft_get(struct lua_State *L)
-{
-	lua_createtable(L, 0, 8);
-	luaL_pushuint64(L, raft.term);
-	lua_setfield(L, -2, "term");
-	luaL_pushuint64(L, raft.volatile_term);
-	lua_setfield(L, -2, "volatile_term");
-	luaL_pushuint64(L, raft.vote);
-	lua_setfield(L, -2, "vote");
-	luaL_pushuint64(L, raft.volatile_vote);
-	lua_setfield(L, -2, "volatile_vote");
-	lua_pushstring(L, raft_state_strs[raft.state]);
-	lua_setfield(L, -2, "state");
-	lua_pushinteger(L, raft.vote_count);
-	lua_setfield(L, -2, "vote_count");
-	lua_pushboolean(L, raft.is_write_in_progress);
-	lua_setfield(L, -2, "is_write_in_progress");
-	lua_pushboolean(L, raft.is_candidate);
-	lua_setfield(L, -2, "is_candidate");
-	return 1;
-}
==================================================

The helper functions for the new vote and new term were removed. Perhaps the
term bump will return as something like box.internal.raft_new_term() for the
tests, but the vote should be controlled by Raft completely.

The information part is moved into box.info.raft in info.c.

==================================================
-
 void
 box_lua_misc_init(struct lua_State *L)
 {
 	static const struct luaL_Reg boxlib_internal[] = {
 		{"select", lbox_select},
 		{"new_tuple_format", lbox_tuple_format_new},
-		/* Temporary helpers to sanity test raft persistency. */
-		{"raft_new_term", lbox_raft_new_term},
-		{"raft_get", lbox_raft_get},
 		{NULL, NULL}
 	};
 
diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c
index b0b744db8..fe7ae9f63 100644
--- a/src/box/memtx_engine.c
+++ b/src/box/memtx_engine.c
@@ -517,11 +517,7 @@ checkpoint_new(const char *snap_dirname, uint64_t snap_io_rate_limit)
 	opts.free_cache = true;
 	xdir_create(&ckpt->dir, snap_dirname, SNAP, &INSTANCE_UUID, &opts);
 	vclock_create(&ckpt->vclock);
-	/*
-	 * Don't encode vclock, because it is stored in the snapshot header
-	 * anyway.
-	 */
-	raft_serialize(&ckpt->raft, NULL);
+	raft_serialize_for_disk(&ckpt->raft);
 	ckpt->touch = false;
 	return ckpt;
 }
diff --git a/src/box/raft.c b/src/box/raft.c
index 6f2891291..1c4275cd5 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -79,7 +79,7 @@ struct raft raft = {
  * will turn into a candidate when the flush is done.
  *
  * In case of a new not flushed vote it means either this node voted for some
- * other node, and must be a follower; or it voted or self, and also must be a
+ * other node, and must be a follower; or it voted for self, and also must be a
  * follower, but will become a candidate when the flush is done.
  *
  * In total - when something is not synced with disk, the instance is a follower
@@ -96,14 +96,14 @@ raft_is_fully_on_disk(void)
  * Raft protocol says that election timeout should be a bit randomized so as
  * the nodes wouldn't start election at the same time and end up with not having
  * a quorum for anybody. This implementation randomizes the election timeout by
- * adding {election timeout * random factor} value, where the factor is a
- * constant floating point value > 0.
+ * adding {election timeout * random factor} value, where the max value of the
+ * factor is a constant floating point value > 0.
  */
 static inline double
 raft_new_random_election_shift(void)
 {
 	double timeout = raft.election_timeout;
-	/* Translate to ms. */
+	/* Translate to ms. Integer is needed to be able to use mod below. */
 	uint32_t rand_part =
 		(uint32_t)(timeout * RAFT_RANDOM_ELECTION_FACTOR * 1000);
 	if (rand_part == 0)
@@ -134,6 +134,49 @@ raft_can_vote_for(const struct vclock *v)
 	return cmp == 0 || cmp == 1;
 }
 
+/**
+ * Election quorum is not strictly equal to synchronous replication quorum.
+ * Sometimes it can be lowered. That is about bootstrap.
+ *
+ * The problem with bootstrap is that when the replicaset boots, all the
+ * instances can't write to WAL and can't recover from their initial snapshot.
+ * They need one node which will boot first, and then they will replicate from
+ * it.
+ *
+ * This one node should boot from its zero snapshot, create replicaset UUID,
+ * register self with ID 1 in _cluster space, and then register all the other
+ * instances here. To do that the node must be writable. It should have
+ * read_only = false, connection quorum satisfied, and be a Raft leader if Raft
+ * is enabled.
+ *
+ * To be elected a Raft leader it needs to perform an election. But that
+ * can't be done before at least a synchronous quorum of replicas boots. And
+ * they can't boot, because they wait for a leader to initialize _cluster.
+ * Cyclic dependency.
+ *
+ * This is resolved by truncation of the election quorum to the number of
+ * registered replicas, if their count is less than synchronous quorum. That
+ * helps to elect a first leader.
+ *
+ * It may seem that the first node could just declare itself a leader and then
+ * strictly follow the protocol from now on, but that won't work, because if
+ * the first node restarts after it is booted, but before a quorum of replicas
+ * is booted, the cluster will get stuck again.
+ *
+ * The current solution is totally safe because
+ *
+ * - eventually the cluster will have node count >= quorum, if the user used a
+ *   correct config (God help him if he didn't);
+ *
+ * - synchronous replication quorum is untouched - it is not truncated. Only
+ *   leader election quorum is affected. So synchronous data won't be lost.
+ */
+static inline int
+raft_election_quorum(void)
+{
+	return MIN(replication_synchro_quorum, replicaset.size);
+}
+
 /** Broadcast an event about this node changed its state to all relays. */
 static inline void
 raft_broadcast_new_state(void)
@@ -157,9 +200,9 @@ static void
 raft_sm_start(void);
 
 /**
- * Stop the state machine.
+ * Stop the state machine. From now on, until Raft is re-enabled,
  * - Raft stops affecting the instance operation;
- * - this node can't become a leader anymore;
+ * - this node can't become a leader;
  * - this node can't vote.
  */
 static void
@@ -188,7 +231,10 @@ raft_sm_schedule_new_term(uint64_t new_term);
 static void
 raft_sm_schedule_new_vote(uint32_t new_vote);
 
-/** Bump volatile term, vote for self, and schedule their flush to disk. */
+/**
+ * Bump term and vote for self immediately. After that is persisted, the
+ * election timeout will be activated. Unless during that nothing newer happens.
+ */
 static void
 raft_sm_schedule_new_election(void);
 
@@ -200,6 +246,17 @@ static void
 raft_sm_schedule_new_election_cb(struct ev_loop *loop, struct ev_timer *timer,
 				 int events);
 
+/** Start Raft state flush to disk. */
+static void
+raft_sm_pause_and_dump(void);
+
+/**
+ * Flush Raft state changes to WAL. The callback resets itself, if during the
+ * write more changes appear.
+ */
+static void
+raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events);
==================================================

I added these declarations even though they weren't necessary, to keep all
raft_sm_* methods declared in one place.

==================================================
+
 void
 raft_process_recovery(const struct raft_request *req)
 {
@@ -261,6 +318,14 @@ raft_process_msg(const struct raft_request *req, uint32_t source)
 			/* Can't vote for too old or incomparable nodes. */
 			if (!raft_can_vote_for(req->vclock))
 				break;
+			/*
+			 * Check if somebody is asking to vote for a third
+			 * node - nope. Vote only when asked directly by the
+			 * new candidate. However, that restriction may be
+			 * relaxed in the future, if it can be proven safe.
+			 */
+			if (req->vote != source)
+				break;
==================================================

I decided not to modify the original Raft here and not to spread a vote wave.
A node can vote only for a candidate asking for a vote directly, not via a
third node.
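
A sketch of the scenario the new check rejects (hypothetical instance IDs):

	/*
	 * Node 2 votes for node 3 and broadcasts {term = T, vote = 3}.
	 * Node 1 receives that message with source == 2. Since
	 * req->vote (3) != source (2), node 1 ignores it as a vote
	 * request. Node 1 votes only on a message coming from node 3
	 * itself, i.e. when req->vote == source.
	 */
	if (req->vote != source)
		break;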

==================================================
 			/*
 			 * Either the term is new, or didn't vote in the current
 			 * term yet. Anyway can vote now.
@@ -279,7 +344,7 @@ raft_process_msg(const struct raft_request *req, uint32_t source)
 			assert(raft.volatile_vote == instance_id);
 			bool was_set = bit_set(&raft.vote_mask, source);
 			raft.vote_count += !was_set;
-			if (raft.vote_count < replication_synchro_quorum)
+			if (raft.vote_count < raft_election_quorum())
 				break;
 			raft.state = RAFT_STATE_LEADER;
 			raft.leader = instance_id;
@@ -330,8 +395,8 @@ void
 raft_process_heartbeat(uint32_t source)
 {
 	/*
-	 * When not a candidate - don't wait for anything. Therefore does not
-	 * care about the leader being dead.
+	 * When not a candidate - don't wait for anything. Therefore do not care
+	 * about the leader being dead.
 	 */
 	if (!raft.is_candidate)
 		return;
@@ -352,23 +417,6 @@ raft_process_heartbeat(uint32_t source)
 	raft_sm_wait_leader_dead();
 }
 
-void
-raft_serialize(struct raft_request *req, struct vclock *vclock)
-{
-	memset(req, 0, sizeof(*req));
-	/*
-	 * Volatile state is never used for any communications.
-	 * Use only persisted state.
-	 */
-	req->term = raft.term;
-	req->vote = raft.vote;
-	req->state = raft.state;
-	/*
-	 * Raft does not own vclock, so it always expects it passed externally.
-	 */
-	req->vclock = vclock;
-}
-
 /** Wakeup Raft state writer fiber waiting for WAL write end. */
 static void
 raft_write_cb(struct journal_entry *entry)
@@ -386,6 +434,13 @@ raft_write_request(const struct raft_request *req)
 	 * be sent to network when vote for self.
 	 */
 	assert(req->vclock == NULL);
+	/*
+	 * State is not persisted - that would be strictly against the Raft
+	 * protocol. It also would not make much sense: even if the node is a
+	 * leader now, by the time it restarts there will likely be another
+	 * leader elected.
+	 */
+	assert(req->state == 0);
 	struct region *region = &fiber()->gc;
 	uint32_t svp = region_used(region);
 	struct xrow_header row;
@@ -417,12 +472,8 @@ fail:
 	panic("Could not write a raft request to WAL\n");
 }
 
-/**
- * Flush Raft state changes to WAL. The callback resets itself, if during the
- * write more changes appear.
- */
 static void
-raft_sm_dump_step(ev_loop *loop, ev_check *watcher, int events)
+raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events)
 {
 	assert(watcher == &raft.io);
 	(void) events;
@@ -453,7 +504,7 @@ end_dump:
 			raft_sm_wait_leader_dead();
 		} else if (raft.vote == instance_id) {
 			/* Just wrote own vote. */
-			if (replication_synchro_quorum == 1) {
+			if (raft_election_quorum() == 1) {
 				raft.state = RAFT_STATE_LEADER;
 				raft.leader = instance_id;
 				/*
@@ -463,8 +514,10 @@ end_dump:
 				box_update_ro_summary();
 			} else {
 				raft.state = RAFT_STATE_CANDIDATE;
+				/* First vote for self. */
 				raft.vote_count = 1;
 				raft.vote_mask = 0;
+				bit_set(&raft.vote_mask, instance_id);
 				raft_sm_wait_election_end();
 			}
 		} else if (raft.vote != 0) {
@@ -530,7 +583,6 @@ end_dump:
 		raft_broadcast(&req);
 }
 
-/** Start Raft state flush to disk. */
 static void
 raft_sm_pause_and_dump(void)
 {
@@ -542,7 +594,6 @@ raft_sm_pause_and_dump(void)
 	raft.is_write_in_progress = true;
 }
 
-/** Bump term, reset Raft state, and persist that fact. */
 static void
 raft_sm_schedule_new_term(uint64_t new_term)
 {
@@ -556,7 +607,6 @@ raft_sm_schedule_new_term(uint64_t new_term)
 	raft_sm_pause_and_dump();
 }
 
-/** Vote in the current term, and persist that fact. */
 static void
 raft_sm_schedule_new_vote(uint32_t new_vote)
 {
@@ -565,10 +615,6 @@ raft_sm_schedule_new_vote(uint32_t new_vote)
 	raft_sm_pause_and_dump();
 }
 
-/**
- * Bump term and vote for self immediately. After that is persisted, the
- * election timeout will be activated. Unless during that nothing newer happens.
- */
 static void
 raft_sm_schedule_new_election(void)
 {
@@ -581,21 +627,6 @@ raft_sm_schedule_new_election(void)
 	box_update_ro_summary();
 }
 
-void
-raft_new_term(uint64_t min_new_term)
-{
-	uint64_t new_term;
-	if (raft.term < min_new_term)
-		new_term = min_new_term + 1;
-	else
-		new_term = raft.term + 1;
-	enum raft_state old_state = raft.state;
-	raft_sm_schedule_new_term(new_term);
-	if (raft.state != old_state)
-		raft_broadcast_new_state();
-	box_update_ro_summary();
-}
-
 static void
 raft_sm_schedule_new_election_cb(struct ev_loop *loop, struct ev_timer *timer,
 				 int events)
@@ -657,7 +688,7 @@ raft_sm_start(void)
 	 * it wasn't sent in disabled state.
 	 */
 	struct raft_request req;
-	raft_serialize(&req, NULL);
+	raft_serialize_for_network(&req, NULL);
 	raft_broadcast(&req);
 }
 
@@ -670,6 +701,31 @@ raft_sm_stop(void)
 	box_update_ro_summary();
 }
 
+void
+raft_serialize_for_network(struct raft_request *req, struct vclock *vclock)
+{
+	memset(req, 0, sizeof(*req));
+	/*
+	 * Volatile state is never used for any communications.
+	 * Use only persisted state.
+	 */
+	req->term = raft.term;
+	req->vote = raft.vote;
+	req->state = raft.state;
+	/*
+	 * Raft does not own vclock, so it always expects it passed externally.
+	 */
+	req->vclock = vclock;
+}
+
+void
+raft_serialize_for_disk(struct raft_request *req)
+{
+	memset(req, 0, sizeof(*req));
+	req->term = raft.term;
+	req->vote = raft.vote;
+}
+
 void
 raft_cfg_is_enabled(bool is_enabled)
 {
@@ -694,9 +750,8 @@ raft_cfg_is_candidate(bool is_candidate)
 	if (raft.is_candidate) {
 		assert(raft.state == RAFT_STATE_FOLLOWER);
 		/*
-		 * If there is an on-going WAL write, it means
-		 * there was some node who sent newer data to this
-		 * node.
+		 * If there is an on-going WAL write, it means there was some
+		 * node who sent newer data to this node.
 		 */
 		if (raft.leader == 0 && raft_is_fully_on_disk())
 			raft_sm_schedule_new_election();
@@ -729,7 +784,7 @@ raft_cfg_election_quorum(void)
 	if (raft.state != RAFT_STATE_CANDIDATE ||
 	    raft.state == RAFT_STATE_LEADER)
 		return;
-	if (raft.vote_count < replication_synchro_quorum)
+	if (raft.vote_count < raft_election_quorum())
 		return;
 	/*
 	 * The node is a candidate. It means its state if fully synced with
@@ -767,18 +822,6 @@ raft_broadcast(const struct raft_request *req)
 	}
 }
 
-void
-raft_bootstrap_leader(void)
-{
-	assert(raft.is_enabled);
-	assert(raft.volatile_term == 0);
-	assert(raft.volatile_vote == 0);
-	assert(raft.state == RAFT_STATE_FOLLOWER);
-	raft.state = RAFT_STATE_LEADER;
-	raft_broadcast_new_state();
-	box_update_ro_summary();
-}
-
 void
 raft_init(void)
 {
diff --git a/src/box/raft.h b/src/box/raft.h
index 57584bc1b..111a9c16e 100644
--- a/src/box/raft.h
+++ b/src/box/raft.h
@@ -37,12 +37,51 @@
 extern "C" {
 #endif
 
+/**
+ * This is an implementation of Raft leader election protocol, separated from
+ * synchronous replication part.
+ *
+ * The protocol describes an algorithm which helps to elect a single leader in
+ * the cluster, which is supposed to handle write requests. And re-elect a new
+ * leader, when the current leader dies.
+ *
+ * The implementation follows the protocol to the letter except for a few
+ * important details.
+ *
+ * Firstly, the original Raft assumes that all nodes share the same log record
+ * numbers. In Tarantool they are called LSNs. But in case of Tarantool each
+ * node has its own LSN in its own component of vclock. That makes the election
+ * messages a bit heavier, because the nodes need to send and compare complete
+ * vclocks of each other instead of a single number like in the original Raft.
+ * But the logic becomes simpler, because the original Raft has a problem of
+ * uncertainty about what to do with records of an old leader right after a
+ * new leader is elected. They could be rolled back or confirmed depending on
+ * circumstances. The issue disappears when vclock is used.
+ *
+ * Secondly, leader election works differently during cluster bootstrap, until
+ * the number of bootstrapped replicas becomes >= election quorum. That arises
+ * from specifics of replica bootstrap and the order of systems initialization.
+ * In short: during bootstrap a leader election may use a smaller election
+ * quorum than the configured one. See more details in the code.
+ */
==================================================

It feels like the comment is still not detailed enough. It looks suspiciously
small for such a big feature. It would be nice if anyone could add anything. I
will add more info later, if something comes to mind.
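
As a concrete illustration of the bootstrap special case, using
raft_election_quorum() from the diff above (the numbers are made up):

	/*
	 * replication_synchro_quorum == 3, but only the first replica
	 * is registered in _cluster so far (replicaset.size == 1).
	 */
	int quorum = raft_election_quorum();	/* MIN(3, 1) == 1 */
	/*
	 * The first node votes for itself, reaches vote_count == 1,
	 * 1 >= quorum, so it becomes the leader and can register the
	 * other replicas. As they register, the quorum grows back to 3.
	 */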

==================================================
+
 struct raft_request;
 struct vclock;
 
 enum raft_state {
+	/**
+	 * Can't write. Can only accept data from a leader. Node in this state
+	 * either monitors an existing leader, or there is an on-going election
+	 * and the node voted for another node, or it can't be a candidate and
+	 * does not do anything.
+	 */
 	RAFT_STATE_FOLLOWER = 1,
+	/**
+	 * The node can't write. There is an active election, in which the node
+	 * voted for self. Now it waits for election outcome.
+	 */
 	RAFT_STATE_CANDIDATE = 2,
+	/** Election was successful. The node accepts write requests. */
 	RAFT_STATE_LEADER = 3,
 };
 
@@ -54,31 +93,27 @@ struct raft {
 	/** State of the instance. */
 	enum raft_state state;
 	/**
-	 * Volatile part of the Raft state, whose WAL write may be
-	 * still in-progress, and yet the state may be already
-	 * used. Volatile state is never sent to anywhere, but the
-	 * state machine makes decisions based on it. That is
-	 * vital.
-	 * As an example, volatile vote needs to be used to reject
-	 * votes inside a term, where the instance already voted
-	 * (even if the vote WAL write is not finished yet).
-	 * Otherwise the instance would try to write several votes
-	 * inside one term.
+	 * Volatile part of the Raft state, whose WAL write may be still
+	 * in-progress, and yet the state may be already used. Volatile state is
+	 * never sent to anywhere, but the state machine makes decisions based
+	 * on it. That is vital.
+	 * As an example, volatile vote needs to be used to reject votes inside
+	 * a term, where the instance already voted (even if the vote WAL write
+	 * is not finished yet). Otherwise the instance would try to write
+	 * several votes inside one term.
 	 */
 	uint64_t volatile_term;
 	uint32_t volatile_vote;
 	/**
-	 * Flag whether Raft is enabled. When disabled, it still
-	 * persists terms so as to quickly enroll into the cluster
-	 * when (if) it is enabled. In everything else disabled
-	 * Raft does not affect instance work.
+	 * Flag whether Raft is enabled. When disabled, it still persists terms
+	 * so as to quickly enroll into the cluster when (if) it is enabled. In
+	 * everything else disabled Raft does not affect instance work.
 	 */
 	bool is_enabled;
 	/**
-	 * Flag whether the node can become a leader. It is an
-	 * accumulated value of configuration options Raft enabled
-	 * Raft candidate. If at least one is false - the instance
-	 * is not a candidate.
+	 * Flag whether the node can become a leader. It is an accumulated value
+	 * of configuration options Raft enabled and Raft candidate. If at least
+	 * one is false - the instance is not a candidate.
 	 */
 	bool is_candidate;
 	/** Flag whether the instance is allowed to be a leader. */
@@ -94,7 +129,10 @@ struct raft {
 	 */
 	uint64_t term;
 	uint32_t vote;
-	/** Bit 1 means that a vote from that instance was obtained. */
+	/**
+	 * A set bit at position N means that a vote from the instance with
+	 * ID = N was obtained.
+	 */
 	uint32_t vote_mask;
 	/** Number of votes for this instance. Valid only in candidate state. */
 	int vote_count;
@@ -110,24 +148,25 @@ struct raft {
 
 extern struct raft raft;
 
-void
-raft_new_term(uint64_t min_new_term);
-
-void
-raft_vote(uint32_t vote_for);
-
+/**
+ * A flag whether the instance is read-only according to Raft. Even if Raft
+ * allows writes, it does not mean the instance is writable - it can also be
+ * affected by box.cfg.read_only and the connection quorum.
+ */
 static inline bool
 raft_is_ro(void)
 {
 	return raft.is_enabled && raft.state != RAFT_STATE_LEADER;
 }
 
+/** See if the instance can accept rows from an instance with the given ID. */
 static inline bool
 raft_is_source_allowed(uint32_t source_id)
 {
 	return !raft.is_enabled || raft.leader == source_id;
 }
 
+/** Check if Raft is enabled. */
 static inline bool
 raft_is_enabled(void)
 {
@@ -146,59 +185,66 @@ raft_process_recovery(const struct raft_request *req);
 void
 raft_process_msg(const struct raft_request *req, uint32_t source);
 
+/**
+ * Process a heartbeat message from an instance with the given ID. It is used to
+ * watch leader's health and start election when necessary.
+ */
 void
 raft_process_heartbeat(uint32_t source);
 
-/**
- * Broadcast the changes in this instance's raft status to all
- * the followers.
- */
+/** Configure whether Raft is enabled. */
 void
 raft_cfg_is_enabled(bool is_enabled);
 
+/**
+ * Configure whether the instance can be elected as a Raft leader. Even if
+ * false, the node can still vote when Raft is enabled.
+ */
 void
 raft_cfg_is_candidate(bool is_candidate);
 
+/** Configure Raft leader election timeout. */
 void
 raft_cfg_election_timeout(double timeout);
 
+/**
+ * Configure the Raft leader election quorum. There is no separate option.
+ * Instead, the synchronous replication quorum is used, since Raft is tightly
+ * bound with synchronous replication.
+ */
 void
 raft_cfg_election_quorum(void);
 
+/**
+ * Configure the Raft leader death timeout, i.e. the number of seconds without
+ * heartbeats from the leader to consider it dead. There is no separate
+ * option - Raft uses the replication timeout for that.
+ */
 void
 raft_cfg_death_timeout(void);
 
-/** Save complete Raft state into the request. */
+/**
+ * Save complete Raft state into a request to be sent to other instances of the
+ * cluster. It is allowed to save anything here, not only persistent state.
+ */
 void
-raft_serialize(struct raft_request *req, struct vclock *vclock);
+raft_serialize_for_network(struct raft_request *req, struct vclock *vclock);
==================================================

Serialization was split into 2 calls - for disk and for network - because for
disk the state and vclock are never needed. It looked ugly to pass a NULL
vclock and manually nullify the request's state after the serialize() call.
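
The call sites become (condensed from the hunks above; the vclock argument is
whatever vclock the caller owns, or NULL):

	struct raft_request req;

	/* Network: full persisted state plus the caller's vclock. */
	raft_serialize_for_network(&req, vclock);
	xrow_encode_raft(&row, &fiber()->gc, &req);

	/* Disk: only term and vote. The snapshot header already stores
	 * the vclock, and state must never be persisted. */
	raft_serialize_for_disk(&ckpt->raft);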

==================================================
 
 /**
- * Broadcast the changes in this instance's raft status to all
- * the followers.
+ * Save complete Raft state into a request to be persisted on disk. Only the
+ * term and vote are persisted.
  */
 void
-raft_broadcast(const struct raft_request *req);
+raft_serialize_for_disk(struct raft_request *req);
 
 /**
- * Bootstrap the current instance as the first leader of the cluster. That is
- * done bypassing the Raft election protocol, by just assigning this node a
- * leader role. That is needed, because when the cluster is not bootstrapped, it
- * is necessary to find a node, which will generate a replicaset UUID, write it
- * into _cluster space, and register all the other nodes in _cluster.
- * Until it is done, all nodes but one won't boot. Their WALs won't work. And
- * therefore they won't be able to participate in leader election. That
- * effectively makes the cluster dead from the beginning unless the first
- * bootstrapped node won't declare itself a leader without elections.
- *
- * XXX: That does not solve the problem, when the first node boots, creates a
- * snapshot, and then immediately dies. After recovery it won't declare itself a
- * leader. Therefore if quorum > 1, the cluster won't proceed to registering
- * any replicas and becomes completely dead. Perhaps that must be solved by
- * truncating quorum down to number of records in _cluster.
+ * Broadcast the changes in this instance's raft status to all
+ * the followers.
  */
 void
-raft_bootstrap_leader(void);
+raft_broadcast(const struct raft_request *req);
 
+/** Initialize Raft global data structures. */
 void
 raft_init(void);
 
diff --git a/src/box/replication.cc b/src/box/replication.cc
index ef0e2411d..20f16206a 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -247,6 +247,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
 						   tt_uuid_str(&replica->uuid));
 	}
 	replicaset.replica_by_id[replica_id] = replica;
+	++replicaset.size;
 
 	say_info("assigned id %d to replica %s",
 		 replica->id, tt_uuid_str(&replica->uuid));
@@ -267,6 +268,8 @@ replica_clear_id(struct replica *replica)
 	 * replication.
 	 */
 	replicaset.replica_by_id[replica->id] = NULL;
+	assert(replicaset.size > 0);
+	--replicaset.size;
 	if (replica->id == instance_id) {
 		/* See replica_check_id(). */
 		assert(replicaset.is_joining);
diff --git a/src/box/replication.h b/src/box/replication.h
index ddc2bddf4..69cc820c9 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -217,6 +217,13 @@ struct replicaset {
 	bool is_joining;
 	/* A number of anonymous replicas following this instance. */
 	int anon_count;
+	/**
+	 * Number of registered replicas. That includes all of them - connected,
+	 * disconnected, connected indirectly, or just present in _cluster. If
+	 * an instance has an ID and the same replicaset UUID, then it is
+	 * accounted here.
+	 */
+	int size;
 	/** Applier state. */
 	struct {
 		/**
diff --git a/test/box/info.result b/test/box/info.result
index 40eeae069..d0abb634a 100644
--- a/test/box/info.result
+++ b/test/box/info.result
@@ -82,6 +82,7 @@ t
   - memory
   - package
   - pid
+  - raft
   - replication
   - replication_anon
   - ro

