From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtpng2.m.smailru.net (smtpng2.m.smailru.net [94.100.179.3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id BCA73430418 for ; Fri, 4 Sep 2020 01:51:40 +0300 (MSK) From: Vladislav Shpilevoy References: Message-ID: <089e0f1f-4fb1-a828-068b-0e20ed1dc895@tarantool.org> Date: Fri, 4 Sep 2020 00:51:38 +0200 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset=utf-8 Content-Language: en-US Content-Transfer-Encoding: 7bit Subject: Re: [Tarantool-patches] [PATCH 0/8] dRaft List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: tarantool-patches@dev.tarantool.org, sergepetrenko@tarantool.org, gorcunov@gmail.com I made a few changes to the existing commits and added 2 new. Here is the complete diff. The patchset is resent in v2 thread. See the diff below with my comments inlined. ==================== diff --git a/src/box/applier.cc b/src/box/applier.cc index 7486d9929..c7c486ee4 100644 --- a/src/box/applier.cc +++ b/src/box/applier.cc @@ -1277,9 +1277,8 @@ applier_subscribe(struct applier *applier) if (applier_handle_raft(applier, first_row) != 0) diag_raise(); - } else { - applier_signal_ack(applier); } + applier_signal_ack(applier); ================================================== I made the ACK sent always. Because a Raft message could not change state of the instance, wouldn't cause a WAL write, and wouldn't generate an ACK after all. 
================================================== } else if (applier_apply_tx(applier, &rows) != 0) { diag_raise(); } diff --git a/src/box/box.cc b/src/box/box.cc index 2e9c90310..9d0782fff 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -171,6 +171,10 @@ static int box_check_writable(void) { if (is_ro_summary) { + /* + * XXX: return a special error when the node is not a leader to + * reroute to the leader node. + */ diag_set(ClientError, ER_READONLY); diag_log(); return -1; @@ -2143,10 +2147,10 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header) */ struct raft_request req; /* - * Omit the candidate vclock, since we've just - * sent it in subscribe response. + * Omit the candidate vclock, since we've just sent it in + * subscribe response. ================================================== Many comments were realigned from 66 to 80. ================================================== */ - raft_serialize(&req, NULL); + raft_serialize_for_network(&req, NULL); xrow_encode_raft(&row, &fiber()->gc, &req); coio_write_xrow(io, &row); } @@ -2795,8 +2799,6 @@ box_cfg_xc(void) if (!is_bootstrap_leader) replicaset_sync(); - else if (raft_is_enabled()) - raft_bootstrap_leader(); /* box.cfg.read_only is not read yet. 
*/ assert(box_is_ro()); diff --git a/src/box/lua/info.c b/src/box/lua/info.c index 1c131caec..8e1dbd497 100644 --- a/src/box/lua/info.c +++ b/src/box/lua/info.c @@ -49,6 +49,7 @@ #include "main.h" #include "version.h" #include "box/box.h" +#include "box/raft.h" #include "lua/utils.h" #include "fiber.h" #include "tt_static.h" @@ -577,6 +578,21 @@ lbox_info_listen(struct lua_State *L) return 1; } +static int +lbox_info_raft(struct lua_State *L) +{ + lua_createtable(L, 0, 4); + lua_pushstring(L, raft_state_strs[raft.state]); + lua_setfield(L, -2, "state"); + luaL_pushuint64(L, raft.volatile_term); + lua_setfield(L, -2, "term"); + lua_pushinteger(L, raft.volatile_vote); + lua_setfield(L, -2, "vote"); + lua_pushinteger(L, raft.leader); + lua_setfield(L, -2, "leader"); + return 1; +} + static const struct luaL_Reg lbox_info_dynamic_meta[] = { {"id", lbox_info_id}, {"uuid", lbox_info_uuid}, @@ -595,6 +611,7 @@ static const struct luaL_Reg lbox_info_dynamic_meta[] = { {"vinyl", lbox_info_vinyl}, {"sql", lbox_info_sql}, {"listen", lbox_info_listen}, + {"raft", lbox_info_raft}, {NULL, NULL} }; diff --git a/src/box/lua/misc.cc b/src/box/lua/misc.cc index efbbcfd1f..e356f2d4b 100644 --- a/src/box/lua/misc.cc +++ b/src/box/lua/misc.cc @@ -40,7 +40,6 @@ #include "box/tuple.h" #include "box/tuple_format.h" #include "box/lua/tuple.h" -#include "box/raft.h" #include "box/xrow.h" #include "mpstream/mpstream.h" @@ -248,46 +247,12 @@ lbox_tuple_format_new(struct lua_State *L) /* }}} */ -static int -lbox_raft_new_term(struct lua_State *L) -{ - uint64_t min_term = luaL_checkuint64(L, 1); - raft_new_term(min_term); - return 0; -} - -static int -lbox_raft_get(struct lua_State *L) -{ - lua_createtable(L, 0, 8); - luaL_pushuint64(L, raft.term); - lua_setfield(L, -2, "term"); - luaL_pushuint64(L, raft.volatile_term); - lua_setfield(L, -2, "volatile_term"); - luaL_pushuint64(L, raft.vote); - lua_setfield(L, -2, "vote"); - luaL_pushuint64(L, raft.volatile_vote); - lua_setfield(L, -2, 
"volatile_vote"); - lua_pushstring(L, raft_state_strs[raft.state]); - lua_setfield(L, -2, "state"); - lua_pushinteger(L, raft.vote_count); - lua_setfield(L, -2, "vote_count"); - lua_pushboolean(L, raft.is_write_in_progress); - lua_setfield(L, -2, "is_write_in_progress"); - lua_pushboolean(L, raft.is_candidate); - lua_setfield(L, -2, "is_candidate"); - return 1; -} ================================================== The first helper functions about new vote and new term were removed. Perhaps the term bump will return as something like box.internal.raft_new_term() for the tests, but vote should be controlled by Raft completely. Information part is moved into box.info.raft in info.cc. ================================================== - void box_lua_misc_init(struct lua_State *L) { static const struct luaL_Reg boxlib_internal[] = { {"select", lbox_select}, {"new_tuple_format", lbox_tuple_format_new}, - /* Temporary helpers to sanity test raft persistency. */ - {"raft_new_term", lbox_raft_new_term}, - {"raft_get", lbox_raft_get}, {NULL, NULL} }; diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c index b0b744db8..fe7ae9f63 100644 --- a/src/box/memtx_engine.c +++ b/src/box/memtx_engine.c @@ -517,11 +517,7 @@ checkpoint_new(const char *snap_dirname, uint64_t snap_io_rate_limit) opts.free_cache = true; xdir_create(&ckpt->dir, snap_dirname, SNAP, &INSTANCE_UUID, &opts); vclock_create(&ckpt->vclock); - /* - * Don't encode vclock, because it is stored in the snapshot header - * anyway. - */ - raft_serialize(&ckpt->raft, NULL); + raft_serialize_for_disk(&ckpt->raft); ckpt->touch = false; return ckpt; } diff --git a/src/box/raft.c b/src/box/raft.c index 6f2891291..1c4275cd5 100644 --- a/src/box/raft.c +++ b/src/box/raft.c @@ -79,7 +79,7 @@ struct raft raft = { * will turn into a candidate when the flush is done. 
* * In case of a new not flushed vote it means either this node voted for some - * other node, and must be a follower; or it voted or self, and also must be a + * other node, and must be a follower; or it voted for self, and also must be a * follower, but will become a candidate when the flush is done. * * In total - when something is not synced with disk, the instance is a follower @@ -96,14 +96,14 @@ raft_is_fully_on_disk(void) * Raft protocol says that election timeout should be a bit randomized so as * the nodes wouldn't start election at the same time and end up with not having * a quorum for anybody. This implementation randomizes the election timeout by - * adding {election timeout * random factor} value, where the factor is a - * constant floating point value > 0. + * adding {election timeout * random factor} value, where max value of the + * factor is a constant floating point value > 0. */ static inline double raft_new_random_election_shift(void) { double timeout = raft.election_timeout; - /* Translate to ms. */ + /* Translate to ms. Integer is needed to be able to use mod below. */ uint32_t rand_part = (uint32_t)(timeout * RAFT_RANDOM_ELECTION_FACTOR * 1000); if (rand_part == 0) @@ -134,6 +134,49 @@ raft_can_vote_for(const struct vclock *v) return cmp == 0 || cmp == 1; } +/** + * Election quorum is not strictly equal to synchronous replication quorum. + * Sometimes it can be lowered. That is about bootstrap. + * + * The problem with bootstrap is that when the replicaset boots, all the + * instances can't write to WAL and can't recover from their initial snapshot. + * They need one node which will boot first, and then they will replicate from + * it. + * + * This one node should boot from its zero snapshot, create replicaset UUID, + * register self with ID 1 in _cluster space, and then register all the other + * instances here. To do that the node must be writable. 
It should have + * read_only = false, connection quorum satisfied, and be a Raft leader if Raft + * is enabled. + * + * To be elected a Raft leader it needs to perform election. But it can't be + * done before at least synchronous quorum of the replicas is bootstrapped. And + * they can't be bootstrapped because wait for a leader to initialize _cluster. + * Cyclic dependency. + * + * This is resolved by truncation of the election quorum to the number of + * registered replicas, if their count is less than synchronous quorum. That + * helps to elect a first leader. + * + * It may seem that the first node could just declare itself a leader and then + * strictly follow the protocol from now on, but that won't work, because if the + * first node will restart after it is booted, but before quorum of replicas is + * booted, the cluster will stuck again. + * + * The current solution is totally safe because + * + * - after all the cluster will have node count >= quorum, if user used a + * correct config (God help him if he didn't); + * + * - synchronous replication quorum is untouched - it is not truncated. Only + * leader election quorum is affected. So synchronous data won't be lost. + */ +static inline int +raft_election_quorum(void) +{ + return MIN(replication_synchro_quorum, replicaset.size); +} + /** Broadcast an event about this node changed its state to all relays. */ static inline void raft_broadcast_new_state(void) @@ -157,9 +200,9 @@ static void raft_sm_start(void); /** - * Stop the state machine. + * Stop the state machine. Now until Raft is re-enabled, * - Raft stops affecting the instance operation; - * - this node can't become a leader anymore; + * - this node can't become a leader; * - this node can't vote. */ static void @@ -188,7 +231,10 @@ raft_sm_schedule_new_term(uint64_t new_term); static void raft_sm_schedule_new_vote(uint32_t new_vote); -/** Bump volatile term, vote for self, and schedule their flush to disk. 
*/ +/** + * Bump term and vote for self immediately. After that is persisted, the + * election timeout will be activated. Unless during that nothing newer happens. + */ static void raft_sm_schedule_new_election(void); @@ -200,6 +246,17 @@ static void raft_sm_schedule_new_election_cb(struct ev_loop *loop, struct ev_timer *timer, int events); +/** Start Raft state flush to disk. */ +static void +raft_sm_pause_and_dump(void); + +/** + * Flush Raft state changes to WAL. The callback resets itself, if during the + * write more changes appear. + */ +static void +raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events); ================================================== I announces these functions even though it wasn't necessary, to keep all raft_sm_* methods declared in one place. ================================================== + void raft_process_recovery(const struct raft_request *req) { @@ -261,6 +318,14 @@ raft_process_msg(const struct raft_request *req, uint32_t source) /* Can't vote for too old or incomparable nodes. */ if (!raft_can_vote_for(req->vclock)) break; + /* + * Check if somebody is asking to vote for a third + * node - nope. Make votes only when asked directly by + * the new candidate. However that restriction may be + * relaxed in future, if can be proven to be safe. + */ + if (req->vote != source) + break; ================================================== I decided not to modify the original Raft here, and don't spread a vote wave. A node can vote only for candidates asking for a vote directly. Not via a third node. ================================================== /* * Either the term is new, or didn't vote in the current * term yet. Anyway can vote now. 
@@ -279,7 +344,7 @@ raft_process_msg(const struct raft_request *req, uint32_t source) assert(raft.volatile_vote == instance_id); bool was_set = bit_set(&raft.vote_mask, source); raft.vote_count += !was_set; - if (raft.vote_count < replication_synchro_quorum) + if (raft.vote_count < raft_election_quorum()) break; raft.state = RAFT_STATE_LEADER; raft.leader = instance_id; @@ -330,8 +395,8 @@ void raft_process_heartbeat(uint32_t source) { /* - * When not a candidate - don't wait for anything. Therefore does not - * care about the leader being dead. + * When not a candidate - don't wait for anything. Therefore do not care + * about the leader being dead. */ if (!raft.is_candidate) return; @@ -352,23 +417,6 @@ raft_process_heartbeat(uint32_t source) raft_sm_wait_leader_dead(); } -void -raft_serialize(struct raft_request *req, struct vclock *vclock) -{ - memset(req, 0, sizeof(*req)); - /* - * Volatile state is never used for any communications. - * Use only persisted state. - */ - req->term = raft.term; - req->vote = raft.vote; - req->state = raft.state; - /* - * Raft does not own vclock, so it always expects it passed externally. - */ - req->vclock = vclock; -} - /** Wakeup Raft state writer fiber waiting for WAL write end. */ static void raft_write_cb(struct journal_entry *entry) @@ -386,6 +434,13 @@ raft_write_request(const struct raft_request *req) * be sent to network when vote for self. */ assert(req->vclock == NULL); + /* + * State is not persisted. That would be strictly against Raft protocol. + * The reason is that it does not make much sense - even if the node is + * a leader now, after the node is restarted, there will be another + * leader elected by that time likely. + */ + assert(req->state == 0); struct region *region = &fiber()->gc; uint32_t svp = region_used(region); struct xrow_header row; @@ -417,12 +472,8 @@ fail: panic("Could not write a raft request to WAL\n"); } -/** - * Flush Raft state changes to WAL. 
The callback resets itself, if during the - * write more changes appear. - */ static void -raft_sm_dump_step(ev_loop *loop, ev_check *watcher, int events) +raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events) { assert(watcher == &raft.io); (void) events; @@ -453,7 +504,7 @@ end_dump: raft_sm_wait_leader_dead(); } else if (raft.vote == instance_id) { /* Just wrote own vote. */ - if (replication_synchro_quorum == 1) { + if (raft_election_quorum() == 1) { raft.state = RAFT_STATE_LEADER; raft.leader = instance_id; /* @@ -463,8 +514,10 @@ end_dump: box_update_ro_summary(); } else { raft.state = RAFT_STATE_CANDIDATE; + /* First vote for self. */ raft.vote_count = 1; raft.vote_mask = 0; + bit_set(&raft.vote_mask, instance_id); raft_sm_wait_election_end(); } } else if (raft.vote != 0) { @@ -530,7 +583,6 @@ end_dump: raft_broadcast(&req); } -/** Start Raft state flush to disk. */ static void raft_sm_pause_and_dump(void) { @@ -542,7 +594,6 @@ raft_sm_pause_and_dump(void) raft.is_write_in_progress = true; } -/** Bump term, reset Raft state, and persist that fact. */ static void raft_sm_schedule_new_term(uint64_t new_term) { @@ -556,7 +607,6 @@ raft_sm_schedule_new_term(uint64_t new_term) raft_sm_pause_and_dump(); } -/** Vote in the current term, and persist that fact. */ static void raft_sm_schedule_new_vote(uint32_t new_vote) { @@ -565,10 +615,6 @@ raft_sm_schedule_new_vote(uint32_t new_vote) raft_sm_pause_and_dump(); } -/** - * Bump term and vote for self immediately. After that is persisted, the - * election timeout will be activated. Unless during that nothing newer happens. 
- */ static void raft_sm_schedule_new_election(void) { @@ -581,21 +627,6 @@ raft_sm_schedule_new_election(void) box_update_ro_summary(); } -void -raft_new_term(uint64_t min_new_term) -{ - uint64_t new_term; - if (raft.term < min_new_term) - new_term = min_new_term + 1; - else - new_term = raft.term + 1; - enum raft_state old_state = raft.state; - raft_sm_schedule_new_term(new_term); - if (raft.state != old_state) - raft_broadcast_new_state(); - box_update_ro_summary(); -} - static void raft_sm_schedule_new_election_cb(struct ev_loop *loop, struct ev_timer *timer, int events) @@ -657,7 +688,7 @@ raft_sm_start(void) * it wasn't sent in disabled state. */ struct raft_request req; - raft_serialize(&req, NULL); + raft_serialize_for_network(&req, NULL); raft_broadcast(&req); } @@ -670,6 +701,31 @@ raft_sm_stop(void) box_update_ro_summary(); } +void +raft_serialize_for_network(struct raft_request *req, struct vclock *vclock) +{ + memset(req, 0, sizeof(*req)); + /* + * Volatile state is never used for any communications. + * Use only persisted state. + */ + req->term = raft.term; + req->vote = raft.vote; + req->state = raft.state; + /* + * Raft does not own vclock, so it always expects it passed externally. + */ + req->vclock = vclock; +} + +void +raft_serialize_for_disk(struct raft_request *req) +{ + memset(req, 0, sizeof(*req)); + req->term = raft.term; + req->vote = raft.vote; +} + void raft_cfg_is_enabled(bool is_enabled) { @@ -694,9 +750,8 @@ raft_cfg_is_candidate(bool is_candidate) if (raft.is_candidate) { assert(raft.state == RAFT_STATE_FOLLOWER); /* - * If there is an on-going WAL write, it means - * there was some node who sent newer data to this - * node. + * If there is an on-going WAL write, it means there was some + * node who sent newer data to this node. 
*/ if (raft.leader == 0 && raft_is_fully_on_disk()) raft_sm_schedule_new_election(); @@ -729,7 +784,7 @@ raft_cfg_election_quorum(void) if (raft.state != RAFT_STATE_CANDIDATE || raft.state == RAFT_STATE_LEADER) return; - if (raft.vote_count < replication_synchro_quorum) + if (raft.vote_count < raft_election_quorum()) return; /* * The node is a candidate. It means its state if fully synced with @@ -767,18 +822,6 @@ raft_broadcast(const struct raft_request *req) } } -void -raft_bootstrap_leader(void) -{ - assert(raft.is_enabled); - assert(raft.volatile_term == 0); - assert(raft.volatile_vote == 0); - assert(raft.state == RAFT_STATE_FOLLOWER); - raft.state = RAFT_STATE_LEADER; - raft_broadcast_new_state(); - box_update_ro_summary(); -} - void raft_init(void) { diff --git a/src/box/raft.h b/src/box/raft.h index 57584bc1b..111a9c16e 100644 --- a/src/box/raft.h +++ b/src/box/raft.h @@ -37,12 +37,51 @@ extern "C" { #endif +/** + * This is an implementation of Raft leader election protocol, separated from + * synchronous replication part. + * + * The protocol describes an algorithm which helps to elect a single leader in + * the cluster, which is supposed to handle write requests. And re-elect a new + * leader, when the current leader dies. + * + * The implementation follows the protocol to the letter except a few important + * details. + * + * Firstly, the original Raft assumes, that all nodes share the same log record + * numbers. In Tarantool they are called LSNs. But in case of Tarantool each + * node has its own LSN in its own component of vclock. That makes the election + * messages a bit heavier, because the nodes need to send and compare complete + * vclocks of each other instead of a single number like in the original Raft. + * But logic becomes simpler. Because in the original Raft there is a problem of + * uncertainty about what to do with records of an old leader right after a new + * leader is elected. 
They could be rolled back or confirmed depending on + * circumstances. The issue disappears when vclock is used. + * + * Secondly, leader election works differently during cluster bootstrap, until + * number of bootstrapped replicas becomes >= election quorum. That arises from + * specifics of replicas bootstrap and order of systems initialization. In + * short: during bootstrap a leader election may use a smaller election quorum + * than the configured one. See more details in the code. + */ ================================================== It feels like the comment is still not detailed enough. It looks suspiciously small for such a big feature. It would be nice if anyone could add anything. I will add more info later, if something comes to mind. ================================================== + struct raft_request; struct vclock; enum raft_state { + /** + * Can't write. Can only accept data from a leader. Node in this state + * either monitors an existing leader, or there is an on-going election + * and the node voted for another node, or it can't be a candidate and + * does not do anything. + */ RAFT_STATE_FOLLOWER = 1, + /** + * The node can't write. There is an active election, in which the node + * voted for self. Now it waits for election outcome. + */ RAFT_STATE_CANDIDATE = 2, + /** Election was successful. The node accepts write requests. */ RAFT_STATE_LEADER = 3, }; @@ -54,31 +93,27 @@ struct raft { /** State of the instance. */ enum raft_state state; /** - * Volatile part of the Raft state, whose WAL write may be - * still in-progress, and yet the state may be already - * used. Volatile state is never sent to anywhere, but the - * state machine makes decisions based on it. That is - * vital. - * As an example, volatile vote needs to be used to reject - * votes inside a term, where the instance already voted - * (even if the vote WAL write is not finished yet). - * Otherwise the instance would try to write several votes - * inside one term. 
+ * Volatile part of the Raft state, whose WAL write may be still + * in-progress, and yet the state may be already used. Volatile state is + * never sent to anywhere, but the state machine makes decisions based + * on it. That is vital. + * As an example, volatile vote needs to be used to reject votes inside + * a term, where the instance already voted (even if the vote WAL write + * is not finished yet). Otherwise the instance would try to write + * several votes inside one term. */ uint64_t volatile_term; uint32_t volatile_vote; /** - * Flag whether Raft is enabled. When disabled, it still - * persists terms so as to quickly enroll into the cluster - * when (if) it is enabled. In everything else disabled - * Raft does not affect instance work. + * Flag whether Raft is enabled. When disabled, it still persists terms + * so as to quickly enroll into the cluster when (if) it is enabled. In + * everything else disabled Raft does not affect instance work. */ bool is_enabled; /** - * Flag whether the node can become a leader. It is an - * accumulated value of configuration options Raft enabled - * Raft candidate. If at least one is false - the instance - * is not a candidate. + * Flag whether the node can become a leader. It is an accumulated value + * of configuration options Raft enabled and Raft candidate. If at least + * one is false - the instance is not a candidate. */ bool is_candidate; /** Flag whether the instance is allowed to be a leader. */ @@ -94,7 +129,10 @@ struct raft { */ uint64_t term; uint32_t vote; - /** Bit 1 means that a vote from that instance was obtained. */ + /** + * Bit 1 on position N means that a vote from instance with ID = N was + * obtained. + */ uint32_t vote_mask; /** Number of votes for this instance. Valid only in candidate state. 
*/ int vote_count; @@ -110,24 +148,25 @@ struct raft { extern struct raft raft; -void -raft_new_term(uint64_t min_new_term); - -void -raft_vote(uint32_t vote_for); - +/** + * A flag whether the instance is read-only according to Raft. Even if Raft + * allows writes though, it does not mean the instance is writable. It can be + * affected by box.cfg.read_only, connection quorum. + */ static inline bool raft_is_ro(void) { return raft.is_enabled && raft.state != RAFT_STATE_LEADER; } +/** See if the instance can accept rows from an instance with the given ID. */ static inline bool raft_is_source_allowed(uint32_t source_id) { return !raft.is_enabled || raft.leader == source_id; } +/** Check if Raft is enabled. */ static inline bool raft_is_enabled(void) { @@ -146,59 +185,66 @@ raft_process_recovery(const struct raft_request *req); void raft_process_msg(const struct raft_request *req, uint32_t source); +/** + * Process a heartbeat message from an instance with the given ID. It is used to + * watch leader's health and start election when necessary. + */ void raft_process_heartbeat(uint32_t source); -/** - * Broadcast the changes in this instance's raft status to all - * the followers. - */ +/** Configure whether Raft is enabled. */ void raft_cfg_is_enabled(bool is_enabled); +/** + * Configure whether the instance can be elected as Raft leader. Even if false, + * the node still can vote, when Raft is enabled. + */ void raft_cfg_is_candidate(bool is_candidate); +/** Configure Raft leader election timeout. */ void raft_cfg_election_timeout(double timeout); +/** + * Configure Raft leader election quorum. There is no a separate option. + * Instead, synchronous replication quorum is used. Since Raft is tightly bound + * with synchronous replication. + */ void raft_cfg_election_quorum(void); +/** + * Configure Raft leader death timeout. I.e. number of seconds without + * heartbeats from the leader to consider it dead. There is no a separate + * option. 
Raft uses replication timeout for that. + */ void raft_cfg_death_timeout(void); -/** Save complete Raft state into the request. */ +/** + * Save complete Raft state into a request to be sent to other instances of the + * cluster. It is allowed to save anything here, not only persistent state. + */ void -raft_serialize(struct raft_request *req, struct vclock *vclock); +raft_serialize_for_network(struct raft_request *req, struct vclock *vclock); ================================================== Serialization was split in 2 calls - for disk and for network. Because for disk state and vclock are never needed. It looked ugly to pass NULL vclock and manually nullify request's state after serialize() call. ================================================== /** - * Broadcast the changes in this instance's raft status to all - * the followers. + * Save complete Raft state into a request to be persisted on disk. Only term + * and vote are being persisted. */ void -raft_broadcast(const struct raft_request *req); +raft_serialize_for_disk(struct raft_request *req); /** - * Bootstrap the current instance as the first leader of the cluster. That is - * done bypassing the Raft election protocol, by just assigning this node a - * leader role. That is needed, because when the cluster is not bootstrapped, it - * is necessary to find a node, which will generate a replicaset UUID, write it - * into _cluster space, and register all the other nodes in _cluster. - * Until it is done, all nodes but one won't boot. Their WALs won't work. And - * therefore they won't be able to participate in leader election. That - * effectively makes the cluster dead from the beginning unless the first - * bootstrapped node won't declare itself a leader without elections. - * - * XXX: That does not solve the problem, when the first node boots, creates a - * snapshot, and then immediately dies. After recovery it won't declare itself a - * leader. 
Therefore if quorum > 1, the cluster won't proceed to registering - * any replicas and becomes completely dead. Perhaps that must be solved by - * truncating quorum down to number of records in _cluster. + * Broadcast the changes in this instance's raft status to all + * the followers. */ void -raft_bootstrap_leader(void); +raft_broadcast(const struct raft_request *req); +/** Initialize Raft global data structures. */ void raft_init(void); diff --git a/src/box/replication.cc b/src/box/replication.cc index ef0e2411d..20f16206a 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -247,6 +247,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id) tt_uuid_str(&replica->uuid)); } replicaset.replica_by_id[replica_id] = replica; + ++replicaset.size; say_info("assigned id %d to replica %s", replica->id, tt_uuid_str(&replica->uuid)); @@ -267,6 +268,8 @@ replica_clear_id(struct replica *replica) * replication. */ replicaset.replica_by_id[replica->id] = NULL; + assert(replicaset.size > 0); + --replicaset.size; if (replica->id == instance_id) { /* See replica_check_id(). */ assert(replicaset.is_joining); diff --git a/src/box/replication.h b/src/box/replication.h index ddc2bddf4..69cc820c9 100644 --- a/src/box/replication.h +++ b/src/box/replication.h @@ -217,6 +217,13 @@ struct replicaset { bool is_joining; /* A number of anonymous replicas following this instance. */ int anon_count; + /** + * Number of registered replicas. That includes all of them - connected, + * disconnected, connected not directly, just present in _cluster. If an + * instance has an ID, has the same replicaset UUID, then it is + * accounted here. + */ + int size; /** Applier state. */ struct { /** diff --git a/test/box/info.result b/test/box/info.result index 40eeae069..d0abb634a 100644 --- a/test/box/info.result +++ b/test/box/info.result @@ -82,6 +82,7 @@ t - memory - package - pid + - raft - replication - replication_anon - ro