From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp49.i.mail.ru (smtp49.i.mail.ru [94.100.177.109]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 20397469710 for ; Tue, 17 Nov 2020 18:16:33 +0300 (MSK) From: Serge Petrenko Date: Tue, 17 Nov 2020 18:16:19 +0300 Message-Id: <20201117151619.68468-1-sergepetrenko@tarantool.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH] raft: execute triggers exactly on state change List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: v.shpilevoy@tarantool.org, gorcunov@gmail.com Cc: tarantool-patches@dev.tarantool.org Raft triggers on state change should be executed right on state change, in the same fiber and without yields. This solves two problems: 1) instance may be rw for a short period of time after becoming follower 2) leader may become rw even before clearing the limbo after the previous leader In order to achieve this introduce a new state, RAFT_STATE_NONE, which takes part of the responsibilities RAFT_STATE_FOLLOWER had. The state indicates that raft is turned off, so that both "raft is off" and "state is follower" events may be delivered to the triggers. Closes #5440 --- sp/gh-5440-on-state-update https://github.com/tarantool/tarantool/issues/5440 src/box/box.cc | 3 +- src/box/raft.c | 17 ++++---- src/lib/raft/raft.c | 49 +++++++++++++++--------- src/lib/raft/raft.h | 2 + test/replication/election_basic.result | 4 +- test/replication/election_basic.test.lua | 4 +- 6 files changed, 48 insertions(+), 31 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index 7d23de95c..7c8fba6d3 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -791,8 +791,9 @@ box_set_election_mode(void) const char *mode = box_check_election_mode(); if (mode == NULL) return -1; - raft_cfg_is_candidate(box_raft(), strcmp(mode, "candidate") == 0); + /* Order is important for correct state transitions. */ raft_cfg_is_enabled(box_raft(), strcmp(mode, "off") != 0); + raft_cfg_is_candidate(box_raft(), strcmp(mode, "candidate") == 0); return 0; } diff --git a/src/box/raft.c b/src/box/raft.c index db1a3f423..bb26398a3 100644 --- a/src/box/raft.c +++ b/src/box/raft.c @@ -77,16 +77,17 @@ box_raft_on_update_f(struct trigger *trigger, void *event) (void)trigger; struct raft *raft = (struct raft *)event; assert(raft == box_raft()); + if (raft->state == RAFT_STATE_LEADER) { + /* + * When the node became a leader, it means it will ignore all + * records from all the other nodes, and won't get late CONFIRM + * messages anyway. Can clear the queue without waiting for + * confirmations. + */ + box_clear_synchro_queue(false); + } /* State or enablence could be changed, affecting read-only state. */ box_update_ro_summary(); - if (raft->state != RAFT_STATE_LEADER) - return 0; - /* - * When the node became a leader, it means it will ignore all records - * from all the other nodes, and won't get late CONFIRM messages anyway. - * Can clear the queue without waiting for confirmations. - */ - box_clear_synchro_queue(false); return 0; } diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c index b669475f3..80b4d0211 100644 --- a/src/lib/raft/raft.c +++ b/src/lib/raft/raft.c @@ -52,6 +52,7 @@ raft_state_str(uint32_t state) [RAFT_STATE_FOLLOWER] = "follower", [RAFT_STATE_CANDIDATE] = "candidate", [RAFT_STATE_LEADER] = "leader", + [RAFT_STATE_NONE] = "none", }; if (state < lengthof(str)) @@ -60,6 +61,17 @@ raft_state_str(uint32_t state) return "invalid (x)"; }; +/** Set raft state to a new value and run triggers on state change. */ +static void +raft_set_state(struct raft *raft, uint32_t state) +{ + assert(state != 0 && state <= RAFT_STATE_NONE); + if (state == raft->state) + return; + raft->state = state; + trigger_run(&raft->on_update, raft); +} + /** * Check if Raft is completely synced with disk. Meaning all its critical values * are in WAL. Only in that state the node can become a leader or a candidate. @@ -316,13 +328,12 @@ raft_process_msg(struct raft *raft, const struct raft_msg *req, uint32_t source) */ if (req->vote != 0) { switch (raft->state) { + case RAFT_STATE_NONE: + say_info("RAFT: vote request is skipped - RAFT " + "is disabled"); + break; case RAFT_STATE_FOLLOWER: case RAFT_STATE_LEADER: - if (!raft->is_enabled) { - say_info("RAFT: vote request is skipped - RAFT " - "is disabled"); - break; - } if (raft->leader != 0) { say_info("RAFT: vote request is skipped - the " "leader is already known - %u", @@ -467,8 +478,8 @@ static void raft_worker_handle_io(struct raft *raft) { assert(raft->is_write_in_progress); - /* During write Raft can't be anything but a follower. */ - assert(raft->state == RAFT_STATE_FOLLOWER); + /* During write Raft can't be anything but a follower or turned off. */ + assert(raft->state == RAFT_STATE_FOLLOWER || !raft->is_enabled); struct raft_msg req; if (raft_is_fully_on_disk(raft)) { @@ -548,7 +559,6 @@ raft_worker_handle_broadcast(struct raft *raft) req.vclock = raft->vclock; } raft->vtab->broadcast(raft, &req); - trigger_run(&raft->on_update, raft); raft->is_broadcast_scheduled = false; } @@ -581,7 +591,7 @@ raft_worker_f(va_list args) static void raft_sm_pause_and_dump(struct raft *raft) { - assert(raft->state == RAFT_STATE_FOLLOWER); + assert(raft->state == RAFT_STATE_FOLLOWER || !raft->is_enabled); if (raft->is_write_in_progress) return; ev_timer_stop(loop(), &raft->timer); @@ -598,7 +608,7 @@ raft_sm_become_leader(struct raft *raft) assert(raft->leader == 0); assert(raft->is_candidate); assert(!raft->is_write_in_progress); - raft->state = RAFT_STATE_LEADER; + raft_set_state(raft, RAFT_STATE_LEADER); raft->leader = raft->self; ev_timer_stop(loop(), &raft->timer); /* State is visible and it is changed - broadcast. */ @@ -611,7 +621,8 @@ raft_sm_follow_leader(struct raft *raft, uint32_t leader) say_info("RAFT: leader is %u, follow", leader); assert(raft->state != RAFT_STATE_LEADER); assert(raft->leader == 0); - raft->state = RAFT_STATE_FOLLOWER; + if (raft->is_enabled) + raft_set_state(raft, RAFT_STATE_FOLLOWER); raft->leader = leader; if (!raft->is_write_in_progress && raft->is_candidate) { ev_timer_stop(loop(), &raft->timer); @@ -631,7 +642,7 @@ raft_sm_become_candidate(struct raft *raft) assert(raft->is_candidate); assert(!raft->is_write_in_progress); assert(raft->election_quorum > 1); - raft->state = RAFT_STATE_CANDIDATE; + raft_set_state(raft, RAFT_STATE_CANDIDATE); raft->vote_count = 1; raft->vote_mask = 0; bit_set(&raft->vote_mask, raft->self); @@ -650,7 +661,8 @@ raft_sm_schedule_new_term(struct raft *raft, uint64_t new_term) /* New terms means completely new Raft state. */ raft->volatile_vote = 0; raft->leader = 0; - raft->state = RAFT_STATE_FOLLOWER; + if (raft->is_enabled) + raft_set_state(raft, RAFT_STATE_FOLLOWER); raft_sm_pause_and_dump(raft); /* * State is visible and it is changed - broadcast. Term is also visible, @@ -740,9 +752,10 @@ raft_sm_start(struct raft *raft) say_info("RAFT: start state machine"); assert(!ev_is_active(&raft->timer)); assert(!raft->is_enabled); - assert(raft->state == RAFT_STATE_FOLLOWER); + assert(raft->state == RAFT_STATE_NONE); raft->is_enabled = true; raft->is_candidate = raft->is_cfg_candidate; + raft_set_state(raft, RAFT_STATE_FOLLOWER); if (raft->is_write_in_progress) { /* * Nop. If write is in progress, the state machine is frozen. It @@ -784,7 +797,7 @@ raft_sm_stop(struct raft *raft) raft->is_candidate = false; if (raft->state == RAFT_STATE_LEADER) raft->leader = 0; - raft->state = RAFT_STATE_FOLLOWER; + raft_set_state(raft, RAFT_STATE_NONE); ev_timer_stop(loop(), &raft->timer); /* State is visible and changed - broadcast. */ raft_schedule_broadcast(raft); @@ -841,7 +854,7 @@ raft_cfg_is_candidate(struct raft *raft, bool is_candidate) bool old_is_candidate = raft->is_candidate; raft->is_cfg_candidate = is_candidate; raft->is_candidate = is_candidate && raft->is_enabled; - if (raft->is_candidate == old_is_candidate) + if (raft->is_candidate == old_is_candidate || !raft->is_enabled) return; if (raft->is_candidate) { @@ -866,7 +879,7 @@ raft_cfg_is_candidate(struct raft *raft, bool is_candidate) if (raft->state != RAFT_STATE_FOLLOWER) { if (raft->state == RAFT_STATE_LEADER) raft->leader = 0; - raft->state = RAFT_STATE_FOLLOWER; + raft_set_state(raft, RAFT_STATE_FOLLOWER); /* State is visible and changed - broadcast. */ raft_schedule_broadcast(raft); } @@ -982,7 +995,7 @@ void raft_create(struct raft *raft, const struct raft_vtab *vtab) { *raft = (struct raft) { - .state = RAFT_STATE_FOLLOWER, + .state = RAFT_STATE_NONE, .volatile_term = 1, .term = 1, .election_quorum = 1, diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h index 4f4d24ca8..0f9d84f27 100644 --- a/src/lib/raft/raft.h +++ b/src/lib/raft/raft.h @@ -86,6 +86,8 @@ enum raft_state { RAFT_STATE_CANDIDATE = 2, /** Election was successful. The node accepts write requests. */ RAFT_STATE_LEADER = 3, + /* Empty state. Indicates that raft state machine is stopped. */ + RAFT_STATE_NONE = 4, }; /** diff --git a/test/replication/election_basic.result b/test/replication/election_basic.result index 4d7d33f2b..9f46696e7 100644 --- a/test/replication/election_basic.result +++ b/test/replication/election_basic.result @@ -36,7 +36,7 @@ box.cfg{election_timeout = 0} | number' | ... --- When election is disabled, the instance is a follower. Does not try to become +-- When election is disabled, the instance does not try to become -- a leader, and does not block write operations. term = box.info.election.term | --- @@ -44,7 +44,7 @@ term = box.info.election.term vote = box.info.election.vote | --- | ... -assert(box.info.election.state == 'follower') +assert(box.info.election.state == 'none') | --- | - true | ... diff --git a/test/replication/election_basic.test.lua b/test/replication/election_basic.test.lua index 821f73cea..21acc0a58 100644 --- a/test/replication/election_basic.test.lua +++ b/test/replication/election_basic.test.lua @@ -13,11 +13,11 @@ box.cfg{election_mode = '100'} box.cfg{election_timeout = -1} box.cfg{election_timeout = 0} --- When election is disabled, the instance is a follower. Does not try to become +-- When election is disabled, the instance does not try to become -- a leader, and does not block write operations. term = box.info.election.term vote = box.info.election.vote -assert(box.info.election.state == 'follower') +assert(box.info.election.state == 'none') assert(box.info.election.leader == 0) assert(not box.info.ro) -- 2.24.3 (Apple Git-128)