From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtpng3.m.smailru.net (smtpng3.m.smailru.net [94.100.177.149]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id E8CDA45C310 for ; Fri, 20 Nov 2020 02:46:34 +0300 (MSK) From: Vladislav Shpilevoy Date: Fri, 20 Nov 2020 00:46:11 +0100 Message-Id: <83c8a35a8f5c29d6cf6d7837ba2fef84a84ab7f3.1605829282.git.v.shpilevoy@tarantool.org> In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH v2 08/16] raft: introduce vtab for disk and network List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: tarantool-patches@dev.tarantool.org, sergepetrenko@tarantool.org Raft is being moved to a separate library in src/lib. This means it can't depend on anything from box/. The patch makes raft stop using replicaset and journal objects. They were used to broadcast messages to all the other nodes, and to persist updates. Now Raft does the same through a vtab, which is configured by box. Broadcast still sends messages via relays, and disk write still uses the journal. But Raft does not depend on any specific journal or network API. Part of #5303 --- src/box/raft.c | 63 ++++++++++++++++++++++++++++++++- src/box/raftlib.c | 89 +++++++++++++++-------------------------------- src/box/raftlib.h | 24 ++++++++++++- 3 files changed, 114 insertions(+), 62 deletions(-) diff --git a/src/box/raft.c b/src/box/raft.c index dae5a559c..5efff80a0 100644 --- a/src/box/raft.c +++ b/src/box/raft.c @@ -29,7 +29,10 @@ * SUCH DAMAGE. 
*/ #include "box.h" +#include "error.h" +#include "journal.h" #include "raft.h" +#include "relay.h" #include "replication.h" struct raft box_raft_global = { @@ -114,10 +117,68 @@ box_raft_update_election_quorum(void) raft_cfg_election_quorum(box_raft(), quorum); } +static void +box_raft_broadcast(struct raft *raft, const struct raft_request *req) +{ + (void)raft; + assert(raft == box_raft()); + replicaset_foreach(replica) + relay_push_raft(replica->relay, req); +} + +/** Wakeup Raft state writer fiber waiting for WAL write end. */ +static void +box_raft_write_cb(struct journal_entry *entry) +{ + fiber_wakeup(entry->complete_data); +} + +static void +box_raft_write(struct raft *raft, const struct raft_request *req) +{ + (void)raft; + assert(raft == box_raft()); + /* See Raft implementation why these fields are never written. */ + assert(req->vclock == NULL); + assert(req->state == 0); + + struct region *region = &fiber()->gc; + uint32_t svp = region_used(region); + struct xrow_header row; + char buf[sizeof(struct journal_entry) + + sizeof(struct xrow_header *)]; + struct journal_entry *entry = (struct journal_entry *)buf; + entry->rows[0] = &row; + + if (xrow_encode_raft(&row, region, req) != 0) + goto fail; + journal_entry_create(entry, 1, xrow_approx_len(&row), box_raft_write_cb, + fiber()); + + if (journal_write(entry) != 0 || entry->res < 0) { + diag_set(ClientError, ER_WAL_IO); + diag_log(); + goto fail; + } + + region_truncate(region, svp); + return; +fail: + /* + * XXX: the stub is supposed to be removed once it is defined what to do + * when a raft request WAL write fails. 
+ */ + panic("Could not write a raft request to WAL\n"); +} + void box_raft_init(void) { - raft_create(&box_raft_global); + static const struct raft_vtab box_raft_vtab = { + .broadcast = box_raft_broadcast, + .write = box_raft_write, + }; + raft_create(&box_raft_global, &box_raft_vtab); trigger_create(&box_raft_on_update, box_raft_on_update_f, NULL, NULL); raft_on_update(box_raft(), &box_raft_on_update); } diff --git a/src/box/raftlib.c b/src/box/raftlib.c index ab2e27fd8..2f5e90f21 100644 --- a/src/box/raftlib.c +++ b/src/box/raftlib.c @@ -31,11 +31,9 @@ #include "raft.h" #include "error.h" -#include "journal.h" +#include "fiber.h" #include "xrow.h" #include "small/region.h" -#include "replication.h" -#include "relay.h" #include "box.h" #include "tt_static.h" @@ -64,6 +62,20 @@ raft_state_str(uint32_t state) return "invalid (x)"; }; +/** Shortcut for vtab 'broadcast' method. */ +static inline void +raft_broadcast(struct raft *raft, const struct raft_request *req) +{ + raft->vtab->broadcast(raft, req); +} + +/** Shortcut for vtab 'write' method. */ +static inline void +raft_write(struct raft *raft, const struct raft_request *req) +{ + raft->vtab->write(raft, req); +} + /** * Check if Raft is completely synced with disk. Meaning all its critical values * are in WAL. Only in that state the node can become a leader or a candidate. @@ -469,58 +481,6 @@ raft_process_heartbeat(struct raft *raft, uint32_t source) raft_sm_wait_leader_dead(raft); } -/** Wakeup Raft state writer fiber waiting for WAL write end. */ -static void -raft_write_cb(struct journal_entry *entry) -{ - fiber_wakeup(entry->complete_data); -} - -/** Synchronously write a Raft request into WAL. */ -static void -raft_write_request(const struct raft_request *req) -{ - /* - * Vclock is never persisted by Raft. It is used only to - * be sent to network when vote for self. - */ - assert(req->vclock == NULL); - /* - * State is not persisted. That would be strictly against Raft protocol. 
- * The reason is that it does not make much sense - even if the node is - * a leader now, after the node is restarted, there will be another - * leader elected by that time likely. - */ - assert(req->state == 0); - struct region *region = &fiber()->gc; - uint32_t svp = region_used(region); - struct xrow_header row; - char buf[sizeof(struct journal_entry) + - sizeof(struct xrow_header *)]; - struct journal_entry *entry = (struct journal_entry *)buf; - entry->rows[0] = &row; - - if (xrow_encode_raft(&row, region, req) != 0) - goto fail; - journal_entry_create(entry, 1, xrow_approx_len(&row), raft_write_cb, - fiber()); - - if (journal_write(entry) != 0 || entry->res < 0) { - diag_set(ClientError, ER_WAL_IO); - diag_log(); - goto fail; - } - - region_truncate(region, svp); - return; -fail: - /* - * XXX: the stub is supposed to be removed once it is defined what to do - * when a raft request WAL write fails. - */ - panic("Could not write a raft request to WAL\n"); -} - /* Dump Raft state to WAL in a blocking way. */ static void raft_worker_handle_io(struct raft *raft) @@ -567,8 +527,17 @@ end_dump: assert(raft->volatile_term >= raft->term); req.term = raft->volatile_term; req.vote = raft->volatile_vote; - - raft_write_request(&req); + /* + * Skip vclock. It is used only to be sent to network when vote + * for self. It is a job of the vclock owner to persist it + * anyhow. + * + * Skip state. That would be strictly against Raft protocol. The + * reason is that it does not make much sense - even if the node + * is a leader now, after the node is restarted, there will be + * another leader elected by that time likely. 
+ */ + raft_write(raft, &req); say_info("RAFT: persisted state %s", raft_request_to_string(&req)); @@ -598,8 +567,7 @@ raft_worker_handle_broadcast(struct raft *raft) assert(raft->vote == raft->self); req.vclock = raft->vclock; } - replicaset_foreach(replica) - relay_push_raft(replica->relay, &req); + raft_broadcast(raft, &req); trigger_run(&raft->on_update, raft); raft->is_broadcast_scheduled = false; } @@ -1038,7 +1006,7 @@ raft_schedule_broadcast(struct raft *raft) } void -raft_create(struct raft *raft) +raft_create(struct raft *raft, const struct raft_vtab *vtab) { *raft = (struct raft) { .state = RAFT_STATE_FOLLOWER, @@ -1047,6 +1015,7 @@ raft_create(struct raft *raft) .election_quorum = 1, .election_timeout = 5, .death_timeout = 5, + .vtab = vtab, }; ev_timer_init(&raft->timer, raft_sm_schedule_new_election_cb, 0, 0); raft->timer.data = raft; diff --git a/src/box/raftlib.h b/src/box/raftlib.h index 8d0d03da0..6181d9d49 100644 --- a/src/box/raftlib.h +++ b/src/box/raftlib.h @@ -69,6 +69,7 @@ extern "C" { */ struct fiber; +struct raft; struct raft_request; enum raft_state { @@ -94,6 +95,21 @@ enum raft_state { const char * raft_state_str(uint32_t state); +typedef void (*raft_broadcast_f)(struct raft *raft, + const struct raft_request *req); +typedef void (*raft_write_f)(struct raft *raft, const struct raft_request *req); + +/** + * Raft connection to the environment, via which it talks to other nodes and + * saves something to disk. + */ +struct raft_vtab { + /** Send a message to all nodes in the cluster. */ + raft_broadcast_f broadcast; + /** Save a message to disk. */ + raft_write_f write; +}; + struct raft { /** Instance ID of this node. */ uint32_t self; @@ -174,6 +190,8 @@ struct raft { * elections can be started. */ double death_timeout; + /** Virtual table to perform application-specific actions. */ + const struct raft_vtab *vtab; /** * Trigger invoked each time any of the Raft node visible attributes are * changed. 
@@ -295,8 +313,12 @@ raft_serialize_for_disk(const struct raft *raft, struct raft_request *req); void raft_on_update(struct raft *raft, struct trigger *trigger); +/** + * Create a Raft node. The vtab is not copied. Its memory should stay valid even + * after the creation. + */ void -raft_create(struct raft *raft); +raft_create(struct raft *raft, const struct raft_vtab *vtab); void raft_destroy(struct raft *raft); -- 2.24.3 (Apple Git-128)