[Tarantool-patches] [PATCH v2 08/16] raft: introduce vtab for disk and network

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Fri Nov 20 02:46:11 MSK 2020


Raft is being moved to a separate library in src/lib. This means
it can't depend on anything from box/.

The patch makes raft stop using replicaset and journal objects.
They were used to broadcast messages to all the other nodes, and
to persist updates.

Now Raft does the same through a vtab, which is configured by box.
Broadcast still sends messages via relays, and disk write still
uses the journal. But Raft does not depend on any specific journal
or network API.

Part of #5303
---
 src/box/raft.c    | 63 ++++++++++++++++++++++++++++++++-
 src/box/raftlib.c | 89 +++++++++++++++--------------------------------
 src/box/raftlib.h | 24 ++++++++++++-
 3 files changed, 114 insertions(+), 62 deletions(-)

diff --git a/src/box/raft.c b/src/box/raft.c
index dae5a559c..5efff80a0 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -29,7 +29,10 @@
  * SUCH DAMAGE.
  */
 #include "box.h"
+#include "error.h"
+#include "journal.h"
 #include "raft.h"
+#include "relay.h"
 #include "replication.h"
 
 struct raft box_raft_global = {
@@ -114,10 +117,68 @@ box_raft_update_election_quorum(void)
 	raft_cfg_election_quorum(box_raft(), quorum);
 }
 
+static void
+box_raft_broadcast(struct raft *raft, const struct raft_request *req)
+{
+	(void)raft;
+	assert(raft == box_raft());
+	replicaset_foreach(replica)
+		relay_push_raft(replica->relay, req);
+}
+
+/** Wake up the Raft state writer fiber waiting for the WAL write to end. */
+static void
+box_raft_write_cb(struct journal_entry *entry)
+{
+	fiber_wakeup(entry->complete_data);
+}
+
+static void
+box_raft_write(struct raft *raft, const struct raft_request *req)
+{
+	(void)raft;
+	assert(raft == box_raft());
+	/* See Raft implementation for why these fields are never written. */
+	assert(req->vclock == NULL);
+	assert(req->state == 0);
+
+	struct region *region = &fiber()->gc;
+	uint32_t svp = region_used(region);
+	struct xrow_header row;
+	char buf[sizeof(struct journal_entry) +
+		 sizeof(struct xrow_header *)];
+	struct journal_entry *entry = (struct journal_entry *)buf;
+	entry->rows[0] = &row;
+
+	if (xrow_encode_raft(&row, region, req) != 0)
+		goto fail;
+	journal_entry_create(entry, 1, xrow_approx_len(&row), box_raft_write_cb,
+			     fiber());
+
+	if (journal_write(entry) != 0 || entry->res < 0) {
+		diag_set(ClientError, ER_WAL_IO);
+		diag_log();
+		goto fail;
+	}
+
+	region_truncate(region, svp);
+	return;
+fail:
+	/*
+	 * XXX: the stub is supposed to be removed once it is decided what to do
+	 * when a raft request WAL write fails.
+	 */
+	panic("Could not write a raft request to WAL\n");
+}
+
 void
 box_raft_init(void)
 {
-	raft_create(&box_raft_global);
+	static const struct raft_vtab box_raft_vtab = {
+		.broadcast = box_raft_broadcast,
+		.write = box_raft_write,
+	};
+	raft_create(&box_raft_global, &box_raft_vtab);
 	trigger_create(&box_raft_on_update, box_raft_on_update_f, NULL, NULL);
 	raft_on_update(box_raft(), &box_raft_on_update);
 }
diff --git a/src/box/raftlib.c b/src/box/raftlib.c
index ab2e27fd8..2f5e90f21 100644
--- a/src/box/raftlib.c
+++ b/src/box/raftlib.c
@@ -31,11 +31,9 @@
 #include "raft.h"
 
 #include "error.h"
-#include "journal.h"
+#include "fiber.h"
 #include "xrow.h"
 #include "small/region.h"
-#include "replication.h"
-#include "relay.h"
 #include "box.h"
 #include "tt_static.h"
 
@@ -64,6 +62,20 @@ raft_state_str(uint32_t state)
 	return "invalid (x)";
 };
 
+/** Shortcut for vtab 'broadcast' method. */
+static inline void
+raft_broadcast(struct raft *raft, const struct raft_request *req)
+{
+	raft->vtab->broadcast(raft, req);
+}
+
+/** Shortcut for vtab 'write' method. */
+static inline void
+raft_write(struct raft *raft, const struct raft_request *req)
+{
+	raft->vtab->write(raft, req);
+}
+
 /**
  * Check if Raft is completely synced with disk. Meaning all its critical values
  * are in WAL. Only in that state the node can become a leader or a candidate.
@@ -469,58 +481,6 @@ raft_process_heartbeat(struct raft *raft, uint32_t source)
 	raft_sm_wait_leader_dead(raft);
 }
 
-/** Wakeup Raft state writer fiber waiting for WAL write end. */
-static void
-raft_write_cb(struct journal_entry *entry)
-{
-	fiber_wakeup(entry->complete_data);
-}
-
-/** Synchronously write a Raft request into WAL. */
-static void
-raft_write_request(const struct raft_request *req)
-{
-	/*
-	 * Vclock is never persisted by Raft. It is used only to
-	 * be sent to network when vote for self.
-	 */
-	assert(req->vclock == NULL);
-	/*
-	 * State is not persisted. That would be strictly against Raft protocol.
-	 * The reason is that it does not make much sense - even if the node is
-	 * a leader now, after the node is restarted, there will be another
-	 * leader elected by that time likely.
-	 */
-	assert(req->state == 0);
-	struct region *region = &fiber()->gc;
-	uint32_t svp = region_used(region);
-	struct xrow_header row;
-	char buf[sizeof(struct journal_entry) +
-		 sizeof(struct xrow_header *)];
-	struct journal_entry *entry = (struct journal_entry *)buf;
-	entry->rows[0] = &row;
-
-	if (xrow_encode_raft(&row, region, req) != 0)
-		goto fail;
-	journal_entry_create(entry, 1, xrow_approx_len(&row), raft_write_cb,
-			     fiber());
-
-	if (journal_write(entry) != 0 || entry->res < 0) {
-		diag_set(ClientError, ER_WAL_IO);
-		diag_log();
-		goto fail;
-	}
-
-	region_truncate(region, svp);
-	return;
-fail:
-	/*
-	 * XXX: the stub is supposed to be removed once it is defined what to do
-	 * when a raft request WAL write fails.
-	 */
-	panic("Could not write a raft request to WAL\n");
-}
-
 /* Dump Raft state to WAL in a blocking way. */
 static void
 raft_worker_handle_io(struct raft *raft)
@@ -567,8 +527,17 @@ end_dump:
 		assert(raft->volatile_term >= raft->term);
 		req.term = raft->volatile_term;
 		req.vote = raft->volatile_vote;
-
-		raft_write_request(&req);
+		/*
+		 * Skip vclock. It is only sent over the network when voting
+		 * for self. It is the job of the vclock owner to persist it
+		 * somehow.
+		 *
+		 * Skip state. Persisting it would be strictly against the Raft
+		 * protocol, and it makes little sense: even if the node is a
+		 * leader now, another leader will likely have been elected by
+		 * the time the node is restarted.
+		 */
+		raft_write(raft, &req);
 		say_info("RAFT: persisted state %s",
 			 raft_request_to_string(&req));
 
@@ -598,8 +567,7 @@ raft_worker_handle_broadcast(struct raft *raft)
 		assert(raft->vote == raft->self);
 		req.vclock = raft->vclock;
 	}
-	replicaset_foreach(replica)
-		relay_push_raft(replica->relay, &req);
+	raft_broadcast(raft, &req);
 	trigger_run(&raft->on_update, raft);
 	raft->is_broadcast_scheduled = false;
 }
@@ -1038,7 +1006,7 @@ raft_schedule_broadcast(struct raft *raft)
 }
 
 void
-raft_create(struct raft *raft)
+raft_create(struct raft *raft, const struct raft_vtab *vtab)
 {
 	*raft = (struct raft) {
 		.state = RAFT_STATE_FOLLOWER,
@@ -1047,6 +1015,7 @@ raft_create(struct raft *raft)
 		.election_quorum = 1,
 		.election_timeout = 5,
 		.death_timeout = 5,
+		.vtab = vtab,
 	};
 	ev_timer_init(&raft->timer, raft_sm_schedule_new_election_cb, 0, 0);
 	raft->timer.data = raft;
diff --git a/src/box/raftlib.h b/src/box/raftlib.h
index 8d0d03da0..6181d9d49 100644
--- a/src/box/raftlib.h
+++ b/src/box/raftlib.h
@@ -69,6 +69,7 @@ extern "C" {
  */
 
 struct fiber;
+struct raft;
 struct raft_request;
 
 enum raft_state {
@@ -94,6 +95,21 @@ enum raft_state {
 const char *
 raft_state_str(uint32_t state);
 
+typedef void (*raft_broadcast_f)(struct raft *raft,
+				 const struct raft_request *req);
+typedef void (*raft_write_f)(struct raft *raft, const struct raft_request *req);
+
+/**
+ * Raft's connection to its environment, through which it talks to other nodes
+ * and saves Raft requests to disk.
+ */
+struct raft_vtab {
+	/** Send a message to all nodes in the cluster. */
+	raft_broadcast_f broadcast;
+	/** Save a message to disk. */
+	raft_write_f write;
+};
+
 struct raft {
 	/** Instance ID of this node. */
 	uint32_t self;
@@ -174,6 +190,8 @@ struct raft {
 	 * elections can be started.
 	 */
 	double death_timeout;
+	/** Virtual table to perform application-specific actions. */
+	const struct raft_vtab *vtab;
 	/**
 	 * Trigger invoked each time any of the Raft node visible attributes are
 	 * changed.
@@ -295,8 +313,12 @@ raft_serialize_for_disk(const struct raft *raft, struct raft_request *req);
 void
 raft_on_update(struct raft *raft, struct trigger *trigger);
 
+/**
+ * Create a Raft node. The vtab is not copied, only the pointer is stored, so
+ * its memory must stay valid for as long as the node is in use.
+ */
 void
-raft_create(struct raft *raft);
+raft_create(struct raft *raft, const struct raft_vtab *vtab);
 
 void
 raft_destroy(struct raft *raft);
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list