[Tarantool-patches] [PATCH v2 11/11] [tosquash] raft: a swarm of minor fixes

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Thu Sep 10 02:16:52 MSK 2020


Major change - ev_check/ev_prepare didn't work. They are invoked
from a context, where yields are not possible. To a blocking WAL
write can't be done there. And an async WAL write requires too
many changes.

This patch adds a worker fiber to Raft to perform state dumps.

Other changes are fixes of typos and of minor bugs.
---
 src/box/raft.c | 58 ++++++++++++++++++++++++++++++++++----------------
 src/box/raft.h |  7 +++---
 2 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/src/box/raft.c b/src/box/raft.c
index b01e65ced..e4e0b037c 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -64,6 +64,7 @@ struct raft raft = {
 	.vote = 0,
 	.vote_mask = 0,
 	.vote_count = 0,
+	.worker = NULL,
 	.election_timeout = 5,
 };
 
@@ -250,13 +251,6 @@ raft_sm_schedule_new_election_cb(struct ev_loop *loop, struct ev_timer *timer,
 static void
 raft_sm_pause_and_dump(void);
 
-/**
- * Flush Raft state changes to WAL. The callback resets itself, if during the
- * write more changes appear.
- */
-static void
-raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events);
-
 void
 raft_process_recovery(const struct raft_request *req)
 {
@@ -348,6 +342,7 @@ raft_process_msg(const struct raft_request *req, uint32_t source)
 				break;
 			raft.state = RAFT_STATE_LEADER;
 			raft.leader = instance_id;
+			ev_timer_stop(loop(), &raft.timer);
 			break;
 		default:
 			unreachable();
@@ -374,6 +369,10 @@ raft_process_msg(const struct raft_request *req, uint32_t source)
 	/* New leader was elected. */
 	raft.state = RAFT_STATE_FOLLOWER;
 	raft.leader = source;
+	if (!raft.is_write_in_progress) {
+		ev_timer_stop(loop(), &raft.timer);
+		raft_sm_wait_leader_dead();
+	}
 end:
 	if (raft.state != old_state) {
 		/*
@@ -406,6 +405,12 @@ raft_process_heartbeat(uint32_t source)
 	/* Not interested in heartbeats from not a leader. */
 	if (raft.leader != source)
 		return;
+	/*
+	 * The instance currently is busy with writing something on disk. Can't
+	 * react to heartbeats.
+	 */
+	if (raft.is_write_in_progress)
+		return;
 	/*
 	 * XXX: it may be expensive to reset the timer like that. It may be less
 	 * expensive to let the timer work, and remember last timestamp when
@@ -473,10 +478,8 @@ fail:
 }
 
 static void
-raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events)
+raft_worker_handle_io(void)
 {
-	assert(watcher == &raft.io);
-	(void) events;
 	assert(raft.is_write_in_progress);
 	/* During write Raft can't be anything but a follower. */
 	assert(raft.state == RAFT_STATE_FOLLOWER);
@@ -488,7 +491,6 @@ raft_sm_dump_step(struct ev_loop *loop, struct ev_check *watcher, int events)
 	if (raft_is_fully_on_disk()) {
 end_dump:
 		raft.is_write_in_progress = false;
-		ev_check_stop(loop, watcher);
 		/*
 		 * The state machine is stable. Can see now, to what state to
 		 * go.
@@ -583,6 +585,25 @@ end_dump:
 		raft_broadcast(&req);
 }
 
+static int
+raft_worker_f(va_list args)
+{
+	(void)args;
+	while (!fiber_is_cancelled()) {
+		if (!raft.is_write_in_progress)
+			goto idle;
+		raft_worker_handle_io();
+		if (!raft.is_write_in_progress)
+			goto idle;
+		fiber_sleep(0);
+		continue;
+	idle:
+		assert(raft_is_fully_on_disk());
+		fiber_yield();
+	}
+	return 0;
+}
+
 static void
 raft_sm_pause_and_dump(void)
 {
@@ -590,8 +611,10 @@ raft_sm_pause_and_dump(void)
 	if (raft.is_write_in_progress)
 		return;
 	ev_timer_stop(loop(), &raft.timer);
-	ev_check_start(loop(), &raft.io);
 	raft.is_write_in_progress = true;
+	if (raft.worker == NULL)
+		raft.worker = fiber_new("raft_worker", raft_worker_f);
+	fiber_wakeup(raft.worker);
 }
 
 static void
@@ -620,7 +643,6 @@ raft_sm_schedule_new_election(void)
 {
 	assert(raft_is_fully_on_disk());
 	assert(raft.is_candidate);
-	assert(raft.leader == 0);
 	/* Everyone is a follower until its vote for self is persisted. */
 	raft_sm_schedule_new_term(raft.term + 1);
 	raft_sm_schedule_new_vote(instance_id);
@@ -641,20 +663,19 @@ static void
 raft_sm_wait_leader_dead(void)
 {
 	assert(!ev_is_active(&raft.timer));
-	assert(!ev_is_active(&raft.io));
 	assert(!raft.is_write_in_progress);
 	assert(raft.is_candidate);
 	assert(raft.state == RAFT_STATE_FOLLOWER);
 	assert(raft.leader != 0);
 	double death_timeout = replication_disconnect_timeout();
 	ev_timer_set(&raft.timer, death_timeout, death_timeout);
+	ev_timer_start(loop(), &raft.timer);
 }
 
 static void
 raft_sm_wait_election_end(void)
 {
 	assert(!ev_is_active(&raft.timer));
-	assert(!ev_is_active(&raft.io));
 	assert(!raft.is_write_in_progress);
 	assert(raft.is_candidate);
 	assert(raft.state == RAFT_STATE_FOLLOWER ||
@@ -664,13 +685,13 @@ raft_sm_wait_election_end(void)
 	double election_timeout = raft.election_timeout +
 				  raft_new_random_election_shift();
 	ev_timer_set(&raft.timer, election_timeout, election_timeout);
+	ev_timer_start(loop(), &raft.timer);
 }
 
 static void
 raft_sm_start(void)
 {
 	assert(!ev_is_active(&raft.timer));
-	assert(!ev_is_active(&raft.io));
 	assert(!raft.is_write_in_progress);
 	assert(!raft.is_enabled);
 	assert(raft.state == RAFT_STATE_FOLLOWER);
@@ -769,12 +790,13 @@ raft_cfg_election_timeout(double timeout)
 		return;
 
 	raft.election_timeout = timeout;
-	if (raft.vote != 0 && raft.leader == 0) {
+	if (raft.vote != 0 && raft.leader == 0 && raft.is_candidate) {
 		assert(ev_is_active(&raft.timer));
 		double timeout = ev_timer_remaining(loop(), &raft.timer) -
 				 raft.timer.at + raft.election_timeout;
 		ev_timer_stop(loop(), &raft.timer);
 		ev_timer_set(&raft.timer, timeout, timeout);
+		ev_timer_start(loop(), &raft.timer);
 	}
 }
 
@@ -808,6 +830,7 @@ raft_cfg_death_timeout(void)
 				 raft.timer.at + death_timeout;
 		ev_timer_stop(loop(), &raft.timer);
 		ev_timer_set(&raft.timer, timeout, timeout);
+		ev_timer_start(loop(), &raft.timer);
 	}
 }
 
@@ -826,5 +849,4 @@ void
 raft_init(void)
 {
 	ev_timer_init(&raft.timer, raft_sm_schedule_new_election_cb, 0, 0);
-	ev_check_init(&raft.io, raft_sm_dump_step);
 }
diff --git a/src/box/raft.h b/src/box/raft.h
index 111a9c16e..23aedfe10 100644
--- a/src/box/raft.h
+++ b/src/box/raft.h
@@ -65,6 +65,7 @@ extern "C" {
  * than the configured one. See more details in the code.
  */
 
+struct fiber;
 struct raft_request;
 struct vclock;
 
@@ -138,10 +139,8 @@ struct raft {
 	int vote_count;
 	/** State machine timed event trigger. */
 	struct ev_timer timer;
-	/**
-	 * Dump of Raft state in the end of event loop, when it is changed.
-	 */
-	struct ev_check io;
+	/** Worker fiber to execute blocking tasks like IO. */
+	struct fiber *worker;
 	/** Configured election timeout in seconds. */
 	double election_timeout;
 };
-- 
2.21.1 (Apple Git-122.3)



More information about the Tarantool-patches mailing list