Tarantool development patches archive
 help / color / mirror / Atom feed
* [Tarantool-patches] [PATCH 1/1] raft: do not rely on just ev_is_active for timers
@ 2022-02-10  1:09 Vladislav Shpilevoy via Tarantool-patches
  2022-02-10 14:25 ` Serge Petrenko via Tarantool-patches
  0 siblings, 1 reply; 3+ messages in thread
From: Vladislav Shpilevoy via Tarantool-patches @ 2022-02-10  1:09 UTC (permalink / raw)
  To: tarantool-patches, sergepetrenko

When timer has 0 timeout and 0 repeat, during timers preparation
libev makes them inactive right away but puts into the array of
pending events.

Therefore to check if a timer is really active (will be executed
in the future) need to look both at ev_is_active() and
ev_is_pending().

It could happen only during split-vote, because all the other
places use election_timeout + random shift, while split vote uses
just random shift - it can be 0.

The patch makes raft do that for all timer checks. Also to make
the testing reliable and not depend on random the shift factor now
is configurable. For the test it is set to 0.

Closes #6847

NO_DOC=Bugfix
NO_CHANGELOG=Bug was not released
---
Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-6847-split-vote-crash
Issue: https://github.com/tarantool/tarantool/issues/6847

 src/lib/fakesys/fakeev.c    | 15 +++++++++++++--
 src/lib/raft/raft.c         | 33 ++++++++++++++++-----------------
 src/lib/raft/raft.h         |  9 +++++++++
 src/lib/raft/raft_ev.h      |  5 +++++
 test/unit/raft.c            | 33 ++++++++++++++++++++++++++++++++-
 test/unit/raft.result       |  8 +++++++-
 test/unit/raft_test_utils.c | 12 ++++++++++++
 test/unit/raft_test_utils.h |  4 ++++
 8 files changed, 98 insertions(+), 21 deletions(-)

diff --git a/src/lib/fakesys/fakeev.c b/src/lib/fakesys/fakeev.c
index 333c1646f..00163e57d 100644
--- a/src/lib/fakesys/fakeev.c
+++ b/src/lib/fakesys/fakeev.c
@@ -184,8 +184,9 @@ fakeev_timer_event_delete(struct fakeev_event *e)
 {
 	assert(e->type == FAKEEV_EVENT_TIMER);
 	struct fakeev_timer_event *te = (struct fakeev_timer_event *)e;
-	assert(te->watcher->active == 1);
+	assert(te->watcher->active == 1 || te->watcher->pending == 1);
 	te->watcher->active = 0;
+	te->watcher->pending = 0;
 	mh_int_t rc = mh_i64ptr_find(events_hash, (uint64_t) te->watcher, NULL);
 	assert(rc != mh_end(events_hash));
 	mh_i64ptr_del(events_hash, rc, NULL);
@@ -206,8 +207,11 @@ fakeev_timer_event_process(struct fakeev_event *e, struct ev_loop *loop)
 	struct ev_timer *t = (struct ev_timer *) w;
 	fakeev_timer_event_delete(e);
 	t->at = 0;
+	w->pending = 0;
 	if (t->repeat > 0)
 		fakeev_timer_event_new(w, t->repeat);
+	else
+		w->active = 0;
 	ev_invoke(loop, w, EV_TIMER);
 }
 
@@ -215,13 +219,20 @@ static void
 fakeev_timer_event_new(struct ev_watcher *watcher, double delay)
 {
 	assert(watcher->active == 0);
-	watcher->active = 1;
 	struct fakeev_timer_event *e = malloc(sizeof(*e));
 	assert(e != NULL);
 	fakeev_event_create(&e->base, FAKEEV_EVENT_TIMER, delay,
 			    fakeev_timer_event_process,
 			    fakeev_timer_event_delete);
 	e->watcher = watcher;
+	/*
+	 * In libev 0 timeout without repeat makes the timer not
+	 * active but just pending. To execute only once.
+	 */
+	if (delay > 0)
+		watcher->active = 1;
+	else
+		watcher->pending = 1;
 	assert(fakeev_event_by_ev(watcher) == NULL);
 	struct mh_i64ptr_node_t node = {(uint64_t) watcher, e};
 	mh_i64ptr_put(events_hash, &node, NULL, NULL);
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 89cfb3c17..f4ca67439 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -34,11 +34,6 @@
 #include "fiber.h"
 #include "tt_static.h"
 
-/**
- * Maximal random deviation of the election timeout. From the configured value.
- */
-#define RAFT_RANDOM_ELECTION_FACTOR 0.1
-
 /**
  * When decoding we should never trust that there is
  * a valid data incomes.
@@ -124,10 +119,7 @@ raft_new_random_election_shift(const struct raft *raft)
 {
 	double timeout = raft->election_timeout;
 	/* Translate to ms. Integer is needed to be able to use mod below. */
-	uint32_t rand_part =
-		(uint32_t)(timeout * RAFT_RANDOM_ELECTION_FACTOR * 1000);
-	if (rand_part == 0)
-		rand_part = 1;
+	uint32_t rand_part = (uint32_t)(timeout * raft->max_shift * 1000);
 	/*
 	 * XXX: this is not giving a good distribution, but it is not so trivial
 	 * to implement a correct random value generator. There is a task to
@@ -552,7 +544,7 @@ raft_process_heartbeat(struct raft *raft, uint32_t source)
 	 * anything was heard from the leader. Then in the timer callback check
 	 * the timestamp, and restart the timer, if it is fine.
 	 */
-	assert(raft_ev_is_active(&raft->timer));
+	assert(raft_ev_timer_is_active(&raft->timer));
 	raft_ev_timer_stop(raft_loop(), &raft->timer);
 	raft_sm_wait_leader_dead(raft);
 }
@@ -795,7 +787,7 @@ raft_sm_schedule_new_election_cb(struct ev_loop *loop, struct ev_timer *timer,
 static void
 raft_sm_wait_leader_dead(struct raft *raft)
 {
-	assert(!raft_ev_is_active(&raft->timer));
+	assert(!raft_ev_timer_is_active(&raft->timer));
 	assert(!raft->is_write_in_progress);
 	assert(raft->is_candidate);
 	assert(raft->state == RAFT_STATE_FOLLOWER);
@@ -807,7 +799,7 @@ raft_sm_wait_leader_dead(struct raft *raft)
 static void
 raft_sm_wait_leader_found(struct raft *raft)
 {
-	assert(!raft_ev_is_active(&raft->timer));
+	assert(!raft_ev_timer_is_active(&raft->timer));
 	assert(!raft->is_write_in_progress);
 	assert(raft->is_candidate);
 	assert(raft->state == RAFT_STATE_FOLLOWER);
@@ -819,7 +811,7 @@ raft_sm_wait_leader_found(struct raft *raft)
 static void
 raft_sm_wait_election_end(struct raft *raft)
 {
-	assert(!raft_ev_is_active(&raft->timer));
+	assert(!raft_ev_timer_is_active(&raft->timer));
 	assert(!raft->is_write_in_progress);
 	assert(raft->is_candidate);
 	assert(raft->state == RAFT_STATE_FOLLOWER ||
@@ -841,7 +833,7 @@ static void
 raft_sm_start(struct raft *raft)
 {
 	say_info("RAFT: start state machine");
-	assert(!raft_ev_is_active(&raft->timer));
+	assert(!raft_ev_timer_is_active(&raft->timer));
 	assert(!raft->is_enabled);
 	assert(raft->state == RAFT_STATE_FOLLOWER);
 	raft->is_enabled = true;
@@ -1022,7 +1014,7 @@ raft_cfg_election_timeout(struct raft *raft, double timeout)
 	    raft->is_write_in_progress)
 		return;
 
-	assert(raft_ev_is_active(&raft->timer));
+	assert(raft_ev_timer_is_active(&raft->timer));
 	struct ev_loop *loop = raft_loop();
 	timeout += raft_ev_timer_remaining(loop, &raft->timer) - old_timeout;
 	if (timeout < 0)
@@ -1057,7 +1049,7 @@ raft_cfg_death_timeout(struct raft *raft, double timeout)
 	    raft->leader == 0)
 		return;
 
-	assert(raft_ev_is_active(&raft->timer));
+	assert(raft_ev_timer_is_active(&raft->timer));
 	struct ev_loop *loop = raft_loop();
 	timeout += raft_ev_timer_remaining(loop, &raft->timer) - old_timeout;
 	if (timeout < 0)
@@ -1067,6 +1059,12 @@ raft_cfg_death_timeout(struct raft *raft, double timeout)
 	raft_ev_timer_start(loop, &raft->timer);
 }
 
+void
+raft_cfg_max_shift(struct raft *raft, double shift)
+{
+	raft->max_shift = shift;
+}
+
 void
 raft_cfg_instance_id(struct raft *raft, uint32_t instance_id)
 {
@@ -1138,7 +1136,7 @@ raft_check_split_vote(struct raft *raft)
 		return;
 	if (!raft_has_split_vote(raft))
 		return;
-	assert(raft_ev_is_active(&raft->timer));
+	assert(raft_ev_timer_is_active(&raft->timer));
 	/*
 	 * Could be already detected before. The timeout would be updated by now
 	 * then.
@@ -1175,6 +1173,7 @@ raft_create(struct raft *raft, const struct raft_vtab *vtab)
 		.election_quorum = 1,
 		.election_timeout = 5,
 		.death_timeout = 5,
+		.max_shift = 0.1,
 		.cluster_size = VCLOCK_MAX,
 		.vtab = vtab,
 	};
diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h
index 05e373254..8935d2a82 100644
--- a/src/lib/raft/raft.h
+++ b/src/lib/raft/raft.h
@@ -223,6 +223,8 @@ struct raft {
 	 * elections can be started.
 	 */
 	double death_timeout;
+	/** Maximal deviation from the election timeout. */
+	double max_shift;
 	/** Number of instances registered in the cluster. */
 	int cluster_size;
 	/** Virtual table to perform application-specific actions. */
@@ -325,6 +327,13 @@ raft_cfg_election_quorum(struct raft *raft, int election_quorum);
 void
 raft_cfg_death_timeout(struct raft *raft, double timeout);
 
+/**
+ * Configure maximal random shift from the election timeout. The timeout during
+ * election is randomized as cfg_timeout * (1 + shift).
+ */
+void
+raft_cfg_max_shift(struct raft *raft, double shift);
+
 /**
  * Configure ID of the given Raft instance. The ID can't be changed after it is
  * assigned first time.
diff --git a/src/lib/raft/raft_ev.h b/src/lib/raft/raft_ev.h
index cfa663743..01f069b79 100644
--- a/src/lib/raft/raft_ev.h
+++ b/src/lib/raft/raft_ev.h
@@ -52,6 +52,11 @@ raft_loop(void);
 
 #define raft_ev_is_active ev_is_active
 
+#define raft_ev_is_pending ev_is_pending
+
 #define raft_ev_timer_init ev_timer_init
 
 #define raft_ev_timer_set ev_timer_set
+
+#define raft_ev_timer_is_active(w)						\
+	(raft_ev_is_active(w) || raft_ev_is_pending(w))
diff --git a/test/unit/raft.c b/test/unit/raft.c
index 19144d632..41ad8c632 100644
--- a/test/unit/raft.c
+++ b/test/unit/raft.c
@@ -1565,7 +1565,7 @@ raft_test_bump_term_before_cfg()
 static void
 raft_test_split_vote(void)
 {
-	raft_start_test(58);
+	raft_start_test(64);
 	struct raft_node node;
 	raft_node_create(&node);
 
@@ -1856,6 +1856,37 @@ raft_test_split_vote(void)
 	is(node.raft.term, 3, "bump term");
 	is(node.raft.vote, 1, "vote for self");
 
+	/*
+	 * Split vote can make delay to next election 0. Timer with 0 timeout
+	 * has a special state in libev. Another vote can come on the next
+	 * even loop iteration just before the timer is triggered. It should be
+	 * ready to the special state of the timer.
+	 */
+	raft_node_destroy(&node);
+	raft_node_create(&node);
+	raft_node_cfg_cluster_size(&node, 3);
+	raft_node_cfg_election_quorum(&node, 3);
+	raft_node_cfg_max_shift(&node, 0);
+
+	raft_run_next_event();
+	is(node.raft.term, 2, "bump term");
+	is(node.raft.vote, 1, "vote for self");
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		2 /* Vote. */,
+		2 /* Source. */
+	), 0, "vote response for 2 from 2");
+
+	is(node.raft.timer.repeat, 0, "planned new election after yield");
+
+	is(raft_node_send_vote_response(&node,
+		2 /* Term. */,
+		3 /* Vote. */,
+		3 /* Source. */
+	), 0, "vote response for 3 from 3");
+
+	is(node.raft.timer.repeat, 0, "still waiting for yield");
+
 	raft_node_destroy(&node);
 	raft_finish_test();
 }
diff --git a/test/unit/raft.result b/test/unit/raft.result
index a4057acee..07349e4e0 100644
--- a/test/unit/raft.result
+++ b/test/unit/raft.result
@@ -273,7 +273,7 @@ ok 14 - subtests
 ok 15 - subtests
 	*** raft_test_bump_term_before_cfg: done ***
 	*** raft_test_split_vote ***
-    1..58
+    1..64
     ok 1 - elections with a new term
     ok 2 - vote response for 1 from 2
     ok 3 - vote response for 3 from 3
@@ -332,6 +332,12 @@ ok 15 - subtests
     ok 56 - split vote after quorum increase
     ok 57 - bump term
     ok 58 - vote for self
+    ok 59 - bump term
+    ok 60 - vote for self
+    ok 61 - vote response for 2 from 2
+    ok 62 - planned new election after yield
+    ok 63 - vote response for 3 from 3
+    ok 64 - still waiting for yield
 ok 16 - subtests
 	*** raft_test_split_vote: done ***
 	*** main_f: done ***
diff --git a/test/unit/raft_test_utils.c b/test/unit/raft_test_utils.c
index d7e50984c..7fd945185 100644
--- a/test/unit/raft_test_utils.c
+++ b/test/unit/raft_test_utils.c
@@ -193,6 +193,7 @@ raft_node_create(struct raft_node *node)
 	node->cfg_election_timeout = 5;
 	node->cfg_election_quorum = 3;
 	node->cfg_death_timeout = 5;
+	node->cfg_max_shift = 0.1;
 	node->cfg_instance_id = 1;
 	node->cfg_cluster_size = VCLOCK_MAX;
 	node->cfg_vclock = &node->journal.vclock;
@@ -384,6 +385,7 @@ raft_node_cfg(struct raft_node *node)
 	raft_cfg_election_timeout(&node->raft, node->cfg_election_timeout);
 	raft_cfg_election_quorum(&node->raft, node->cfg_election_quorum);
 	raft_cfg_death_timeout(&node->raft, node->cfg_death_timeout);
+	raft_cfg_max_shift(&node->raft, node->cfg_max_shift);
 	raft_cfg_instance_id(&node->raft, node->cfg_instance_id);
 	raft_cfg_cluster_size(&node->raft, node->cfg_cluster_size);
 	raft_cfg_vclock(&node->raft, node->cfg_vclock);
@@ -484,6 +486,16 @@ raft_node_cfg_death_timeout(struct raft_node *node, double value)
 	}
 }
 
+void
+raft_node_cfg_max_shift(struct raft_node *node, double value)
+{
+	node->cfg_max_shift = value;
+	if (raft_node_is_started(node)) {
+		raft_cfg_max_shift(&node->raft, value);
+		raft_run_async_work();
+	}
+}
+
 bool
 raft_msg_check(const struct raft_msg *msg, enum raft_state state, uint64_t term,
 	       uint32_t vote, const char *vclock)
diff --git a/test/unit/raft_test_utils.h b/test/unit/raft_test_utils.h
index 8eab41490..7fd17f16a 100644
--- a/test/unit/raft_test_utils.h
+++ b/test/unit/raft_test_utils.h
@@ -105,6 +105,7 @@ struct raft_node {
 	double cfg_election_timeout;
 	int cfg_election_quorum;
 	double cfg_death_timeout;
+	double cfg_max_shift;
 	uint32_t cfg_instance_id;
 	int cfg_cluster_size;
 	struct vclock *cfg_vclock;
@@ -249,6 +250,9 @@ raft_node_cfg_election_quorum(struct raft_node *node, int value);
 void
 raft_node_cfg_death_timeout(struct raft_node *node, double value);
 
+void
+raft_node_cfg_max_shift(struct raft_node *node, double value);
+
 /** Check that @a msg message matches the given arguments. */
 bool
 raft_msg_check(const struct raft_msg *msg, enum raft_state state, uint64_t term,
-- 
2.24.3 (Apple Git-128)


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-02-10 21:19 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-10  1:09 [Tarantool-patches] [PATCH 1/1] raft: do not rely on just ev_is_active for timers Vladislav Shpilevoy via Tarantool-patches
2022-02-10 14:25 ` Serge Petrenko via Tarantool-patches
2022-02-10 21:19   ` Vladislav Shpilevoy via Tarantool-patches

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox