[tarantool-patches] [PATCH 1/2] swim: prefer new events for dissemination over old

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sun Jun 30 02:45:02 MSK 2019


Before the patch old events were disseminated before new ones
until their TTD becomes 0. It was considered fair, and not much
important. But after some experiments with closed-source version
of SWIM it was found, that I was wrong as never.

Firstly, SWIM in the original paper explicitly says, that newer
events should be sent first. Secondly, it really has a goal. For
example, when a big cluster is just started, there is huge number
of events like 'a new member is added'. They consume the whole
UDP packet, and newer events like 'a member is dead' starve. In
fact, dissemination just doesn't work at start of a big cluster.

This patch make SWIM prefer newer events for dissemination.

Part of #4253
---
 src/lib/swim/swim.c   | 13 ++++++++++---
 test/unit/swim.c      | 44 ++++++++++++++++++++++++++++++++++++++++++-
 test/unit/swim.result |  7 ++++++-
 3 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
index bb9e9f519..8edf0495d 100644
--- a/src/lib/swim/swim.c
+++ b/src/lib/swim/swim.c
@@ -367,7 +367,7 @@ struct swim_member {
 	int payload_ttd;
 	/**
 	 * All created events are put into a queue sorted by event
-	 * time.
+	 * time in descending order.
 	 */
 	struct rlist in_dissemination_queue;
 	/**
@@ -608,13 +608,20 @@ swim_wait_ack(struct swim *swim, struct swim_member *member,
  * The status change itself occupies only 2 bytes in a packet, so
  * it is cheap to send it on any update, while does reduce
  * entropy.
+ *
+ * It is important to put new events in the head of the queue.
+ * Otherwise if there are lots of old events, new ones starve. It
+ * is especially notable when a big cluster is just started, and
+ * the whole UDP packet is occupied by events like 'a new member
+ * is added'. More important events like 'a member is dead' won't
+ * get a chance to be disseminated.
  */
 static inline void
 swim_register_event(struct swim *swim, struct swim_member *member)
 {
 	if (rlist_empty(&member->in_dissemination_queue)) {
-		rlist_add_tail_entry(&swim->dissemination_queue, member,
-				     in_dissemination_queue);
+		rlist_add_entry(&swim->dissemination_queue, member,
+				in_dissemination_queue);
 	}
 	member->status_ttd = mh_size(swim->members);
 	swim_cached_round_msg_invalidate(swim);
diff --git a/test/unit/swim.c b/test/unit/swim.c
index 3486d3f73..07cc0f06f 100644
--- a/test/unit/swim.c
+++ b/test/unit/swim.c
@@ -43,6 +43,7 @@
 #include "swim_test_utils.h"
 #include "trigger.h"
 #include <fcntl.h>
+#include <math.h>
 
 /**
  * Test result is a real returned value of main_f. Fiber_join can
@@ -1152,10 +1153,50 @@ swim_test_generation(void)
 	swim_finish_test();
 }
 
+static void
+swim_test_dissemination_speed(void)
+{
+	swim_start_test(1);
+
+	int size = 100;
+	double ack_timeout = 0.1;
+	struct swim_cluster *cluster = swim_cluster_new(size);
+	swim_cluster_set_ack_timeout(cluster, ack_timeout);
+	swim_cluster_set_gc(cluster, SWIM_GC_OFF);
+	for (int i = 0; i < size; ++i) {
+		for (int j = i + 1; j < size; ++j)
+			swim_cluster_interconnect(cluster, i, j);
+	}
+	swim_cluster_set_drop(cluster, 0, 100);
+	fail_if(swim_cluster_wait_status_anywhere(cluster, 0,
+						  MEMBER_DEAD, size) != 0);
+	/*
+	 * Not a trivial problem - at start of a cluster there are
+	 * so many events, that they occupy a UDP packet fully.
+	 * All these events are 'a new member is added'. And
+	 * because of that other much more important events
+	 * starve. In this concrete test a new event 'member is
+	 * dead' starves. To beat that problem SWIM says that new
+	 * events should be preferred. In such a case number of
+	 * old events does not matter anymore, and new events
+	 * disseminate in log. Usually this test works in log * 2,
+	 * log * 3 steps. Here it is log * 5 to avoid flakiness in
+	 * some extra rare and slow random cases, but to still
+	 * check for O(log) speed.
+	 */
+	is(swim_cluster_wait_status_everywhere(cluster, 0, MEMBER_DEAD,
+					       log2(size) * 5), 0,
+	   "dissemination work in log time even at the very start of a cluster");
+
+	swim_cluster_delete(cluster);
+
+	swim_finish_test();
+}
+
 static int
 main_f(va_list ap)
 {
-	swim_start_test(22);
+	swim_start_test(23);
 
 	(void) ap;
 	swim_test_ev_init();
@@ -1183,6 +1224,7 @@ main_f(va_list ap)
 	swim_test_slow_net();
 	swim_test_triggers();
 	swim_test_generation();
+	swim_test_dissemination_speed();
 
 	swim_test_transport_free();
 	swim_test_ev_free();
diff --git a/test/unit/swim.result b/test/unit/swim.result
index 04a2778e6..83b6bca64 100644
--- a/test/unit/swim.result
+++ b/test/unit/swim.result
@@ -1,5 +1,5 @@
 	*** main_f ***
-1..22
+1..23
 	*** swim_test_one_link ***
     1..6
     ok 1 - no rounds - no fullmesh
@@ -237,4 +237,9 @@ ok 21 - subtests
     ok 3 - S2 sees new generation of S1
 ok 22 - subtests
 	*** swim_test_generation: done ***
+	*** swim_test_dissemination_speed ***
+    1..1
+    ok 1 - dissemination work in log time even at the very start of a cluster
+ok 23 - subtests
+	*** swim_test_dissemination_speed: done ***
 	*** main_f: done ***
-- 
2.20.1 (Apple Git-117)





More information about the Tarantool-patches mailing list