[tarantool-patches] Re: [PATCH 1/2] swim: prefer new events for dissemination over old

Sun Jun 30 19:25:01 MSK 2019

This commit is dropped. It was discussed verbally, that it would
be better to implement more smart priority queue for events.

On 30/06/2019 01:45, Vladislav Shpilevoy wrote:
> Before the patch old events were disseminated before new ones
> until their TTD becomes 0. It was considered fair, and not much
> important. But after some experiments with closed-source version
> of SWIM it was found, that I was wrong as never.
> 
> Firstly, SWIM in the original paper explicitly says, that newer
> events should be sent first. Secondly, it really has a goal. For
> example, when a big cluster is just started, there is huge number
> of events like 'a new member is added'. They consume the whole
> UDP packet, and newer events like 'a member is dead' starve. In
> fact, dissemination just doesn't work at start of a big cluster.
> 
> This patch make SWIM prefer newer events for dissemination.
> 
> Part of #4253
> ---
>  src/lib/swim/swim.c   | 13 ++++++++++---
>  test/unit/swim.c      | 44 ++++++++++++++++++++++++++++++++++++++++++-
>  test/unit/swim.result |  7 ++++++-
>  3 files changed, 59 insertions(+), 5 deletions(-)
> 
> diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
> index bb9e9f519..8edf0495d 100644
> --- a/src/lib/swim/swim.c
> +++ b/src/lib/swim/swim.c
> @@ -367,7 +367,7 @@ struct swim_member {
>  	int payload_ttd;
>  	/**
>  	 * All created events are put into a queue sorted by event
> -	 * time.
> +	 * time in descending order.
>  	 */
>  	struct rlist in_dissemination_queue;
>  	/**
> @@ -608,13 +608,20 @@ swim_wait_ack(struct swim *swim, struct swim_member *member,
>   * The status change itself occupies only 2 bytes in a packet, so
>   * it is cheap to send it on any update, while does reduce
>   * entropy.
> + *
> + * It is important to put new events in the head of the queue.
> + * Otherwise if there are lots of old events, new ones starve. It
> + * is especially notable when a big cluster is just started, and
> + * the whole UDP packet is occupied by events like 'a new member
> + * is added'. More important events like 'a member is dead' won't
> + * get a chance to be disseminated.
>   */
>  static inline void
>  swim_register_event(struct swim *swim, struct swim_member *member)
>  {
>  	if (rlist_empty(&member->in_dissemination_queue)) {
> -		rlist_add_tail_entry(&swim->dissemination_queue, member,
> -				     in_dissemination_queue);
> +		rlist_add_entry(&swim->dissemination_queue, member,
> +				in_dissemination_queue);
>  	}
>  	member->status_ttd = mh_size(swim->members);
>  	swim_cached_round_msg_invalidate(swim);
> diff --git a/test/unit/swim.c b/test/unit/swim.c
> index 3486d3f73..07cc0f06f 100644
> --- a/test/unit/swim.c
> +++ b/test/unit/swim.c
> @@ -43,6 +43,7 @@
>  #include "swim_test_utils.h"
>  #include "trigger.h"
>  #include <fcntl.h>
> +#include <math.h>
>  
>  /**
>   * Test result is a real returned value of main_f. Fiber_join can
> @@ -1152,10 +1153,50 @@ swim_test_generation(void)
>  	swim_finish_test();
>  }
>  
> +static void
> +swim_test_dissemination_speed(void)
> +{
> +	swim_start_test(1);
> +
> +	int size = 100;
> +	double ack_timeout = 0.1;
> +	struct swim_cluster *cluster = swim_cluster_new(size);
> +	swim_cluster_set_ack_timeout(cluster, ack_timeout);
> +	swim_cluster_set_gc(cluster, SWIM_GC_OFF);
> +	for (int i = 0; i < size; ++i) {
> +		for (int j = i + 1; j < size; ++j)
> +			swim_cluster_interconnect(cluster, i, j);
> +	}
> +	swim_cluster_set_drop(cluster, 0, 100);
> +	fail_if(swim_cluster_wait_status_anywhere(cluster, 0,
> +						  MEMBER_DEAD, size) != 0);
> +	/*
> +	 * Not a trivial problem - at start of a cluster there are
> +	 * so many events, that they occupy a UDP packet fully.
> +	 * All these events are 'a new member is added'. And
> +	 * because of that other much more important events
> +	 * starve. In this concrete test a new event 'member is
> +	 * dead' starves. To beat that problem SWIM says that new
> +	 * events should be preferred. In such a case number of
> +	 * old events does not matter anymore, and new events
> +	 * disseminate in log. Usually this test works in log * 2,
> +	 * log * 3 steps. Here it is log * 5 to avoid flakiness in
> +	 * some extra rare and slow random cases, but to still
> +	 * check for O(log) speed.
> +	 */
> +	is(swim_cluster_wait_status_everywhere(cluster, 0, MEMBER_DEAD,
> +					       log2(size) * 5), 0,
> +	   "dissemination work in log time even at the very start of a cluster");
> +
> +	swim_cluster_delete(cluster);
> +
> +	swim_finish_test();
> +}
> +
>  static int
>  main_f(va_list ap)
>  {
> -	swim_start_test(22);
> +	swim_start_test(23);
>  
>  	(void) ap;
>  	swim_test_ev_init();
> @@ -1183,6 +1224,7 @@ main_f(va_list ap)
>  	swim_test_slow_net();
>  	swim_test_triggers();
>  	swim_test_generation();
> +	swim_test_dissemination_speed();
>  
>  	swim_test_transport_free();
>  	swim_test_ev_free();
> diff --git a/test/unit/swim.result b/test/unit/swim.result
> index 04a2778e6..83b6bca64 100644
> --- a/test/unit/swim.result
> +++ b/test/unit/swim.result
> @@ -1,5 +1,5 @@
>  	*** main_f ***
> -1..22
> +1..23
>  	*** swim_test_one_link ***
>      1..6
>      ok 1 - no rounds - no fullmesh
> @@ -237,4 +237,9 @@ ok 21 - subtests
>      ok 3 - S2 sees new generation of S1
>  ok 22 - subtests
>  	*** swim_test_generation: done ***
> +	*** swim_test_dissemination_speed ***
> +    1..1
> +    ok 1 - dissemination work in log time even at the very start of a cluster
> +ok 23 - subtests
> +	*** swim_test_dissemination_speed: done ***
>  	*** main_f: done ***
>