[tarantool-patches] Re: [PATCH v3 2/6] [RAW] swim: introduce failure detection component

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Tue Jan 15 17:42:16 MSK 2019



On 09/01/2019 16:48, Konstantin Osipov wrote:
> * Vladislav Shpilevoy <v.shpilevoy at tarantool.org> [18/12/29 15:07]:
>>   enum {
>>   	/** How often to send membership messages and pings. */
>>   	HEARTBEAT_RATE_DEFAULT = 1,
>> +	/**
>> +	 * If a ping was sent, it is considered to be lost after
>> +	 * this time without an ack.
>> +	 */
>> +	ACK_TIMEOUT = 1,
> 
> The timeout should be configurable. A reasonable default looks
> more close 30 seconds at least - we have many cases in
> (malfunctioning) production when long requests stall the event
> loop for 10-15 seconds, such requests should not lead to
> membership storms.

Done.

diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
index 97395a3a9..1c0cc2cd4 100644
--- a/src/lib/swim/swim.c
+++ b/src/lib/swim/swim.c
@@ -96,7 +96,7 @@ enum {
  	 * If a ping was sent, it is considered to be lost after
  	 * this time without an ack.
  	 */
-	ACK_TIMEOUT = 1,
+	ACK_TIMEOUT_DEFAULT = 30,
  	/**
  	 * If a member has not been responding to pings this
  	 * number of times, it is considered to be dead.
@@ -765,7 +765,7 @@ swim_schedule_ping(struct swim *swim, struct swim_member *member)
  
  /**
   * Check for failed pings. A ping is failed if an ack was not
- * received during ACK_TIMEOUT. A failed ping is resent here.
+ * received during ACK timeout. A failed ping is resent here.
   */
  static void
  swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
@@ -775,10 +775,11 @@ swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
  	(void) events;
  	struct swim *swim = (struct swim *) p->data;
  	struct swim_member *m, *tmp;
+	double ack_timeout = swim->wait_ack_tick.interval;
  	double current_time = fiber_time();
  	rlist_foreach_entry_safe(m, &swim->queue_wait_ack, in_queue_wait_ack,
  				 tmp) {
-		if (current_time - m->ping_ts < ACK_TIMEOUT)
+		if (current_time - m->ping_ts < ack_timeout)
  			break;
  		++m->failed_pings;
  		if (m->failed_pings >= NO_ACKS_TO_GC) {
@@ -1113,7 +1114,7 @@ swim_new(const struct swim_transport_vtab *transport_vtab)
  	/* Failure detection component. */
  	rlist_create(&swim->queue_wait_ack);
  	ev_init(&swim->wait_ack_tick, swim_check_acks);
-	ev_periodic_set(&swim->wait_ack_tick, 0, ACK_TIMEOUT, NULL);
+	ev_periodic_set(&swim->wait_ack_tick, 0, ACK_TIMEOUT_DEFAULT, NULL);
  	swim->wait_ack_tick.data = (void *) swim;
  	return swim;
  }
@@ -1133,7 +1134,8 @@ swim_uri_to_addr(const char *uri, struct sockaddr_in *addr)
  }
  
  int
-swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate)
+swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate,
+	 double ack_timeout)
  {
  	struct sockaddr_in addr;
  	if (swim_uri_to_addr(uri, &addr) != 0)
@@ -1154,6 +1156,9 @@ swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate)
  	if (swim->round_tick.interval != heartbeat_rate && heartbeat_rate > 0)
  		ev_periodic_set(&swim->round_tick, 0, heartbeat_rate, NULL);
  
+	if (swim->wait_ack_tick.interval != ack_timeout && ack_timeout > 0)
+		ev_periodic_set(&swim->wait_ack_tick, 0, ack_timeout, NULL);
+
  	swim->self = new_self;
  	return 0;
  }
diff --git a/src/lib/swim/swim.h b/src/lib/swim/swim.h
index b54fc47b2..001900311 100644
--- a/src/lib/swim/swim.h
+++ b/src/lib/swim/swim.h
@@ -56,12 +56,15 @@ swim_new(const struct swim_transport_vtab *transport_vtab);
   *        @heartbeat_rate seconds. It is rather the protocol
   *        speed. Protocol period depends on member count and
   *        @heartbeat_rate.
+ * @param ack_timeout Time in seconds after which a ping is
+ *        considered to be unacknowledged.
   *
   * @retval 0 Success.
   * @retval -1 Error.
   */
  int
-swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate);
+swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate,
+	 double ack_timeout);
  
  /**
   * Stop listening and broadcasting messages, cleanup all internal
diff --git a/src/lua/swim.c b/src/lua/swim.c
index 6a78e5dd5..8b6f0b0b0 100644
--- a/src/lua/swim.c
+++ b/src/lua/swim.c
@@ -75,6 +75,28 @@ lua_swim_gc(struct lua_State *L)
  	return 0;
  }
  
+static inline double
+lua_swim_get_timeout_field(struct lua_State *L, int ncfg,
+			   const char *fieldname, const char *funcname)
+{
+	double timeout;
+	lua_getfield(L, ncfg, fieldname);
+	if (lua_isnumber(L, -1)) {
+		timeout = lua_tonumber(L, -1);
+		if (timeout <= 0) {
+			return luaL_error(L, "swim.%s: %s should be positive "\
+					  "number", funcname, fieldname);
+		}
+	} else if (! lua_isnil(L, -1)) {
+		return luaL_error(L, "swim.%s: %s should be positive number",
+				  funcname, fieldname);
+	} else {
+		timeout = -1;
+	}
+	lua_pop(L, 1);
+	return timeout;
+}
+
  /**
   * Configure @a swim instance using a table stored in @a ncfg-th
   * position on Lua stack.
@@ -106,23 +128,12 @@ lua_swim_cfg_impl(struct lua_State *L, int ncfg, struct swim *swim,
  	}
  	lua_pop(L, 1);
  
-	double heartbeat_rate;
-	lua_getfield(L, ncfg, "heartbeat");
-	if (lua_isnumber(L, -1)) {
-		heartbeat_rate = lua_tonumber(L, -1);
-		if (heartbeat_rate <= 0) {
-			return luaL_error(L, "swim.%s: heartbeat should be "\
-					  "positive number", funcname);
-		}
-	} else if (! lua_isnil(L, -1)) {
-		return luaL_error(L, "swim.%s: heartbeat should be positive "\
-				  "number", funcname);
-	} else {
-		heartbeat_rate = -1;
-	}
-	lua_pop(L, 1);
+	double heartbeat_rate =
+		lua_swim_get_timeout_field(L, ncfg, "heartbeat", funcname);
+	double ack_timeout =
+		lua_swim_get_timeout_field(L, ncfg, "ack_timeout", funcname);
  
-	return swim_cfg(swim, server_uri, heartbeat_rate);
+	return swim_cfg(swim, server_uri, heartbeat_rate, ack_timeout);
  }
  
  static int

> 
>> +	/**
>> +	 * If a member has not been responding to pings this
>> +	 * number of times, it is considered to be dead.
>> +	 */
>> +	NO_ACKS_TO_DEAD = 3,
>> +	/**
>> +	 * If a not pinned member confirmed to be dead, it is
>> +	 * removed from the membership after at least this number
>> +	 * of failed pings.
>> +	 */
>> +	NO_ACKS_TO_GC = NO_ACKS_TO_DEAD + 2,
>>   };
>>   
>> +	bool is_pinned;
>> +	/** Growing number to refute old messages. */
> 
> reject, or perhaps ignore?
> 
> refute is usually used for arguments in a heated conversation.

No, exactly refute. Reject/ignore are almost the same in this
context and mean 'do nothing'. Refute means, according to Oxford
Dictionary:

     "Deny or contradict (a statement or accusation). Prove
      (a statement or theory) to be wrong or false; disprove."

Here the swim instance refutes incorrect assumptions about its
own incarnation emitted by other cluster members.

> 
>> +	/**
>> +	 * How many pings did not receive an ack in a row. After
>> +	 * a threshold the instance is marked as dead. After more
>> +	 * it is removed from the table (if not pinned).
>> +	 */
>> +	int failed_pings;
> 
> These are more like unacknowledged pings. Have they failed? Maybe.

Done.

diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
index 1c0cc2cd4..75f57f69f 100644
--- a/src/lib/swim/swim.c
+++ b/src/lib/swim/swim.c
@@ -105,7 +105,7 @@ enum {
  	/**
  	 * If a not pinned member confirmed to be dead, it is
  	 * removed from the membership after at least this number
-	 * of failed pings.
+	 * of unacknowledged pings.
  	 */
  	NO_ACKS_TO_GC = NO_ACKS_TO_DEAD + 2,
  };
@@ -180,7 +180,7 @@ struct swim_member {
  	 * a threshold the instance is marked as dead. After more
  	 * it is removed from the table (if not pinned).
  	 */
-	int failed_pings;
+	int unacknowledged_pings;
  	/** When the latest ping was sent to this member. */
  	double ping_ts;
  	/** Ready at hand regular ACK task. */
@@ -764,8 +764,9 @@ swim_schedule_ping(struct swim *swim, struct swim_member *member)
  }
  
  /**
- * Check for failed pings. A ping is failed if an ack was not
- * received during ACK timeout. A failed ping is resent here.
+ * Check for unacknowledged pings. A ping is unacknowledged if an
+ * ack was not received during ACK timeout. An unacknowledged ping
+ * is resent here.
   */
  static void
  swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
@@ -781,13 +782,13 @@ swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
  				 tmp) {
  		if (current_time - m->ping_ts < ack_timeout)
  			break;
-		++m->failed_pings;
-		if (m->failed_pings >= NO_ACKS_TO_GC) {
+		++m->unacknowledged_pings;
+		if (m->unacknowledged_pings >= NO_ACKS_TO_GC) {
  			if (!m->is_pinned)
  				swim_member_delete(swim, m);
  			continue;
  		}
-		if (m->failed_pings >= NO_ACKS_TO_DEAD)
+		if (m->unacknowledged_pings >= NO_ACKS_TO_DEAD)
  			m->status = MEMBER_DEAD;
  		swim_schedule_ping(swim, m);
  		rlist_del_entry(m, in_queue_wait_ack);
@@ -1040,7 +1041,7 @@ swim_process_failure_detection(struct swim *swim, const char **pos,
  	} else {
  		assert(type == SWIM_FD_MSG_ACK);
  		if (incarnation >= sender->incarnation) {
-			sender->failed_pings = 0;
+			sender->unacknowledged_pings = 0;
  			rlist_del_entry(sender, in_queue_wait_ack);
  		}
  	}

> 
>> +	/** When the latest ping was sent to this member. */
>> +	double ping_ts;
> 
> last_ping_time? Why use double and not ev_time_t?

I did not use ev_time_t since fiber_time() returns double and
ping_ts is initialized from fiber_time(), so I'll keep double.

The name is changed.

diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
index 75f57f69f..3e1b26356 100644
--- a/src/lib/swim/swim.c
+++ b/src/lib/swim/swim.c
@@ -182,7 +182,7 @@ struct swim_member {
  	 */
  	int unacknowledged_pings;
  	/** When the latest ping was sent to this member. */
-	double ping_ts;
+	double last_ping_time;
  	/** Ready at hand regular ACK task. */
  	struct swim_task ack_task;
  	struct swim_task ping_task;
@@ -335,7 +335,7 @@ static void
  swim_member_schedule_ack_wait(struct swim *swim, struct swim_member *member)
  {
  	if (rlist_empty(&member->in_queue_wait_ack)) {
-		member->ping_ts = fiber_time();
+		member->last_ping_time = fiber_time();
  		rlist_add_tail_entry(&swim->queue_wait_ack, member,
  				     in_queue_wait_ack);
  	}
@@ -780,7 +780,7 @@ swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
  	double current_time = fiber_time();
  	rlist_foreach_entry_safe(m, &swim->queue_wait_ack, in_queue_wait_ack,
  				 tmp) {
-		if (current_time - m->ping_ts < ack_timeout)
+		if (current_time - m->last_ping_time < ack_timeout)
  			break;
  		++m->unacknowledged_pings;
  		if (m->unacknowledged_pings >= NO_ACKS_TO_GC) {

> 
>> +	/** Type of the failure detection message: ping or ack. */
>> +	SWIM_FD_MSG_TYPE,
>> +	/**
>> +	 * Incarnation of the sender. To make the member alive if
>> +	 * it was considered to be dead, but ping/ack with greater
>> +	 * incarnation was received from it.
>> +	 */
>> +	SWIM_FD_INCARNATION,
> 
> Uhm, FD is too commonly used for a file descriptor. Please use a
> different name. Why not simply SWIM_PING and SWIM_ACK?

Because I do not want to put all protocol constants into a single
namespace, starting with SWIM_, so I used SWIM_FD_, SWIM_MEMBER_ etc.
I tried to use SWIM_FAILURE_DETECTION_, but it is too long.

Currently I have these names with 'fd': enum swim_fd_key,
enum swim_fd_msg_type, const char *swim_fd_msg_type_strs[],
struct swim_fd_header_bin.

How about expanding 'fd' to 'fail_det', 'fail_detec'?

Also, I could remove 'fd' from enum constants, but then it
would be harder to determine to which component a constant
belongs.

Please, choose one of new suffixes, or approve removal of
them at all.

> 
>>   struct swim_transport swim_udp_transport = {
>>   	/* .send_round_msg = */ swim_udp_send_msg,
>> +	/* .send_ping = */ swim_udp_send_msg,
>> +	/* .send_ack = */ swim_udp_send_msg,
> 
> Why do you need a separate transport api for ack/ping sends?
> Shouldn't send/recv be enough? This is a transport layer, it
> should be unaware of protocol details.
> 
> 

Because this is what you asked me explicitly for. But honestly I
prefer send/recv, without explicit send_ping/ack etc. So I fixed it
in the first commit.



More information about the Tarantool-patches mailing list