[tarantool-patches] Re: [PATCH v3 2/6] [RAW] swim: introduce failure detection component
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Tue Jan 15 17:42:16 MSK 2019
On 09/01/2019 16:48, Konstantin Osipov wrote:
> * Vladislav Shpilevoy <v.shpilevoy at tarantool.org> [18/12/29 15:07]:
>> enum {
>> /** How often to send membership messages and pings. */
>> HEARTBEAT_RATE_DEFAULT = 1,
>> + /**
>> + * If a ping was sent, it is considered to be lost after
>> + * this time without an ack.
>> + */
>> + ACK_TIMEOUT = 1,
>
> The timeout should be configurable. A reasonable default looks
> more close 30 seconds at least - we have many cases in
> (malfunctioning) production when long requests stall the event
> loop for 10-15 seconds, such requests should not lead to
> membership storms.
Done.
diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
index 97395a3a9..1c0cc2cd4 100644
--- a/src/lib/swim/swim.c
+++ b/src/lib/swim/swim.c
@@ -96,7 +96,7 @@ enum {
* If a ping was sent, it is considered to be lost after
* this time without an ack.
*/
- ACK_TIMEOUT = 1,
+ ACK_TIMEOUT_DEFAULT = 30,
/**
* If a member has not been responding to pings this
* number of times, it is considered to be dead.
@@ -765,7 +765,7 @@ swim_schedule_ping(struct swim *swim, struct swim_member *member)
/**
* Check for failed pings. A ping is failed if an ack was not
- * received during ACK_TIMEOUT. A failed ping is resent here.
+ * received during ACK timeout. A failed ping is resent here.
*/
static void
swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
@@ -775,10 +775,11 @@ swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
(void) events;
struct swim *swim = (struct swim *) p->data;
struct swim_member *m, *tmp;
+ double ack_timeout = swim->wait_ack_tick.interval;
double current_time = fiber_time();
rlist_foreach_entry_safe(m, &swim->queue_wait_ack, in_queue_wait_ack,
tmp) {
- if (current_time - m->ping_ts < ACK_TIMEOUT)
+ if (current_time - m->ping_ts < ack_timeout)
break;
++m->failed_pings;
if (m->failed_pings >= NO_ACKS_TO_GC) {
@@ -1113,7 +1114,7 @@ swim_new(const struct swim_transport_vtab *transport_vtab)
/* Failure detection component. */
rlist_create(&swim->queue_wait_ack);
ev_init(&swim->wait_ack_tick, swim_check_acks);
- ev_periodic_set(&swim->wait_ack_tick, 0, ACK_TIMEOUT, NULL);
+ ev_periodic_set(&swim->wait_ack_tick, 0, ACK_TIMEOUT_DEFAULT, NULL);
swim->wait_ack_tick.data = (void *) swim;
return swim;
}
@@ -1133,7 +1134,8 @@ swim_uri_to_addr(const char *uri, struct sockaddr_in *addr)
}
int
-swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate)
+swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate,
+ double ack_timeout)
{
struct sockaddr_in addr;
if (swim_uri_to_addr(uri, &addr) != 0)
@@ -1154,6 +1156,9 @@ swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate)
if (swim->round_tick.interval != heartbeat_rate && heartbeat_rate > 0)
ev_periodic_set(&swim->round_tick, 0, heartbeat_rate, NULL);
+ if (swim->wait_ack_tick.interval != ack_timeout && ack_timeout > 0)
+ ev_periodic_set(&swim->wait_ack_tick, 0, ack_timeout, NULL);
+
swim->self = new_self;
return 0;
}
diff --git a/src/lib/swim/swim.h b/src/lib/swim/swim.h
index b54fc47b2..001900311 100644
--- a/src/lib/swim/swim.h
+++ b/src/lib/swim/swim.h
@@ -56,12 +56,15 @@ swim_new(const struct swim_transport_vtab *transport_vtab);
* @heartbeat_rate seconds. It is rather the protocol
* speed. Protocol period depends on member count and
* @heartbeat_rate.
+ * @param ack_timeout Time in seconds after which a ping is
+ * considered to be unacknowledged.
*
* @retval 0 Success.
* @retval -1 Error.
*/
int
-swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate);
+swim_cfg(struct swim *swim, const char *uri, double heartbeat_rate,
+ double ack_timeout);
/**
* Stop listening and broadcasting messages, cleanup all internal
diff --git a/src/lua/swim.c b/src/lua/swim.c
index 6a78e5dd5..8b6f0b0b0 100644
--- a/src/lua/swim.c
+++ b/src/lua/swim.c
@@ -75,6 +75,28 @@ lua_swim_gc(struct lua_State *L)
return 0;
}
+static inline double
+lua_swim_get_timeout_field(struct lua_State *L, int ncfg,
+ const char *fieldname, const char *funcname)
+{
+ double timeout;
+ lua_getfield(L, ncfg, fieldname);
+ if (lua_isnumber(L, -1)) {
+ timeout = lua_tonumber(L, -1);
+ if (timeout <= 0) {
+ return luaL_error(L, "swim.%s: %s should be positive "\
+ "number", funcname, fieldname);
+ }
+ } else if (! lua_isnil(L, -1)) {
+ return luaL_error(L, "swim.%s: %s should be positive number",
+ funcname, fieldname);
+ } else {
+ timeout = -1;
+ }
+ lua_pop(L, 1);
+ return timeout;
+}
+
/**
* Configure @a swim instance using a table stored in @a ncfg-th
* position on Lua stack.
@@ -106,23 +128,12 @@ lua_swim_cfg_impl(struct lua_State *L, int ncfg, struct swim *swim,
}
lua_pop(L, 1);
- double heartbeat_rate;
- lua_getfield(L, ncfg, "heartbeat");
- if (lua_isnumber(L, -1)) {
- heartbeat_rate = lua_tonumber(L, -1);
- if (heartbeat_rate <= 0) {
- return luaL_error(L, "swim.%s: heartbeat should be "\
- "positive number", funcname);
- }
- } else if (! lua_isnil(L, -1)) {
- return luaL_error(L, "swim.%s: heartbeat should be positive "\
- "number", funcname);
- } else {
- heartbeat_rate = -1;
- }
- lua_pop(L, 1);
+ double heartbeat_rate =
+ lua_swim_get_timeout_field(L, ncfg, "heartbeat", funcname);
+ double ack_timeout =
+ lua_swim_get_timeout_field(L, ncfg, "ack_timeout", funcname);
- return swim_cfg(swim, server_uri, heartbeat_rate);
+ return swim_cfg(swim, server_uri, heartbeat_rate, ack_timeout);
}
static int
>
>> + /**
>> + * If a member has not been responding to pings this
>> + * number of times, it is considered to be dead.
>> + */
>> + NO_ACKS_TO_DEAD = 3,
>> + /**
>> + * If a not pinned member confirmed to be dead, it is
>> + * removed from the membership after at least this number
>> + * of failed pings.
>> + */
>> + NO_ACKS_TO_GC = NO_ACKS_TO_DEAD + 2,
>> };
>>
>> + bool is_pinned;
>> + /** Growing number to refute old messages. */
>
> reject, or perhaps ignore?
>
> refute is usually used for arguments in a heated conversation.
No, exactly refute. Reject/ignore are almost the same in this
context and mean 'do nothing'. Refute means, according to Oxford
Dictionary:
"Deny or contradict (a statement or accusation). Prove
(a statement or theory) to be wrong or false; disprove."
Here the swim instance refutes incorrect assumptions about its
own incarnation emitted by other cluster members.
>
>> + /**
>> + * How many pings did not receive an ack in a row. After
>> + * a threshold the instance is marked as dead. After more
>> + * it is removed from the table (if not pinned).
>> + */
>> + int failed_pings;
>
> These are more like unacknowledged pings. Have they failed? Maybe.
Done.
diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
index 1c0cc2cd4..75f57f69f 100644
--- a/src/lib/swim/swim.c
+++ b/src/lib/swim/swim.c
@@ -105,7 +105,7 @@ enum {
/**
* If a not pinned member confirmed to be dead, it is
* removed from the membership after at least this number
- * of failed pings.
+ * of unacknowledged pings.
*/
NO_ACKS_TO_GC = NO_ACKS_TO_DEAD + 2,
};
@@ -180,7 +180,7 @@ struct swim_member {
* a threshold the instance is marked as dead. After more
* it is removed from the table (if not pinned).
*/
- int failed_pings;
+ int unacknowledged_pings;
/** When the latest ping was sent to this member. */
double ping_ts;
/** Ready at hand regular ACK task. */
@@ -764,8 +764,9 @@ swim_schedule_ping(struct swim *swim, struct swim_member *member)
}
/**
- * Check for failed pings. A ping is failed if an ack was not
- * received during ACK timeout. A failed ping is resent here.
+ * Check for unacknowledged pings. A ping is unacknowledged if an
+ * ack was not received during ACK timeout. An unacknowledged ping
+ * is resent here.
*/
static void
swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
@@ -781,13 +782,13 @@ swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
tmp) {
if (current_time - m->ping_ts < ack_timeout)
break;
- ++m->failed_pings;
- if (m->failed_pings >= NO_ACKS_TO_GC) {
+ ++m->unacknowledged_pings;
+ if (m->unacknowledged_pings >= NO_ACKS_TO_GC) {
if (!m->is_pinned)
swim_member_delete(swim, m);
continue;
}
- if (m->failed_pings >= NO_ACKS_TO_DEAD)
+ if (m->unacknowledged_pings >= NO_ACKS_TO_DEAD)
m->status = MEMBER_DEAD;
swim_schedule_ping(swim, m);
rlist_del_entry(m, in_queue_wait_ack);
@@ -1040,7 +1041,7 @@ swim_process_failure_detection(struct swim *swim, const char **pos,
} else {
assert(type == SWIM_FD_MSG_ACK);
if (incarnation >= sender->incarnation) {
- sender->failed_pings = 0;
+ sender->unacknowledged_pings = 0;
rlist_del_entry(sender, in_queue_wait_ack);
}
}
>
>> + /** When the latest ping was sent to this member. */
>> + double ping_ts;
>
> last_ping_time? Why use double and not ev_time_t?
I did not use ev_time_t since fiber_time() returns double and
ping_ts is initialized from fiber_time(), so I'll keep double.
The name is changed.
diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
index 75f57f69f..3e1b26356 100644
--- a/src/lib/swim/swim.c
+++ b/src/lib/swim/swim.c
@@ -182,7 +182,7 @@ struct swim_member {
*/
int unacknowledged_pings;
/** When the latest ping was sent to this member. */
- double ping_ts;
+ double last_ping_time;
/** Ready at hand regular ACK task. */
struct swim_task ack_task;
struct swim_task ping_task;
@@ -335,7 +335,7 @@ static void
swim_member_schedule_ack_wait(struct swim *swim, struct swim_member *member)
{
if (rlist_empty(&member->in_queue_wait_ack)) {
- member->ping_ts = fiber_time();
+ member->last_ping_time = fiber_time();
rlist_add_tail_entry(&swim->queue_wait_ack, member,
in_queue_wait_ack);
}
@@ -780,7 +780,7 @@ swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events)
double current_time = fiber_time();
rlist_foreach_entry_safe(m, &swim->queue_wait_ack, in_queue_wait_ack,
tmp) {
- if (current_time - m->ping_ts < ack_timeout)
+ if (current_time - m->last_ping_time < ack_timeout)
break;
++m->unacknowledged_pings;
if (m->unacknowledged_pings >= NO_ACKS_TO_GC) {
>
>> + /** Type of the failure detection message: ping or ack. */
>> + SWIM_FD_MSG_TYPE,
>> + /**
>> + * Incarnation of the sender. To make the member alive if
>> + * it was considered to be dead, but ping/ack with greater
>> + * incarnation was received from it.
>> + */
>> + SWIM_FD_INCARNATION,
>
> Uhm, FD is too commonly used for a file descriptor. Please use a
> different name. Why not simply SWIM_PING and SWIM_ACK?
Because I do not want to put all protocol constants into a single
namespace, starting with SWIM_, so I used SWIM_FD_, SWIM_MEMBER_ etc.
I tried to use SWIM_FAILURE_DETECTION_, but it is too long.
Currently I have these names with 'fd': enum swim_fd_key,
enum swim_fd_msg_type, const char *swim_fd_msg_type_strs[],
struct swim_fd_header_bin.
How about expanding 'fd' to 'fail_det', 'fail_detec'?
Also, I could remove 'fd' from enum constants, but then it
would be harder to determine to which component a constant
belongs.
Please, choose one of new suffixes, or approve removal of
them at all.
>
>> struct swim_transport swim_udp_transport = {
>> /* .send_round_msg = */ swim_udp_send_msg,
>> + /* .send_ping = */ swim_udp_send_msg,
>> + /* .send_ack = */ swim_udp_send_msg,
>
> Why do you need a separate transport api for ack/ping sends?
> Shouldn't send/recv be enough? This is a transport layer, it
> should be unaware of protocol details.
>
>
Because this is what you asked me explicitly for. But honestly I
prefer send/recv, without explicit send_ping/ack etc. So I fixed it
in the first commit.
More information about the Tarantool-patches
mailing list