From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org> To: tarantool-patches@freelists.org Cc: vdavydov.dev@gmail.com, kostja@tarantool.org Subject: [PATCH v2 3/6] [RAW] swim: introduce a dissemination component Date: Tue, 25 Dec 2018 22:19:26 +0300 [thread overview] Message-ID: <3b5f96c93bdb29e60de4565ba4bcb93edcd6389a.1545765055.git.v.shpilevoy@tarantool.org> (raw) In-Reply-To: <cover.1545765055.git.v.shpilevoy@tarantool.org> In-Reply-To: <cover.1545765055.git.v.shpilevoy@tarantool.org> Dissemination components broadcasts events about member status updates. Part of #3234 --- src/lib/swim/swim.c | 270 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 265 insertions(+), 5 deletions(-) diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c index 22bc06a60..15e079f11 100644 --- a/src/lib/swim/swim.c +++ b/src/lib/swim/swim.c @@ -377,6 +377,26 @@ struct swim_member { struct swim_io_task ping_task; /** Position in a queue of members waiting for an ack. */ struct rlist in_queue_wait_ack; + /** + * + * Dissemination component + * + * Dissemination component sends events. Event is a + * notification about member status update. So formally, + * this structure already has all the needed attributes. + * But also an event somehow should be sent to all members + * at least once according to SWIM, so it requires + * something like TTL, which decrements on each send. And + * a member can not be removed from the global table until + * it gets dead and its dissemination TTL is 0, so as to + * allow other members learn its dead status. + */ + int dissemination_ttl; + /** + * Events are put into a queue sorted by event occurrence + * time. + */ + struct rlist in_queue_events; }; /** @@ -451,6 +471,8 @@ struct swim { struct rlist queue_wait_ack; /** Generator of ack checking events. */ struct ev_periodic wait_ack_tick; + /** Queue of events sorted by occurrence time. */ + struct rlist queue_events; }; static inline uint64_t @@ -467,6 +489,7 @@ sockaddr_in_hash(const struct sockaddr_in *a) enum swim_component_type { SWIM_ANTI_ENTROPY = 0, SWIM_FAILURE_DETECTION, + SWIM_DISSEMINATION, }; /** {{{ Failure detection component */ @@ -634,6 +657,88 @@ swim_member_bin_create(struct swim_member_bin *header) /** }}} Anti-entropy component */ +/** {{{ Dissemination component */ + +/** SWIM dissemination MsgPack template. */ +struct PACKED swim_diss_header_bin { + /** mp_encode_uint(SWIM_DISSEMINATION) */ + uint8_t k_header; + /** mp_encode_array() */ + uint8_t m_header; + uint32_t v_header; +}; + +static inline void +swim_diss_header_bin_create(struct swim_diss_header_bin *header, int batch_size) +{ + header->k_header = SWIM_DISSEMINATION; + header->m_header = 0xdd; + header->v_header = mp_bswap_u32(batch_size); +} + +/** SWIM event MsgPack template. */ +struct PACKED swim_event_bin { + /** mp_encode_map(4) */ + uint8_t m_header; + + /** mp_encode_uint(SWIM_MEMBER_STATUS) */ + uint8_t k_status; + /** mp_encode_uint(enum member_status) */ + uint8_t v_status; + + /** mp_encode_uint(SWIM_MEMBER_ADDR) */ + uint8_t k_addr; + /** mp_encode_uint(addr.sin_addr.s_addr) */ + uint8_t m_addr; + uint32_t v_addr; + + /** mp_encode_uint(SWIM_MEMBER_PORT) */ + uint8_t k_port; + /** mp_encode_uint(addr.sin_port) */ + uint8_t m_port; + uint16_t v_port; + + /** mp_encode_uint(SWIM_MEMBER_INCARNATION) */ + uint8_t k_incarnation; + /** mp_encode_uint(64bit incarnation) */ + uint8_t m_incarnation; + uint64_t v_incarnation; +}; + +static inline void +swim_event_bin_create(struct swim_event_bin *header) +{ + header->m_header = 0x84; + header->k_status = SWIM_MEMBER_STATUS; + header->k_addr = SWIM_MEMBER_ADDR; + header->m_addr = 0xce; + header->k_port = SWIM_MEMBER_PORT; + header->m_port = 0xcd; + header->k_incarnation = SWIM_MEMBER_INCARNATION; + header->m_incarnation = 0xcf; +} + +static inline void +swim_event_bin_reset(struct swim_event_bin *header, struct swim_member *member) +{ + header->v_status = member->status; + header->v_addr = mp_bswap_u32(member->addr.sin_addr.s_addr); + header->v_port = mp_bswap_u16(member->addr.sin_port); + header->v_incarnation = mp_bswap_u64(member->incarnation); +} + +static inline void +swim_schedule_event(struct swim *swim, struct swim_member *member) +{ + if (rlist_empty(&member->in_queue_events)) { + rlist_add_tail_entry(&swim->queue_events, member, + in_queue_events); + } + member->dissemination_ttl = mh_size(swim->members); +} + +/** }}} Dissemination component */ + /** * SWIM message structure: * { @@ -644,6 +749,18 @@ swim_member_bin_create(struct swim_member_bin *header) * * OR/AND * + * SWIM_DISSEMINATION: [ + * { + * SWIM_MEMBER_STATUS: uint, enum member_status, + * SWIM_MEMBER_ADDR: uint, ip, + * SWIM_MEMBER_PORT: uint, port, + * SWIM_MEMBER_INCARNATION: uint + * }, + * ... + * ], + * + * OR/AND + * * SWIM_ANTI_ENTROPY: [ * { * SWIM_MEMBER_STATUS: uint, enum member_status, @@ -663,6 +780,16 @@ swim_io_task_push(struct swim_io_task *task) ev_io_start(loop(), &task->swim->output); } +/** + * Make all needed actions to process a member's update like a + * change of its status, or incarnation, or both. + */ +static void +swim_member_is_updated(struct swim *swim, struct swim_member *member) +{ + swim_schedule_event(swim, member); +} + /** * Update status of the member if needed. Statuses are compared as * a compound key: {incarnation, status}. So @a new_status can @@ -678,14 +805,16 @@ swim_member_update_status(struct swim *swim, struct swim_member *member, enum swim_member_status new_status, uint64_t incarnation) { - (void) swim; assert(member != swim->self); if (member->incarnation == incarnation) { - if (member->status < new_status) + if (member->status < new_status) { member->status = new_status; + swim_member_is_updated(swim, member); + } } else if (member->incarnation < incarnation) { member->status = new_status; member->incarnation = incarnation; + swim_member_is_updated(swim, member); } } @@ -725,6 +854,8 @@ swim_member_new(struct swim *swim, const struct sockaddr_in *addr, swim_io_task_create(&member->ping_task, swim_send_ping, swim); rlist_add_entry(&swim->queue_round, member, in_queue_round); rlist_create(&member->in_queue_wait_ack); + rlist_create(&member->in_queue_events); + swim_schedule_event(swim, member); return member; } @@ -753,6 +884,7 @@ swim_member_delete(struct swim *swim, struct swim_member *member) swim_io_task_destroy(&member->ping_task); rlist_del_entry(member, in_queue_round); rlist_del_entry(member, in_queue_wait_ack); + assert(rlist_empty(&member->in_queue_events)); free(member); } @@ -872,6 +1004,62 @@ swim_encode_failure_detection(struct swim *swim, struct swim_msg *msg, return 1; } +static int +swim_encode_dissemination_part(struct swim_msg *msg, struct rlist **queue_pos) +{ + struct swim_diss_header_bin diss_header_bin; + struct swim_msg_part *part = + swim_msg_reserve(msg, sizeof(diss_header_bin)); + if (part == NULL) + return -1; + char *header = swim_msg_part_pos(part); + char *end = swim_msg_part_end(part); + char *pos = header + sizeof(diss_header_bin); + + int i = 0; + struct swim_member *member, *prev = NULL; + struct swim_event_bin event_bin; + swim_event_bin_create(&event_bin); + rlist_foreach_entry(member, *queue_pos, in_queue_events) { + if (pos + sizeof(event_bin) > end) + break; + swim_event_bin_reset(&event_bin, member); + memcpy(pos, &event_bin, sizeof(event_bin)); + pos += sizeof(event_bin); + ++i; + prev = member; + } + if (i == 0) + return 0; + swim_diss_header_bin_create(&diss_header_bin, i); + memcpy(header, &diss_header_bin, sizeof(diss_header_bin)); + swim_msg_part_advance(part, pos - header); + if (prev == rlist_last_entry(*queue_pos, struct swim_member, + in_queue_events)) + *queue_pos = NULL; + else + *queue_pos = rlist_next(&prev->in_queue_events); + return i; +} + +/** + * Encode failure dissemination component. + * @retval Number of encoded events. + */ +static int +swim_encode_dissemination(struct swim *swim, struct swim_msg *msg) +{ + int count = 0; + struct rlist *pos = rlist_first(&swim->queue_events); + while (pos != rlist_last(&swim->queue_events)) { + int rc = swim_encode_dissemination_part(msg, &pos); + if (rc < 0) + return -1; + count += rc; + } + return count; +} + /** Encode SWIM components into a sequence of UDP packets. */ static int swim_encode_round_msg(struct swim *swim, struct swim_msg *msg) @@ -889,6 +1077,11 @@ swim_encode_round_msg(struct swim *swim, struct swim_msg *msg) goto error; map_size += rc > 0; + rc = swim_encode_dissemination(swim, msg); + if (rc < 0) + goto error; + map_size += rc > 0; + rc = swim_encode_anti_entropy(swim, msg); if (rc < 0) goto error; @@ -902,6 +1095,17 @@ error: return -1; } +static void +swim_decrease_events_ttl(struct swim *swim) +{ + struct swim_member *member, *tmp; + rlist_foreach_entry_safe(member, &swim->queue_events, in_queue_events, + tmp) { + if (--member->dissemination_ttl == 0) + rlist_del_entry(member, in_queue_events); + } +} + /** * Do one round step. Send encoded components to a next member * from the queue. @@ -943,6 +1147,7 @@ swim_send_round_msg(struct swim_io_task *task) } swim_msg_destroy(&msg); swim_member_schedule_ack_wait(swim, m); + swim_decrease_events_ttl(swim); rlist_del_entry(m, in_queue_round); next_round_step: ev_periodic_start(loop(), &swim->round_tick); @@ -1040,12 +1245,14 @@ swim_check_acks(struct ev_loop *loop, struct ev_periodic *p, int events) break; ++m->failed_pings; if (m->failed_pings >= NO_ACKS_TO_GC) { - if (!m->is_pinned) + if (!m->is_pinned && m->dissemination_ttl == 0) swim_member_delete(swim, m); continue; } - if (m->failed_pings >= NO_ACKS_TO_DEAD) + if (m->failed_pings >= NO_ACKS_TO_DEAD) { m->status = MEMBER_DEAD; + swim_member_is_updated(swim, m); + } swim_io_task_push(&m->ping_task); rlist_del_entry(m, in_queue_wait_ack); } @@ -1296,6 +1503,50 @@ swim_process_failure_detection(struct swim *swim, const char **pos, return 0; } +static int +swim_process_dissemination(struct swim *swim, const char **pos, const char *end) +{ + const char *msg_pref = "Invald SWIM dissemination message:"; + if (mp_typeof(**pos) != MP_ARRAY || mp_check_array(*pos, end) > 0) { + say_error("%s message should be an array", msg_pref); + return -1; + } + uint64_t size = mp_decode_array(pos); + for (uint64_t i = 0; i < size; ++i) { + if (mp_typeof(**pos) != MP_MAP || + mp_check_map(*pos, end) > 0) { + say_error("%s event should be map", msg_pref); + return -1; + } + uint64_t map_size = mp_decode_map(pos); + struct swim_member_def def; + swim_member_def_create(&def); + for (uint64_t j = 0; j < map_size; ++j) { + if (mp_typeof(**pos) != MP_UINT || + mp_check_uint(*pos, end) > 0) { + say_error("%s event key should be uint", + msg_pref); + return -1; + } + uint64_t key = mp_decode_uint(pos); + if (key >= swim_member_key_MAX) { + say_error("%s unknown event key", msg_pref); + return -1; + } + if (swim_process_member_key(key, pos, end, msg_pref, + &def) != 0) + return -1; + } + if (def.addr.sin_port == 0 || def.addr.sin_addr.s_addr == 0) { + say_error("%s member address should be specified", + msg_pref); + return -1; + } + swim_process_member_update(swim, &def); + } + return 0; +} + /** Receive and process a new message. */ static void swim_on_input(struct ev_loop *loop, struct ev_io *io, int events) @@ -1344,6 +1595,11 @@ swim_on_input(struct ev_loop *loop, struct ev_io *io, int events) &addr) != 0) return; break; + case SWIM_DISSEMINATION: + say_verbose("SWIM: process dissemination"); + if (swim_process_dissemination(swim, &pos, end) != 0) + return; + break; default: say_error("%s unknown component type component is "\ "supported", msg_pref); @@ -1422,6 +1678,7 @@ swim_new(void) ev_init(&swim->wait_ack_tick, swim_check_acks); ev_periodic_set(&swim->wait_ack_tick, 0, ACK_TIMEOUT, NULL); swim->wait_ack_tick.data = (void *) swim; + rlist_create(&swim->queue_events); return swim; } @@ -1508,8 +1765,10 @@ swim_remove_member(struct swim *swim, const char *uri) if (uri_to_addr(uri, &addr) != 0) return -1; struct swim_member *member = swim_find_member(swim, &addr); - if (member != NULL) + if (member != NULL) { + rlist_del_entry(member, in_queue_events); swim_member_delete(swim, member); + } return 0; } @@ -1546,6 +1805,7 @@ swim_delete(struct swim *swim) while (node != mh_end(swim->members)) { struct swim_member *m = (struct swim_member *) mh_i64ptr_node(swim->members, node)->val; + rlist_del_entry(m, in_queue_events); swim_member_delete(swim, m); node = mh_first(swim->members); } -- 2.17.2 (Apple Git-113)
next prev parent reply other threads:[~2018-12-25 19:19 UTC|newest] Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top 2018-12-25 19:19 [PATCH v2 0/6] SWIM Vladislav Shpilevoy 2018-12-25 19:19 ` [PATCH v2 1/6] [RAW] swim: introduce SWIM's anti-entropy component Vladislav Shpilevoy 2018-12-25 19:19 ` [PATCH v2 2/6] [RAW] swim: introduce failure detection component Vladislav Shpilevoy 2018-12-25 19:19 ` Vladislav Shpilevoy [this message] 2018-12-25 19:19 ` [PATCH v2 4/6] [RAW] swim: keep encoded round message cached Vladislav Shpilevoy 2018-12-25 19:19 ` [PATCH v2 5/6] [RAW] swim: send one UDP packet per EV_WRITE event Vladislav Shpilevoy 2018-12-26 21:01 ` [tarantool-patches] [PATCH v2 0/6] SWIM Vladislav Shpilevoy
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=3b5f96c93bdb29e60de4565ba4bcb93edcd6389a.1545765055.git.v.shpilevoy@tarantool.org \ --to=v.shpilevoy@tarantool.org \ --cc=kostja@tarantool.org \ --cc=tarantool-patches@freelists.org \ --cc=vdavydov.dev@gmail.com \ --subject='Re: [PATCH v2 3/6] [RAW] swim: introduce a dissemination component' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox