From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH v2 3/3] replication: improve join/subscribe logging Date: Fri, 15 Feb 2019 15:25:49 +0300 Message-Id: <1b1b685df8acc98cd88295961ef82271dc39627f.1550232829.git.vdavydov.dev@gmail.com> In-Reply-To: References: In-Reply-To: References: To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: This patch makes logging a little bit more verbose so as to facilitate post-mortem replication failure analysis: - When an instance starts up, print its uuid and vclock. - When a replica connects to a master, print the master's uuid to the replica's log. - Print replica uuid and address to the master's log on join/subscribe. - Log local and remote vclock on subscribe, both on master and on replica. - Log ids assigned to replicas both on recovery and when online. Also, log when a replica is deleted from the cluster. --- src/box/alter.cc | 1 + src/box/applier.cc | 17 +++++++++++++---- src/box/box.cc | 21 ++++++++++++++++++--- src/box/replication.cc | 5 +++++ 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/src/box/alter.cc b/src/box/alter.cc index de8372b3..0e340468 100644 --- a/src/box/alter.cc +++ b/src/box/alter.cc @@ -3017,6 +3017,7 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event) tt_uuid uu; tuple_field_uuid_xc(new_tuple, BOX_CLUSTER_FIELD_UUID, &uu); REPLICASET_UUID = uu; + say_info("cluster uuid %s", tt_uuid_str(&uu)); } } diff --git a/src/box/applier.cc b/src/box/applier.cc index c09253be..e9addcb3 100644 --- a/src/box/applier.cc +++ b/src/box/applier.cc @@ -207,11 +207,12 @@ applier_connect(struct applier *applier) } if (applier->version_id != greeting.version_id) { - say_info("remote master is %u.%u.%u at %s", + say_info("remote master %s at %s running Tarantool %u.%u.%u", + tt_uuid_str(&greeting.uuid), + sio_strfaddr(&applier->addr, applier->addr_len), version_id_major(greeting.version_id), version_id_minor(greeting.version_id), - version_id_patch(greeting.version_id), - sio_strfaddr(&applier->addr, applier->addr_len)); + version_id_patch(greeting.version_id)); } /* Save the remote instance version and UUID on connect. */ @@ -393,8 +394,11 @@ applier_subscribe(struct applier *applier) struct vclock remote_vclock_at_subscribe; struct tt_uuid cluster_id = uuid_nil; + struct vclock vclock; + vclock_create(&vclock); + vclock_copy(&vclock, &replicaset.vclock); xrow_encode_subscribe_xc(&row, &REPLICASET_UUID, &INSTANCE_UUID, - &replicaset.vclock); + &vclock); coio_write_xrow(coio, &row); /* Read SUBSCRIBE response */ @@ -429,6 +433,11 @@ applier_subscribe(struct applier *applier) tt_uuid_str(&cluster_id), tt_uuid_str(&REPLICASET_UUID)); } + + say_info("subscribed"); + say_info("remote vclock %s local vclock %s", + vclock_to_string(&remote_vclock_at_subscribe), + vclock_to_string(&vclock)); } /* * Tarantool < 1.6.7: diff --git a/src/box/box.cc b/src/box/box.cc index 8136cf33..058ba266 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -1509,6 +1509,9 @@ box_process_join(struct ev_io *io, struct xrow_header *header) row.sync = header->sync; coio_write_xrow(io, &row); + say_info("joining replica %s at %s", + tt_uuid_str(&instance_uuid), sio_socketname(io->fd)); + /* * Initial stream: feed replica with dirty data from engines. */ @@ -1600,6 +1603,9 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header) "wal_mode = 'none'"); } + struct vclock vclock; + vclock_create(&vclock); + vclock_copy(&vclock, &replicaset.vclock); /* * Send a response to SUBSCRIBE request, tell * the replica how many rows we have in stock for it, @@ -1612,9 +1618,7 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header) * the additional field. */ struct xrow_header row; - xrow_encode_subscribe_response_xc(&row, - &REPLICASET_UUID, - &replicaset.vclock); + xrow_encode_subscribe_response_xc(&row, &REPLICASET_UUID, &vclock); /* * Identify the message with the replica id of this * instance, this is the only way for a replica to find @@ -1626,6 +1630,11 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header) row.sync = header->sync; coio_write_xrow(io, &row); + say_info("subscribed replica %s at %s", + tt_uuid_str(&replica_uuid), sio_socketname(io->fd)); + say_info("remote vclock %s local vclock %s", + vclock_to_string(&replica_clock), vclock_to_string(&vclock)); + /* * Process SUBSCRIBE request via replication relay * Send current recovery vector clock as a marker @@ -1848,6 +1857,9 @@ bootstrap(const struct tt_uuid *instance_uuid, INSTANCE_UUID = *instance_uuid; else tt_uuid_create(&INSTANCE_UUID); + + say_info("instance uuid %s", tt_uuid_str(&INSTANCE_UUID)); + /* * Begin listening on the socket to enable * master-master replication leader election. @@ -1905,6 +1917,8 @@ local_recovery(const struct tt_uuid *instance_uuid, tt_uuid_str(&INSTANCE_UUID)); } + say_info("instance uuid %s", tt_uuid_str(&INSTANCE_UUID)); + struct wal_stream wal_stream; wal_stream_create(&wal_stream, cfg_geti64("rows_per_wal")); @@ -1930,6 +1944,7 @@ local_recovery(const struct tt_uuid *instance_uuid, * not attempt to apply these rows twice. */ recovery_scan(recovery, &replicaset.vclock, &gc.vclock); + say_info("instance vclock %s", vclock_to_string(&replicaset.vclock)); if (wal_dir_lock >= 0) { box_listen(); diff --git a/src/box/replication.cc b/src/box/replication.cc index 19ad5026..e3ceb009 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -197,6 +197,9 @@ replica_set_id(struct replica *replica, uint32_t replica_id) instance_id = replica_id; } replicaset.replica_by_id[replica_id] = replica; + + say_info("assigned id %d to replica %s", + replica->id, tt_uuid_str(&replica->uuid)); } void @@ -214,6 +217,8 @@ replica_clear_id(struct replica *replica) */ replicaset.replica_by_id[replica->id] = NULL; replica->id = REPLICA_ID_NIL; + say_info("removed replica %s", tt_uuid_str(&replica->uuid)); + /* * The replica will never resubscribe so we don't need to keep * WALs for it anymore. Unregister it with the garbage collector -- 2.11.0