[PATCH v2 3/3] replication: improve join/subscribe logging
Vladimir Davydov
vdavydov.dev at gmail.com
Fri Feb 15 15:25:49 MSK 2019
This patch makes logging a little bit more verbose so as to facilitate
post-mortem replication failure analysis:
- When an instance starts up, print its uuid and vclock.
- When a replica connects to a master, print the master's uuid to
the replica's log.
- Print replica uuid and address to the master's log on join/subscribe.
- Log local and remote vclock on subscribe, both on master and on
replica.
- Log ids assigned to replicas both on recovery and when online.
Also, log when a replica is deleted from the cluster.
---
src/box/alter.cc | 1 +
src/box/applier.cc | 17 +++++++++++++----
src/box/box.cc | 21 ++++++++++++++++++---
src/box/replication.cc | 5 +++++
4 files changed, 37 insertions(+), 7 deletions(-)
diff --git a/src/box/alter.cc b/src/box/alter.cc
index de8372b3..0e340468 100644
--- a/src/box/alter.cc
+++ b/src/box/alter.cc
@@ -3017,6 +3017,7 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event)
tt_uuid uu;
tuple_field_uuid_xc(new_tuple, BOX_CLUSTER_FIELD_UUID, &uu);
REPLICASET_UUID = uu;
+ say_info("cluster uuid %s", tt_uuid_str(&uu));
}
}
diff --git a/src/box/applier.cc b/src/box/applier.cc
index c09253be..e9addcb3 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -207,11 +207,12 @@ applier_connect(struct applier *applier)
}
if (applier->version_id != greeting.version_id) {
- say_info("remote master is %u.%u.%u at %s",
+ say_info("remote master %s at %s running Tarantool %u.%u.%u",
+ tt_uuid_str(&greeting.uuid),
+ sio_strfaddr(&applier->addr, applier->addr_len),
version_id_major(greeting.version_id),
version_id_minor(greeting.version_id),
- version_id_patch(greeting.version_id),
- sio_strfaddr(&applier->addr, applier->addr_len));
+ version_id_patch(greeting.version_id));
}
/* Save the remote instance version and UUID on connect. */
@@ -393,8 +394,11 @@ applier_subscribe(struct applier *applier)
struct vclock remote_vclock_at_subscribe;
struct tt_uuid cluster_id = uuid_nil;
+ struct vclock vclock;
+ vclock_create(&vclock);
+ vclock_copy(&vclock, &replicaset.vclock);
xrow_encode_subscribe_xc(&row, &REPLICASET_UUID, &INSTANCE_UUID,
- &replicaset.vclock);
+ &vclock);
coio_write_xrow(coio, &row);
/* Read SUBSCRIBE response */
@@ -429,6 +433,11 @@ applier_subscribe(struct applier *applier)
tt_uuid_str(&cluster_id),
tt_uuid_str(&REPLICASET_UUID));
}
+
+ say_info("subscribed");
+ say_info("remote vclock %s local vclock %s",
+ vclock_to_string(&remote_vclock_at_subscribe),
+ vclock_to_string(&vclock));
}
/*
* Tarantool < 1.6.7:
diff --git a/src/box/box.cc b/src/box/box.cc
index 8136cf33..058ba266 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1509,6 +1509,9 @@ box_process_join(struct ev_io *io, struct xrow_header *header)
row.sync = header->sync;
coio_write_xrow(io, &row);
+ say_info("joining replica %s at %s",
+ tt_uuid_str(&instance_uuid), sio_socketname(io->fd));
+
/*
* Initial stream: feed replica with dirty data from engines.
*/
@@ -1600,6 +1603,9 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header)
"wal_mode = 'none'");
}
+ struct vclock vclock;
+ vclock_create(&vclock);
+ vclock_copy(&vclock, &replicaset.vclock);
/*
* Send a response to SUBSCRIBE request, tell
* the replica how many rows we have in stock for it,
@@ -1612,9 +1618,7 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header)
* the additional field.
*/
struct xrow_header row;
- xrow_encode_subscribe_response_xc(&row,
- &REPLICASET_UUID,
- &replicaset.vclock);
+ xrow_encode_subscribe_response_xc(&row, &REPLICASET_UUID, &vclock);
/*
* Identify the message with the replica id of this
* instance, this is the only way for a replica to find
@@ -1626,6 +1630,11 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header)
row.sync = header->sync;
coio_write_xrow(io, &row);
+ say_info("subscribed replica %s at %s",
+ tt_uuid_str(&replica_uuid), sio_socketname(io->fd));
+ say_info("remote vclock %s local vclock %s",
+ vclock_to_string(&replica_clock), vclock_to_string(&vclock));
+
/*
* Process SUBSCRIBE request via replication relay
* Send current recovery vector clock as a marker
@@ -1848,6 +1857,9 @@ bootstrap(const struct tt_uuid *instance_uuid,
INSTANCE_UUID = *instance_uuid;
else
tt_uuid_create(&INSTANCE_UUID);
+
+ say_info("instance uuid %s", tt_uuid_str(&INSTANCE_UUID));
+
/*
* Begin listening on the socket to enable
* master-master replication leader election.
@@ -1905,6 +1917,8 @@ local_recovery(const struct tt_uuid *instance_uuid,
tt_uuid_str(&INSTANCE_UUID));
}
+ say_info("instance uuid %s", tt_uuid_str(&INSTANCE_UUID));
+
struct wal_stream wal_stream;
wal_stream_create(&wal_stream, cfg_geti64("rows_per_wal"));
@@ -1930,6 +1944,7 @@ local_recovery(const struct tt_uuid *instance_uuid,
* not attempt to apply these rows twice.
*/
recovery_scan(recovery, &replicaset.vclock, &gc.vclock);
+ say_info("instance vclock %s", vclock_to_string(&replicaset.vclock));
if (wal_dir_lock >= 0) {
box_listen();
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 19ad5026..e3ceb009 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -197,6 +197,9 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
instance_id = replica_id;
}
replicaset.replica_by_id[replica_id] = replica;
+
+ say_info("assigned id %d to replica %s",
+ replica->id, tt_uuid_str(&replica->uuid));
}
void
@@ -214,6 +217,8 @@ replica_clear_id(struct replica *replica)
*/
replicaset.replica_by_id[replica->id] = NULL;
replica->id = REPLICA_ID_NIL;
+ say_info("removed replica %s", tt_uuid_str(&replica->uuid));
+
/*
* The replica will never resubscribe so we don't need to keep
* WALs for it anymore. Unregister it with the garbage collector
--
2.11.0
More information about the Tarantool-patches
mailing list