Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [PATCH 1/2] replication: fix recoverable error reporting
Date: Sun, 23 Sep 2018 18:31:20 +0300	[thread overview]
Message-ID: <64506bb0767e75223c8465b98745db55f7cb98be.1537715258.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1537715258.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1537715258.git.vdavydov.dev@gmail.com>

First, we print "will retry every XX second" to the log after an error
message only for socket and system errors although we keep trying to
establish a replication connection after configuration errors as well.
Let's print this message for those errors too to avoid confusion.

Second, in case we receive an error in reply to SUBSCRIBE command, we
log "can't read row" instead of "can't join/subscribe". This happens,
because we switch an applier to SYNC/FOLLOW state before receiving a
reply to SUBSCRIBE command. Fix this by updating an applier state only
after successfully subscribing.

Third, we detect duplicate connections coming from the same replica on
the master only after sending a reply to SUBSCRIBE command, that is in
relay_subscribe rather than in box_process_subscribe. This results in
"can't read row" being printed to the replica's log even though it's
actually a SUBSCRIBE error. Fix this by moving the check where it
actually belongs.
---
 src/box/applier.cc | 67 ++++++++++++++++++++++++++++--------------------------
 src/box/box.cc     |  6 +++++
 src/box/relay.cc   |  6 +----
 3 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index 604119e9..7304e83f 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -91,9 +91,18 @@ applier_log_error(struct applier *applier, struct error *e)
 		break;
 	}
 	error_log(e);
-	if (type_cast(SocketError, e) || type_cast(SystemError, e))
+	switch (errcode) {
+	case ER_LOADING:
+	case ER_CFG:
+	case ER_ACCESS_DENIED:
+	case ER_NO_SUCH_USER:
+	case ER_SYSTEM:
 		say_info("will retry every %.2lf second",
 			 replication_reconnect_interval());
+		break;
+	default:
+		break;
+	}
 	applier->last_logged_errcode = errcode;
 }
 
@@ -382,30 +391,7 @@ applier_subscribe(struct applier *applier)
 				 &replicaset.vclock);
 	coio_write_xrow(coio, &row);
 
-	if (applier->state == APPLIER_READY) {
-		/*
-		 * Tarantool < 1.7.7 does not send periodic heartbeat
-		 * messages so we cannot enable applier synchronization
-		 * for it without risking getting stuck in the 'orphan'
-		 * mode until a DML operation happens on the master.
-		 */
-		if (applier->version_id >= version_id(1, 7, 7))
-			applier_set_state(applier, APPLIER_SYNC);
-		else
-			applier_set_state(applier, APPLIER_FOLLOW);
-	} else {
-		/*
-		 * Tarantool < 1.7.0 sends replica id during
-		 * "subscribe" stage. We can't finish bootstrap
-		 * until it is received.
-		 */
-		assert(applier->state == APPLIER_FINAL_JOIN);
-		assert(applier->version_id < version_id(1, 7, 0));
-	}
-
-	/*
-	 * Read SUBSCRIBE response
-	 */
+	/* Read SUBSCRIBE response */
 	if (applier->version_id >= version_id(1, 6, 7)) {
 		coio_read_xrow(coio, ibuf, &row);
 		if (iproto_type_is_error(row.type)) {
@@ -421,7 +407,7 @@ applier_subscribe(struct applier *applier)
 		vclock_create(&remote_vclock_at_subscribe);
 		xrow_decode_vclock_xc(&row, &remote_vclock_at_subscribe);
 	}
-	/**
+	/*
 	 * Tarantool < 1.6.7:
 	 * If there is an error in subscribe, it's sent directly
 	 * in response to subscribe.  If subscribe is successful,
@@ -429,6 +415,27 @@ applier_subscribe(struct applier *applier)
 	 * the binary log.
 	 */
 
+	if (applier->state == APPLIER_READY) {
+		/*
+		 * Tarantool < 1.7.7 does not send periodic heartbeat
+		 * messages so we cannot enable applier synchronization
+		 * for it without risking getting stuck in the 'orphan'
+		 * mode until a DML operation happens on the master.
+		 */
+		if (applier->version_id >= version_id(1, 7, 7))
+			applier_set_state(applier, APPLIER_SYNC);
+		else
+			applier_set_state(applier, APPLIER_FOLLOW);
+	} else {
+		/*
+		 * Tarantool < 1.7.0 sends replica id during
+		 * "subscribe" stage. We can't finish bootstrap
+		 * until it is received.
+		 */
+		assert(applier->state == APPLIER_FINAL_JOIN);
+		assert(applier->version_id < version_id(1, 7, 0));
+	}
+
 	/* Re-enable warnings after successful execution of SUBSCRIBE */
 	applier->last_logged_errcode = 0;
 	if (applier->version_id >= version_id(1, 7, 4)) {
@@ -607,7 +614,8 @@ applier_f(va_list ap)
 				applier_log_error(applier, e);
 				applier_disconnect(applier, APPLIER_LOADING);
 				goto reconnect;
-			} else if (e->errcode() == ER_ACCESS_DENIED ||
+			} else if (e->errcode() == ER_CFG ||
+				   e->errcode() == ER_ACCESS_DENIED ||
 				   e->errcode() == ER_NO_SUCH_USER) {
 				/* Invalid configuration */
 				applier_log_error(applier, e);
@@ -618,11 +626,6 @@ applier_f(va_list ap)
 				applier_log_error(applier, e);
 				applier_disconnect(applier, APPLIER_DISCONNECTED);
 				goto reconnect;
-			} else if (e->errcode() == ER_CFG) {
-				/* Invalid configuration */
-				applier_log_error(applier, e);
-				applier_disconnect(applier, APPLIER_DISCONNECTED);
-				goto reconnect;
 			} else {
 				/* Unrecoverable errors */
 				applier_log_error(applier, e);
diff --git a/src/box/box.cc b/src/box/box.cc
index f25146d0..804fc00e 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1560,6 +1560,12 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header)
 			  tt_uuid_str(&REPLICASET_UUID));
 	}
 
+	/* Don't allow multiple relays for the same replica */
+	if (relay_get_state(replica->relay) == RELAY_FOLLOW) {
+		tnt_raise(ClientError, ER_CFG, "replication",
+			  "duplicate connection with the same replica UUID");
+	}
+
 	/* Forbid replication with disabled WAL */
 	if (wal_mode() == WAL_NONE) {
 		tnt_raise(ClientError, ER_UNSUPPORTED, "Replication",
diff --git a/src/box/relay.cc b/src/box/relay.cc
index b0bc03ef..c90383d4 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -585,11 +585,7 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
 {
 	assert(replica->id != REPLICA_ID_NIL);
 	struct relay *relay = replica->relay;
-	/* Don't allow multiple relays for the same replica */
-	if (relay->state == RELAY_FOLLOW) {
-		tnt_raise(ClientError, ER_CFG, "replication",
-			  "duplicate connection with the same replica UUID");
-	}
+	assert(relay->state != RELAY_FOLLOW);
 	/*
 	 * Register the replica with the garbage collector
 	 * unless it has already been registered by initial
-- 
2.11.0

  reply	other threads:[~2018-09-23 15:31 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-09-23 15:31 [PATCH 0/2] replication: don't stop syncing on configuration errors Vladimir Davydov
2018-09-23 15:31 ` Vladimir Davydov [this message]
2018-09-25 22:24   ` [tarantool-patches] Re: [PATCH 1/2] replication: fix recoverable error reporting Konstantin Osipov
2018-09-23 15:31 ` [PATCH 2/2] replication: don't stop syncing on configuration errors Vladimir Davydov
2018-09-25 22:29   ` [tarantool-patches] " Konstantin Osipov
2018-09-26  8:26     ` Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=64506bb0767e75223c8465b98745db55f7cb98be.1537715258.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [PATCH 1/2] replication: fix recoverable error reporting' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox