Tarantool development patches archive
 help / color / mirror / Atom feed
* [tarantool-patches] [PATCH 0/3] replication: improve logging.
@ 2018-05-16 11:32 Konstantin Belyavskiy
  2018-05-16 11:32 ` [tarantool-patches] [PATCH 1/3] replication: use applier_state to check quorum Konstantin Belyavskiy
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Konstantin Belyavskiy @ 2018-05-16 11:32 UTC (permalink / raw)
  To: georgy, kostja; +Cc: tarantool-patches

Ticket: https://github.com/tarantool/tarantool/issues/3365
Branch: https://github.com/tarantool/tarantool/compare/gh-3365-display-an-error-at-downstream-on-replica-failure-or-disconnect-v3

This is a set of patches originally aimed at improving error logging.
The first patch is a small refactoring (reuse a subset of the applier
state machine for tracking replication quorum state).
The second patch keeps the relay alive on applier disconnect.
AR: Konstya, Gosha, currently I'm not happy about the relay_create part:
there I use memset for the whole relay structure, resulting in possible
data/state loss. I'm thinking about using memset only for a new
relay and reset for the others.
The third patch improves logging (shows the same error on both master
and replica and stores it in the relay state).

Konstantin Belyavskiy (3):
  replication: use applier_state to check quorum
  replication: do not delete relay on applier disconnect
  replication: display downstream status at upstream

 src/box/lua/info.c                                 |  19 +++-
 src/box/relay.cc                                   |  78 ++++++++++----
 src/box/relay.h                                    |  30 ++++++
 src/box/replication.cc                             |  60 +++++------
 src/box/replication.h                              |  43 +++-----
 test/replication/show_error_on_disconnect.result   | 120 +++++++++++++++++++++
 test/replication/show_error_on_disconnect.test.lua |  38 +++++++
 7 files changed, 305 insertions(+), 83 deletions(-)
 create mode 100644 test/replication/show_error_on_disconnect.result
 create mode 100644 test/replication/show_error_on_disconnect.test.lua

-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tarantool-patches] [PATCH 1/3] replication: use applier_state to check quorum
  2018-05-16 11:32 [tarantool-patches] [PATCH 0/3] replication: improve logging Konstantin Belyavskiy
@ 2018-05-16 11:32 ` Konstantin Belyavskiy
  2018-05-16 11:32 ` [tarantool-patches] [PATCH 2/3] replication: do not delete relay on applier disconnect Konstantin Belyavskiy
  2018-05-16 11:32 ` [tarantool-patches] [PATCH 3/3] replication: display downstream status at upstream Konstantin Belyavskiy
  2 siblings, 0 replies; 4+ messages in thread
From: Konstantin Belyavskiy @ 2018-05-16 11:32 UTC (permalink / raw)
  To: georgy, kostja; +Cc: tarantool-patches

Small refactoring: remove 'enum replica_state' and instead reuse a
subset of the applier state machine, 'enum applier_state', to check
whether we have achieved replication quorum and hence can leave
read-only mode.
---
 src/box/replication.cc | 31 +++++++++++++++----------------
 src/box/replication.h  | 32 ++++++++++++--------------------
 2 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/src/box/replication.cc b/src/box/replication.cc
index 185c05305..9aac1f077 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -38,7 +38,6 @@
 
 #include "box.h"
 #include "gc.h"
-#include "applier.h"
 #include "error.h"
 #include "vclock.h" /* VCLOCK_MAX */
 
@@ -138,7 +137,7 @@ replica_new(void)
 	rlist_create(&replica->in_anon);
 	trigger_create(&replica->on_applier_state,
 		       replica_on_applier_state_f, NULL, NULL);
-	replica->state = REPLICA_DISCONNECTED;
+	replica->applier_sync_state = APPLIER_DISCONNECTED;
 	return replica;
 }
 
@@ -219,9 +218,9 @@ replica_clear_applier(struct replica *replica)
 static void
 replica_on_applier_sync(struct replica *replica)
 {
-	assert(replica->state == REPLICA_CONNECTED);
+	assert(replica->applier_sync_state == APPLIER_CONNECTED);
 
-	replica->state = REPLICA_SYNCED;
+	replica->applier_sync_state = APPLIER_SYNC;
 	replicaset.applier.synced++;
 
 	replicaset_check_quorum();
@@ -234,7 +233,7 @@ replica_on_applier_connect(struct replica *replica)
 
 	assert(tt_uuid_is_nil(&replica->uuid));
 	assert(!tt_uuid_is_nil(&applier->uuid));
-	assert(replica->state == REPLICA_DISCONNECTED);
+	assert(replica->applier_sync_state == APPLIER_DISCONNECTED);
 
 	replica->uuid = applier->uuid;
 
@@ -265,7 +264,7 @@ replica_on_applier_connect(struct replica *replica)
 		replica_hash_insert(&replicaset.hash, replica);
 	}
 
-	replica->state = REPLICA_CONNECTED;
+	replica->applier_sync_state = APPLIER_CONNECTED;
 	replicaset.applier.connected++;
 }
 
@@ -276,7 +275,7 @@ replica_on_applier_reconnect(struct replica *replica)
 
 	assert(!tt_uuid_is_nil(&replica->uuid));
 	assert(!tt_uuid_is_nil(&applier->uuid));
-	assert(replica->state == REPLICA_DISCONNECTED);
+	assert(replica->applier_sync_state == APPLIER_DISCONNECTED);
 
 	if (!tt_uuid_is_equal(&replica->uuid, &applier->uuid)) {
 		/*
@@ -298,32 +297,32 @@ replica_on_applier_reconnect(struct replica *replica)
 
 		replica_set_applier(orig, applier);
 		replica_clear_applier(replica);
-		replica->state = REPLICA_DISCONNECTED;
+		replica->applier_sync_state = APPLIER_DISCONNECTED;
 		replica = orig;
 	}
 
-	replica->state = REPLICA_CONNECTED;
+	replica->applier_sync_state = APPLIER_CONNECTED;
 	replicaset.applier.connected++;
 }
 
 static void
 replica_on_applier_disconnect(struct replica *replica)
 {
-	switch (replica->state) {
-	case REPLICA_SYNCED:
+	switch (replica->applier_sync_state) {
+	case APPLIER_SYNC:
 		assert(replicaset.applier.synced > 0);
 		replicaset.applier.synced--;
 		FALLTHROUGH;
-	case REPLICA_CONNECTED:
+	case APPLIER_CONNECTED:
 		assert(replicaset.applier.connected > 0);
 		replicaset.applier.connected--;
 		break;
-	case REPLICA_DISCONNECTED:
+	case APPLIER_DISCONNECTED:
 		break;
 	default:
 		unreachable();
 	}
-	replica->state = REPLICA_DISCONNECTED;
+	replica->applier_sync_state = APPLIER_DISCONNECTED;
 }
 
 static void
@@ -424,7 +423,7 @@ replicaset_update(struct applier **appliers, int count)
 			continue;
 		applier = replica->applier;
 		replica_clear_applier(replica);
-		replica->state = REPLICA_DISCONNECTED;
+		replica->applier_sync_state = APPLIER_DISCONNECTED;
 		applier_stop(applier);
 		applier_delete(applier);
 	}
@@ -458,7 +457,7 @@ replicaset_update(struct applier **appliers, int count)
 			replica_hash_insert(&replicaset.hash, replica);
 		}
 
-		replica->state = REPLICA_CONNECTED;
+		replica->applier_sync_state = APPLIER_CONNECTED;
 		replicaset.applier.connected++;
 	}
 	rlist_swap(&replicaset.anon, &anon_replicas);
diff --git a/src/box/replication.h b/src/box/replication.h
index 7b85c2fc4..e2bdd814f 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -36,6 +36,7 @@
 #define RB_COMPACT 1
 #include <small/rb.h> /* replicaset_t */
 #include <small/rlist.h>
+#include "applier.h"
 #include <small/mempool.h>
 #include "fiber_cond.h"
 #include "vclock.h"
@@ -214,24 +215,6 @@ struct replicaset {
 };
 extern struct replicaset replicaset;
 
-enum replica_state {
-	/**
-	 * Applier has not connected to the master yet
-	 * or has disconnected.
-	 */
-	REPLICA_DISCONNECTED,
-	/**
-	 * Applier has connected to the master and
-	 * received UUID.
-	 */
-	REPLICA_CONNECTED,
-	/**
-	 * Applier has synchronized with the master
-	 * (left "sync" and entered "follow" state).
-	 */
-	REPLICA_SYNCED,
-};
-
 /**
  * Summary information about a replica in the replica set.
  */
@@ -260,8 +243,17 @@ struct replica {
 	 * Trigger invoked when the applier changes its state.
 	 */
 	struct trigger on_applier_state;
-	/** Replica sync state. */
-	enum replica_state state;
+	/**
+	 * During initial connect or reconnect we require applier
+	 * to sync with the master before the replica can leave
+	 * read-only mode. This enum reflects the state of the
+	 * state machine for applier sync. Technically it is a
+	 * subset of the applier state machine, but since it's
+	 * much simpler and is used for a different purpose
+	 * (achieving replication connect quorum), we keep it
+	 * separate from applier.
+	 */
+	enum applier_state applier_sync_state;
 };
 
 enum {
-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tarantool-patches] [PATCH 2/3] replication: do not delete relay on applier disconnect
  2018-05-16 11:32 [tarantool-patches] [PATCH 0/3] replication: improve logging Konstantin Belyavskiy
  2018-05-16 11:32 ` [tarantool-patches] [PATCH 1/3] replication: use applier_state to check quorum Konstantin Belyavskiy
@ 2018-05-16 11:32 ` Konstantin Belyavskiy
  2018-05-16 11:32 ` [tarantool-patches] [PATCH 3/3] replication: display downstream status at upstream Konstantin Belyavskiy
  2 siblings, 0 replies; 4+ messages in thread
From: Konstantin Belyavskiy @ 2018-05-16 11:32 UTC (permalink / raw)
  To: georgy, kostja; +Cc: tarantool-patches

This is a part of a more complex task aimed at improving logging.
Do not destroy the relay, since it stores the last error, which can
be useful for diagnostics.
Now the relay is created together with the replica and always
exists, so several NULL checks are removed as well.
Add relay_state { OFF, FOLLOW and STOPPED } to track replica
presence: once connected, the relay is either in FOLLOW or STOPPED
state until the master is reset.

Used for #3365.
---
 src/box/lua/info.c     |  2 +-
 src/box/relay.cc       | 70 +++++++++++++++++++++++++++++++++++++-------------
 src/box/relay.h        | 27 +++++++++++++++++++
 src/box/replication.cc | 29 +++++++++------------
 src/box/replication.h  | 11 --------
 5 files changed, 92 insertions(+), 47 deletions(-)

diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 8e8fd9d97..9dbc3f92c 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -145,7 +145,7 @@ lbox_pushreplica(lua_State *L, struct replica *replica)
 		lua_settable(L, -3);
 	}
 
-	if (relay != NULL) {
+	if (relay_get_state(replica->relay) == RELAY_FOLLOW) {
 		lua_pushstring(L, "downstream");
 		lbox_pushrelay(L, relay);
 		lua_settable(L, -3);
diff --git a/src/box/relay.cc b/src/box/relay.cc
index d2ceaf110..49835bcb2 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -131,6 +131,8 @@ struct relay {
 	struct stailq pending_gc;
 	/** Time when last row was sent to peer. */
 	double last_row_tm;
+	/** Relay sync state. */
+	enum relay_state state;
 
 	struct {
 		/* Align to prevent false-sharing with tx thread */
@@ -140,6 +142,17 @@ struct relay {
 	} tx;
 };
 
+enum relay_state
+relay_get_state(const struct relay *relay)
+{
+	return relay->state;
+}
+void
+relay_set_state(struct relay *relay, enum relay_state state)
+{
+	relay->state = state;
+}
+
 const struct vclock *
 relay_vclock(const struct relay *relay)
 {
@@ -153,6 +166,17 @@ relay_send_initial_join_row(struct xstream *stream, struct xrow_header *row);
 static void
 relay_send_row(struct xstream *stream, struct xrow_header *row);
 
+struct relay*
+relay_new(void)
+{
+	size_t size = sizeof(struct relay);
+	struct relay *relay = (struct relay *)malloc(size);
+	if (relay == NULL)
+		tnt_raise(OutOfMemory, size, "relay_new", "malloc");
+	relay->state = RELAY_OFF;
+	return relay;
+}
+
 static void
 relay_create(struct relay *relay, int fd, uint64_t sync,
 	     void (*stream_write)(struct xstream *, struct xrow_header *))
@@ -167,15 +191,23 @@ relay_create(struct relay *relay, int fd, uint64_t sync,
 }
 
 static void
-relay_destroy(struct relay *relay)
+relay_reset(struct relay *relay)
 {
 	struct relay_gc_msg *gc_msg, *next_gc_msg;
 	stailq_foreach_entry_safe(gc_msg, next_gc_msg,
 				  &relay->pending_gc, in_pending) {
 		free(gc_msg);
 	}
+	//stailq_create(&relay->pending_gc);
 	if (relay->r != NULL)
 		recovery_delete(relay->r);
+	relay->r = NULL;
+}
+
+void
+relay_destroy(struct relay *relay)
+{
+	relay_reset(relay);
 	fiber_cond_destroy(&relay->reader_cond);
 	diag_destroy(&relay->diag);
 	TRASH(relay);
@@ -502,7 +534,7 @@ relay_subscribe_f(va_list ap)
 		    NULL, NULL, cbus_process);
 	cbus_endpoint_destroy(&relay->endpoint, cbus_process);
 	if (!diag_is_empty(&relay->diag)) {
-		/* An error has occured while ACKs of xlog reading */
+		/* An error has occurred while reading ACKs of xlog. */
 		diag_move(&relay->diag, diag_get());
 	}
 	struct errinj *inj = errinj(ERRINJ_RELAY_EXIT_DELAY, ERRINJ_DOUBLE);
@@ -518,8 +550,9 @@ relay_subscribe(int fd, uint64_t sync, struct replica *replica,
 		struct vclock *replica_clock, uint32_t replica_version_id)
 {
 	assert(replica->id != REPLICA_ID_NIL);
+	struct relay *relay = replica->relay;
 	/* Don't allow multiple relays for the same replica */
-	if (replica->relay != NULL) {
+	if (relay->state == RELAY_FOLLOW) {
 		tnt_raise(ClientError, ER_CFG, "replication",
 			  "duplicate connection with the same replica UUID");
 	}
@@ -537,24 +570,25 @@ relay_subscribe(int fd, uint64_t sync, struct replica *replica,
 			diag_raise();
 	}
 
-	struct relay relay;
-	relay_create(&relay, fd, sync, relay_send_row);
-	relay.r = recovery_new(cfg_gets("wal_dir"),
-			       cfg_geti("force_recovery"),
-			       replica_clock);
-	vclock_copy(&relay.tx.vclock, replica_clock);
-	relay.version_id = replica_version_id;
-	relay.replica = replica;
-	replica_set_relay(replica, &relay);
-	vclock_copy(&relay.local_vclock_at_subscribe, &replicaset.vclock);
-
-	int rc = cord_costart(&relay.cord, tt_sprintf("relay_%p", &relay),
-			      relay_subscribe_f, &relay);
+	if (relay->state != RELAY_OFF)
+		relay_destroy(relay);
+	relay_create(relay, fd, sync, relay_send_row);
+	relay->r = recovery_new(cfg_gets("wal_dir"),
+			        cfg_geti("force_recovery"),
+			        replica_clock);
+	vclock_copy(&relay->tx.vclock, replica_clock);
+	relay->version_id = replica_version_id;
+	relay->replica = replica;
+	relay->state = RELAY_FOLLOW;
+	vclock_copy(&relay->local_vclock_at_subscribe, &replicaset.vclock);
+
+	int rc = cord_costart(&relay->cord, tt_sprintf("relay_%p", relay),
+			      relay_subscribe_f, relay);
 	if (rc == 0)
-		rc = cord_cojoin(&relay.cord);
+		rc = cord_cojoin(&relay->cord);
 
+	assert(replica->relay == relay);
 	replica_clear_relay(replica);
-	relay_destroy(&relay);
 
 	if (rc != 0)
 		diag_raise();
diff --git a/src/box/relay.h b/src/box/relay.h
index c8a2c2872..e20d4cd13 100644
--- a/src/box/relay.h
+++ b/src/box/relay.h
@@ -42,6 +42,33 @@ struct replica;
 struct tt_uuid;
 struct vclock;
 
+enum relay_state {
+	/**
+	 * Applier has not connected to the master or not expected.
+	 */
+	RELAY_OFF,
+	/**
+	 * Applier has connected to the master.
+	 */
+	RELAY_FOLLOW,
+	/**
+	 * Applier disconnected from the master.
+	 */
+	RELAY_STOPPED,
+};
+
+struct relay*
+relay_new(void);
+
+void
+relay_destroy(struct relay *relay);
+
+enum relay_state
+relay_get_state(const struct relay *relay);
+
+void
+relay_set_state(struct relay *relay, enum relay_state state);
+
 /**
  * Returns relay's vclock
  * @param relay relay
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 9aac1f077..53c2b9351 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -39,6 +39,7 @@
 #include "box.h"
 #include "gc.h"
 #include "error.h"
+#include "relay.h"
 #include "vclock.h" /* VCLOCK_MAX */
 
 uint32_t instance_id = REPLICA_ID_NIL;
@@ -81,8 +82,6 @@ void
 replication_init(void)
 {
 	memset(&replicaset, 0, sizeof(replicaset));
-	mempool_create(&replicaset.pool, &cord()->slabc,
-		       sizeof(struct replica));
 	replica_hash_new(&replicaset.hash);
 	rlist_create(&replicaset.anon);
 	vclock_create(&replicaset.vclock);
@@ -92,7 +91,6 @@ replication_init(void)
 void
 replication_free(void)
 {
-	mempool_destroy(&replicaset.pool);
 	fiber_cond_destroy(&replicaset.applier.cond);
 }
 
@@ -114,8 +112,9 @@ replica_check_id(uint32_t replica_id)
 static bool
 replica_is_orphan(struct replica *replica)
 {
+	assert(replica->relay != NULL);
 	return replica->id == REPLICA_ID_NIL && replica->applier == NULL &&
-	       replica->relay == NULL;
+	       relay_get_state(replica->relay) != RELAY_FOLLOW;
 }
 
 static void
@@ -125,14 +124,14 @@ static struct replica *
 replica_new(void)
 {
 	struct replica *replica = (struct replica *)
-			mempool_alloc(&replicaset.pool);
+			malloc(sizeof(struct replica));
 	if (replica == NULL)
 		tnt_raise(OutOfMemory, sizeof(*replica), "malloc",
 			  "struct replica");
 	replica->id = 0;
 	replica->uuid = uuid_nil;
 	replica->applier = NULL;
-	replica->relay = NULL;
+	replica->relay = relay_new();
 	replica->gc = NULL;
 	rlist_create(&replica->in_anon);
 	trigger_create(&replica->on_applier_state,
@@ -145,9 +144,14 @@ static void
 replica_delete(struct replica *replica)
 {
 	assert(replica_is_orphan(replica));
+	if (replica->relay != NULL) {
+		if (relay_get_state(replica->relay) != RELAY_OFF)
+			relay_destroy(replica->relay);
+		free(replica->relay);
+	}
 	if (replica->gc != NULL)
 		gc_consumer_unregister(replica->gc);
-	mempool_free(&replicaset.pool, replica);
+	free(replica);
 }
 
 struct replica *
@@ -656,19 +660,10 @@ replicaset_check_quorum(void)
 		box_clear_orphan();
 }
 
-void
-replica_set_relay(struct replica *replica, struct relay *relay)
-{
-	assert(replica->id != REPLICA_ID_NIL);
-	assert(replica->relay == NULL);
-	replica->relay = relay;
-}
-
 void
 replica_clear_relay(struct replica *replica)
 {
-	assert(replica->relay != NULL);
-	replica->relay = NULL;
+	relay_set_state(replica->relay, RELAY_STOPPED);
 	if (replica_is_orphan(replica)) {
 		replica_hash_remove(&replicaset.hash, replica);
 		replica_delete(replica);
diff --git a/src/box/replication.h b/src/box/replication.h
index e2bdd814f..2f0a2d7a7 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -37,7 +37,6 @@
 #include <small/rb.h> /* replicaset_t */
 #include <small/rlist.h>
 #include "applier.h"
-#include <small/mempool.h>
 #include "fiber_cond.h"
 #include "vclock.h"
 
@@ -173,8 +172,6 @@ typedef rb_tree(struct replica) replica_hash_t;
  * relays, usually connected in full mesh.
  */
 struct replicaset {
-	/** Memory pool for struct replica allocations. */
-	struct mempool pool;
 	/** Hash of replicas indexed by UUID. */
 	replica_hash_t hash;
 	/**
@@ -301,14 +298,6 @@ replica_set_id(struct replica *replica, uint32_t id);
 void
 replica_clear_id(struct replica *replica);
 
-/**
- * Register \a relay of a \a replica.
- * \pre a replica can have only one relay
- * \pre replica->id != REPLICA_ID_NIL
- */
-void
-replica_set_relay(struct replica *replica, struct relay *relay);
-
 /**
  * Unregister \a relay from the \a replica.
  */
-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tarantool-patches] [PATCH 3/3] replication: display downstream status at upstream
  2018-05-16 11:32 [tarantool-patches] [PATCH 0/3] replication: improve logging Konstantin Belyavskiy
  2018-05-16 11:32 ` [tarantool-patches] [PATCH 1/3] replication: use applier_state to check quorum Konstantin Belyavskiy
  2018-05-16 11:32 ` [tarantool-patches] [PATCH 2/3] replication: do not delete relay on applier disconnect Konstantin Belyavskiy
@ 2018-05-16 11:32 ` Konstantin Belyavskiy
  2 siblings, 0 replies; 4+ messages in thread
From: Konstantin Belyavskiy @ 2018-05-16 11:32 UTC (permalink / raw)
  To: georgy, kostja; +Cc: tarantool-patches

This fix improves the 'box.info.replication' output.
If a downstream fails and thus disconnects from its upstream, improve
logging by showing 'status: stopped' and the error message on
both sides (master and replica).

Closes #3365
---
 src/box/lua/info.c                                 |  17 +++
 src/box/relay.cc                                   |   8 ++
 src/box/relay.h                                    |   3 +
 test/replication/show_error_on_disconnect.result   | 120 +++++++++++++++++++++
 test/replication/show_error_on_disconnect.test.lua |  38 +++++++
 5 files changed, 186 insertions(+)
 create mode 100644 test/replication/show_error_on_disconnect.result
 create mode 100644 test/replication/show_error_on_disconnect.test.lua

diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 9dbc3f92c..8f358d04e 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -148,6 +148,23 @@ lbox_pushreplica(lua_State *L, struct replica *replica)
 	if (relay_get_state(replica->relay) == RELAY_FOLLOW) {
 		lua_pushstring(L, "downstream");
 		lbox_pushrelay(L, relay);
+		lua_settable(L, -3);
+	} else if (relay_get_state(replica->relay) == RELAY_STOPPED) {
+		lua_pushstring(L, "downstream");
+
+		lua_newtable(L);
+		lua_pushstring(L, "status");
+		lua_pushstring(L, "stopped");
+		lua_settable(L, -3);
+
+		assert(replica->relay);
+		struct error *e = diag_last_error(relay_get_diag(replica->relay));
+		if (e != NULL) {
+			lua_pushstring(L, "message");
+			lua_pushstring(L, e->errmsg);
+			lua_settable(L, -3);
+		}
+
 		lua_settable(L, -3);
 	}
 }
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 49835bcb2..92dcd68ba 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -142,6 +142,12 @@ struct relay {
 	} tx;
 };
 
+struct diag*
+relay_get_diag(struct relay *relay)
+{
+	return &relay->diag;
+}
+
 enum relay_state
 relay_get_state(const struct relay *relay)
 {
@@ -536,6 +542,8 @@ relay_subscribe_f(va_list ap)
 	if (!diag_is_empty(&relay->diag)) {
 		/* An error has occurred while reading ACKs of xlog. */
 		diag_move(&relay->diag, diag_get());
+		/* Reference the diag in the status. */
+		diag_add_error(&relay->diag, diag_last_error(diag_get()));
 	}
 	struct errinj *inj = errinj(ERRINJ_RELAY_EXIT_DELAY, ERRINJ_DOUBLE);
 	if (inj != NULL && inj->dparam > 0)
diff --git a/src/box/relay.h b/src/box/relay.h
index e20d4cd13..3fd83bc53 100644
--- a/src/box/relay.h
+++ b/src/box/relay.h
@@ -63,6 +63,9 @@ relay_new(void);
 void
 relay_destroy(struct relay *relay);
 
+struct diag*
+relay_get_diag(struct relay *relay);
+
 enum relay_state
 relay_get_state(const struct relay *relay);
 
diff --git a/test/replication/show_error_on_disconnect.result b/test/replication/show_error_on_disconnect.result
new file mode 100644
index 000000000..c5a91c004
--- /dev/null
+++ b/test/replication/show_error_on_disconnect.result
@@ -0,0 +1,120 @@
+--
+-- gh-3365: display an error in upstream on downstream failure.
+-- Create a gap in LSN to cause replica's failure.
+-- The goal here is to see same error message on both side.
+--
+test_run = require('test_run').new()
+---
+...
+SERVERS = {'master_quorum1', 'master_quorum2'}
+---
+...
+-- Deploy a cluster.
+test_run:create_cluster(SERVERS)
+---
+...
+test_run:wait_fullmesh(SERVERS)
+---
+...
+test_run:cmd("switch master_quorum1")
+---
+- true
+...
+repl = box.cfg.replication
+---
+...
+box.cfg{replication = ""}
+---
+...
+test_run:cmd("switch master_quorum2")
+---
+- true
+...
+box.space.test:insert{1}
+---
+- [1]
+...
+box.snapshot()
+---
+- ok
+...
+box.space.test:insert{2}
+---
+- [2]
+...
+box.snapshot()
+---
+- ok
+...
+test_run:cmd("switch default")
+---
+- true
+...
+fio = require('fio')
+---
+...
+fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5)))
+---
+- true
+...
+test_run:cmd("switch master_quorum1")
+---
+- true
+...
+box.cfg{replication = repl}
+---
+...
+require('fiber').sleep(0.1)
+---
+...
+box.space.test:select()
+---
+- []
+...
+other_id = box.info.id % 2 + 1
+---
+...
+box.info.replication[other_id].upstream.status
+---
+- stopped
+...
+box.info.replication[other_id].upstream.message:match("Missing")
+---
+- Missing
+...
+test_run:cmd("switch master_quorum2")
+---
+- true
+...
+box.space.test:select()
+---
+- - [1]
+  - [2]
+...
+other_id = box.info.id % 2 + 1
+---
+...
+box.info.replication[other_id].upstream.status
+---
+- follow
+...
+box.info.replication[other_id].upstream.message
+---
+- null
+...
+box.info.replication[other_id].downstream.status
+---
+- stopped
+...
+box.info.replication[other_id].downstream.message:match("Missing")
+---
+- Missing
+...
+test_run:cmd("switch default")
+---
+- true
+...
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
+---
+...
diff --git a/test/replication/show_error_on_disconnect.test.lua b/test/replication/show_error_on_disconnect.test.lua
new file mode 100644
index 000000000..64a750256
--- /dev/null
+++ b/test/replication/show_error_on_disconnect.test.lua
@@ -0,0 +1,38 @@
+--
+-- gh-3365: display an error in upstream on downstream failure.
+-- Create a gap in LSN to cause replica's failure.
+-- The goal here is to see same error message on both side.
+--
+test_run = require('test_run').new()
+SERVERS = {'master_quorum1', 'master_quorum2'}
+-- Deploy a cluster.
+test_run:create_cluster(SERVERS)
+test_run:wait_fullmesh(SERVERS)
+test_run:cmd("switch master_quorum1")
+repl = box.cfg.replication
+box.cfg{replication = ""}
+test_run:cmd("switch master_quorum2")
+box.space.test:insert{1}
+box.snapshot()
+box.space.test:insert{2}
+box.snapshot()
+test_run:cmd("switch default")
+fio = require('fio')
+fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5)))
+test_run:cmd("switch master_quorum1")
+box.cfg{replication = repl}
+require('fiber').sleep(0.1)
+box.space.test:select()
+other_id = box.info.id % 2 + 1
+box.info.replication[other_id].upstream.status
+box.info.replication[other_id].upstream.message:match("Missing")
+test_run:cmd("switch master_quorum2")
+box.space.test:select()
+other_id = box.info.id % 2 + 1
+box.info.replication[other_id].upstream.status
+box.info.replication[other_id].upstream.message
+box.info.replication[other_id].downstream.status
+box.info.replication[other_id].downstream.message:match("Missing")
+test_run:cmd("switch default")
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2018-05-16 11:32 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-16 11:32 [tarantool-patches] [PATCH 0/3] replication: improve logging Konstantin Belyavskiy
2018-05-16 11:32 ` [tarantool-patches] [PATCH 1/3] replication: use applier_state to check quorum Konstantin Belyavskiy
2018-05-16 11:32 ` [tarantool-patches] [PATCH 2/3] replication: do not delete relay on applier disconnect Konstantin Belyavskiy
2018-05-16 11:32 ` [tarantool-patches] [PATCH 3/3] replication: display downstream status at upstream Konstantin Belyavskiy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox