[PATCH 2/2] replication: display downstream status at upstream

Konstantin Belyavskiy k.belyavskiy at tarantool.org
Mon May 21 20:07:18 MSK 2018


This fix improves the 'box.info.replication' output.
If a downstream fails and thus disconnects from its upstream, improve
diagnostics by showing 'status: stopped' and the error message on
both sides (master and replica).

Closes #3365
---
 src/box/lua/info.c                                 |  17 +++
 src/box/relay.cc                                   |   8 ++
 src/box/relay.h                                    |   4 +
 test/replication/show_error_on_disconnect.result   | 120 +++++++++++++++++++++
 test/replication/show_error_on_disconnect.test.lua |  38 +++++++
 5 files changed, 187 insertions(+)
 create mode 100644 test/replication/show_error_on_disconnect.result
 create mode 100644 test/replication/show_error_on_disconnect.test.lua

diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 9dbc3f92c..8f358d04e 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -148,6 +148,23 @@ lbox_pushreplica(lua_State *L, struct replica *replica)
 	if (relay_get_state(replica->relay) == RELAY_FOLLOW) {
 		lua_pushstring(L, "downstream");
 		lbox_pushrelay(L, relay);
+		lua_settable(L, -3);
+	} else if (relay_get_state(replica->relay) == RELAY_STOPPED) {
+		lua_pushstring(L, "downstream");
+
+		lua_newtable(L);
+		lua_pushstring(L, "status");
+		lua_pushstring(L, "stopped");
+		lua_settable(L, -3);
+
+		assert(replica->relay);
+		struct error *e = diag_last_error(relay_get_diag(replica->relay));
+		if (e != NULL) {
+			lua_pushstring(L, "message");
+			lua_pushstring(L, e->errmsg);
+			lua_settable(L, -3);
+		}
+
 		lua_settable(L, -3);
 	}
 }
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 6470946ae..083d34c51 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -140,6 +140,12 @@ struct relay {
 	} tx;
 };
 
+struct diag*
+relay_get_diag(struct relay *relay)
+{
+	return &relay->diag;
+}
+
 enum relay_state
 relay_get_state(const struct relay *relay)
 {
@@ -542,6 +548,8 @@ relay_subscribe_f(va_list ap)
 	if (!diag_is_empty(&relay->diag)) {
 		/* An error has occurred while reading ACKs of xlog. */
 		diag_move(&relay->diag, diag_get());
+		/* diag_move() emptied relay->diag; put the error back so the status can show it. */
+		diag_add_error(&relay->diag, diag_last_error(diag_get()));
 	}
 	struct errinj *inj = errinj(ERRINJ_RELAY_EXIT_DELAY, ERRINJ_DOUBLE);
 	if (inj != NULL && inj->dparam > 0)
diff --git a/src/box/relay.h b/src/box/relay.h
index f039cbef8..2988e6b0d 100644
--- a/src/box/relay.h
+++ b/src/box/relay.h
@@ -65,6 +65,10 @@ relay_new(struct replica *replica);
 void
 relay_delete(struct relay *relay);
 
+/** Return the relay's diagnostics area (holds its last error, if any). */
+struct diag*
+relay_get_diag(struct relay *relay);
+
 /** Return the current state of relay. */
 enum relay_state
 relay_get_state(const struct relay *relay);
diff --git a/test/replication/show_error_on_disconnect.result b/test/replication/show_error_on_disconnect.result
new file mode 100644
index 000000000..c5a91c004
--- /dev/null
+++ b/test/replication/show_error_on_disconnect.result
@@ -0,0 +1,120 @@
+--
+-- gh-3365: display an error in upstream on downstream failure.
+-- Create a gap in LSN to cause replica's failure.
+-- The goal here is to see same error message on both side.
+--
+test_run = require('test_run').new()
+---
+...
+SERVERS = {'master_quorum1', 'master_quorum2'}
+---
+...
+-- Deploy a cluster.
+test_run:create_cluster(SERVERS)
+---
+...
+test_run:wait_fullmesh(SERVERS)
+---
+...
+test_run:cmd("switch master_quorum1")
+---
+- true
+...
+repl = box.cfg.replication
+---
+...
+box.cfg{replication = ""}
+---
+...
+test_run:cmd("switch master_quorum2")
+---
+- true
+...
+box.space.test:insert{1}
+---
+- [1]
+...
+box.snapshot()
+---
+- ok
+...
+box.space.test:insert{2}
+---
+- [2]
+...
+box.snapshot()
+---
+- ok
+...
+test_run:cmd("switch default")
+---
+- true
+...
+fio = require('fio')
+---
+...
+fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5)))
+---
+- true
+...
+test_run:cmd("switch master_quorum1")
+---
+- true
+...
+box.cfg{replication = repl}
+---
+...
+require('fiber').sleep(0.1)
+---
+...
+box.space.test:select()
+---
+- []
+...
+other_id = box.info.id % 2 + 1
+---
+...
+box.info.replication[other_id].upstream.status
+---
+- stopped
+...
+box.info.replication[other_id].upstream.message:match("Missing")
+---
+- Missing
+...
+test_run:cmd("switch master_quorum2")
+---
+- true
+...
+box.space.test:select()
+---
+- - [1]
+  - [2]
+...
+other_id = box.info.id % 2 + 1
+---
+...
+box.info.replication[other_id].upstream.status
+---
+- follow
+...
+box.info.replication[other_id].upstream.message
+---
+- null
+...
+box.info.replication[other_id].downstream.status
+---
+- stopped
+...
+box.info.replication[other_id].downstream.message:match("Missing")
+---
+- Missing
+...
+test_run:cmd("switch default")
+---
+- true
+...
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
+---
+...
diff --git a/test/replication/show_error_on_disconnect.test.lua b/test/replication/show_error_on_disconnect.test.lua
new file mode 100644
index 000000000..64a750256
--- /dev/null
+++ b/test/replication/show_error_on_disconnect.test.lua
@@ -0,0 +1,38 @@
+--
+-- gh-3365: display an error in upstream on downstream failure.
+-- Create a gap in LSN to cause replica's failure.
+-- The goal here is to see same error message on both side.
+--
+test_run = require('test_run').new()
+SERVERS = {'master_quorum1', 'master_quorum2'}
+-- Deploy a cluster.
+test_run:create_cluster(SERVERS)
+test_run:wait_fullmesh(SERVERS)
+test_run:cmd("switch master_quorum1")
+repl = box.cfg.replication
+box.cfg{replication = ""}
+test_run:cmd("switch master_quorum2")
+box.space.test:insert{1}
+box.snapshot()
+box.space.test:insert{2}
+box.snapshot()
+test_run:cmd("switch default")
+fio = require('fio')
+fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5)))
+test_run:cmd("switch master_quorum1")
+box.cfg{replication = repl}
+require('fiber').sleep(0.1)
+box.space.test:select()
+other_id = box.info.id % 2 + 1
+box.info.replication[other_id].upstream.status
+box.info.replication[other_id].upstream.message:match("Missing")
+test_run:cmd("switch master_quorum2")
+box.space.test:select()
+other_id = box.info.id % 2 + 1
+box.info.replication[other_id].upstream.status
+box.info.replication[other_id].upstream.message
+box.info.replication[other_id].downstream.status
+box.info.replication[other_id].downstream.message:match("Missing")
+test_run:cmd("switch default")
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
-- 
2.14.3 (Apple Git-98)




More information about the Tarantool-patches mailing list