[tarantool-patches] [PATCH] replication: display correct status at upstream

Konstantin Belyavskiy k.belyavskiy at tarantool.org
Fri Apr 27 13:39:38 MSK 2018


This fix improves 'box.info.replication' output.
If a downstream fails and thus disconnects from the upstream, report
it by showing 'status: disconnected' in the downstream section.
Add relay_state { NONE, CONNECTED, DISCONNECTED } to track replica
presence: once a replica has connected, its state is either CONNECTED
or DISCONNECTED until the master is reset.

Closes #3365
---
Ticket: https://github.com/tarantool/tarantool/issues/3365
Branch: https://github.com/tarantool/tarantool/compare/gh-3365-display-an-error-at-downstream-on-replica-failure-or-disconnect

 src/box/lua/info.c                        |   7 ++
 src/box/replication.cc                    |   2 +
 src/box/replication.h                     |  17 +++++
 test/replication/recovery_quorum.result   | 115 ++++++++++++++++++++++++++++++
 test/replication/recovery_quorum.test.lua |  36 ++++++++++
 5 files changed, 177 insertions(+)
 create mode 100644 test/replication/recovery_quorum.result
 create mode 100644 test/replication/recovery_quorum.test.lua

diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 8e8fd9d97..25bce8565 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -149,6 +149,13 @@ lbox_pushreplica(lua_State *L, struct replica *replica)
 		lua_pushstring(L, "downstream");
 		lbox_pushrelay(L, relay);
 		lua_settable(L, -3);
+	} else if (replica->relay_state == RELAY_DISCONNECTED) {
+		lua_pushstring(L, "downstream");
+		lua_newtable(L);
+		lua_pushstring(L, "status");
+		lua_pushstring(L, "disconnected");
+		lua_settable(L, -3);
+		lua_settable(L, -3);
 	}
 }
 
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 0b770c913..3ae6e739b 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -661,6 +661,7 @@ replica_set_relay(struct replica *replica, struct relay *relay)
 {
 	assert(replica->id != REPLICA_ID_NIL);
 	assert(replica->relay == NULL);
+	replica->relay_state = RELAY_CONNECTED;
 	replica->relay = relay;
 }
 
@@ -669,6 +670,7 @@ replica_clear_relay(struct replica *replica)
 {
 	assert(replica->relay != NULL);
 	replica->relay = NULL;
+	replica->relay_state = RELAY_DISCONNECTED;
 	if (replica_is_orphan(replica)) {
 		replica_hash_remove(&replicaset.hash, replica);
 		replica_delete(replica);
diff --git a/src/box/replication.h b/src/box/replication.h
index 8a9d57543..e1d4aab6d 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -226,6 +226,21 @@ enum replica_state {
 	REPLICA_SYNCED,
 };
 
+enum relay_state {
+	/**
+	 * Presumably the initial state: no relay attached yet (TODO confirm).
+	 */
+	RELAY_NONE,
+	/**
+	 * A relay is attached to the replica; set by replica_set_relay().
+	 */
+	RELAY_CONNECTED,
+	/**
+	 * The relay was detached; set by replica_clear_relay().
+	 */
+	RELAY_DISCONNECTED,
+};
+
 /**
  * Summary information about a replica in the replica set.
  */
@@ -256,6 +271,8 @@ struct replica {
 	struct trigger on_applier_state;
 	/** Replica sync state. */
 	enum replica_state state;
+	/** Downstream relay connection state, see enum relay_state. */
+	enum relay_state relay_state;
 };
 
 enum {
diff --git a/test/replication/recovery_quorum.result b/test/replication/recovery_quorum.result
new file mode 100644
index 000000000..fef4df9de
--- /dev/null
+++ b/test/replication/recovery_quorum.result
@@ -0,0 +1,115 @@
+--
+-- gh-3365: display an error in upstream on downstream failure.
+-- Create a gap in LSN to cause replica's failure.
+--
+test_run = require('test_run').new()
+---
+...
+SERVERS = {'master_quorum1', 'master_quorum2'}
+---
+...
+-- Deploy a cluster.
+test_run:create_cluster(SERVERS)
+---
+...
+test_run:wait_fullmesh(SERVERS)
+---
+...
+test_run:cmd("switch master_quorum1")
+---
+- true
+...
+repl = box.cfg.replication
+---
+...
+box.cfg{replication = ""}
+---
+...
+test_run:cmd("switch master_quorum2")
+---
+- true
+...
+box.space.test:insert{1}
+---
+- [1]
+...
+box.snapshot()
+---
+- ok
+...
+box.space.test:insert{2}
+---
+- [2]
+...
+box.snapshot()
+---
+- ok
+...
+test_run:cmd("switch default")
+---
+- true
+...
+fio = require('fio')
+---
+...
+fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5)))
+---
+- true
+...
+test_run:cmd("switch master_quorum1")
+---
+- true
+...
+box.cfg{replication = repl}
+---
+...
+require('fiber').sleep(0.1)
+---
+...
+box.space.test:select()
+---
+- []
+...
+other_id = box.info.id % 2 + 1
+---
+...
+box.info.replication[other_id].upstream.status
+---
+- stopped
+...
+box.info.replication[other_id].upstream.message:match("Missing")
+---
+- Missing
+...
+test_run:cmd("switch master_quorum2")
+---
+- true
+...
+box.space.test:select()
+---
+- - [1]
+  - [2]
+...
+other_id = box.info.id % 2 + 1
+---
+...
+box.info.replication[other_id].upstream.status
+---
+- follow
+...
+box.info.replication[other_id].upstream.message
+---
+- null
+...
+box.info.replication[other_id].downstream
+---
+- status: disconnected
+...
+test_run:cmd("switch default")
+---
+- true
+...
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
+---
+...
diff --git a/test/replication/recovery_quorum.test.lua b/test/replication/recovery_quorum.test.lua
new file mode 100644
index 000000000..f91ca1ca2
--- /dev/null
+++ b/test/replication/recovery_quorum.test.lua
@@ -0,0 +1,36 @@
+--
+-- gh-3365: display an error in upstream on downstream failure.
+-- Create a gap in LSN to cause replica's failure.
+--
+test_run = require('test_run').new()
+SERVERS = {'master_quorum1', 'master_quorum2'}
+-- Deploy a cluster.
+test_run:create_cluster(SERVERS)
+test_run:wait_fullmesh(SERVERS)
+test_run:cmd("switch master_quorum1")
+repl = box.cfg.replication
+box.cfg{replication = ""}
+test_run:cmd("switch master_quorum2")
+box.space.test:insert{1}
+box.snapshot()
+box.space.test:insert{2}
+box.snapshot()
+test_run:cmd("switch default")
+fio = require('fio')
+fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5)))
+test_run:cmd("switch master_quorum1")
+box.cfg{replication = repl}
+require('fiber').sleep(0.1)
+box.space.test:select()
+other_id = box.info.id % 2 + 1
+box.info.replication[other_id].upstream.status
+box.info.replication[other_id].upstream.message:match("Missing")
+test_run:cmd("switch master_quorum2")
+box.space.test:select()
+other_id = box.info.id % 2 + 1
+box.info.replication[other_id].upstream.status
+box.info.replication[other_id].upstream.message
+box.info.replication[other_id].downstream
+test_run:cmd("switch default")
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
-- 
2.14.3 (Apple Git-98)





More information about the Tarantool-patches mailing list