[tarantool-patches] [PATCH] replication: display correct status at upstream
Konstantin Belyavskiy
k.belyavskiy at tarantool.org
Fri Apr 27 13:39:38 MSK 2018
This fix improves 'box.info.replication' output.
If a downstream replica fails and thus disconnects from the master,
improve the output by printing 'status: disconnected' for that replica.
Add relay_state { NONE, CONNECTED, DISCONNECTED } to track replica
presence; once connected, it is either CONNECTED or DISCONNECTED until
the master is reset.
Closes #3365
---
Ticket: https://github.com/tarantool/tarantool/issues/3365
Branch: https://github.com/tarantool/tarantool/compare/gh-3365-display-an-error-at-downstream-on-replica-failure-or-disconnect
src/box/lua/info.c | 7 ++
src/box/replication.cc | 2 +
src/box/replication.h | 17 +++++
test/replication/recovery_quorum.result | 115 ++++++++++++++++++++++++++++++
test/replication/recovery_quorum.test.lua | 36 ++++++++++
5 files changed, 177 insertions(+)
create mode 100644 test/replication/recovery_quorum.result
create mode 100644 test/replication/recovery_quorum.test.lua
diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 8e8fd9d97..25bce8565 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -149,6 +149,13 @@ lbox_pushreplica(lua_State *L, struct replica *replica)
lua_pushstring(L, "downstream");
lbox_pushrelay(L, relay);
lua_settable(L, -3);
+ } else if (replica->relay_state == RELAY_DISCONNECTED) {
+ lua_pushstring(L, "downstream");
+ lua_newtable(L);
+ lua_pushstring(L, "status");
+ lua_pushstring(L, "disconnected");
+ lua_settable(L, -3);
+ lua_settable(L, -3);
}
}
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 0b770c913..3ae6e739b 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -661,6 +661,7 @@ replica_set_relay(struct replica *replica, struct relay *relay)
{
assert(replica->id != REPLICA_ID_NIL);
assert(replica->relay == NULL);
+ replica->relay_state = RELAY_CONNECTED;
replica->relay = relay;
}
@@ -669,6 +670,7 @@ replica_clear_relay(struct replica *replica)
{
assert(replica->relay != NULL);
replica->relay = NULL;
+ replica->relay_state = RELAY_DISCONNECTED;
if (replica_is_orphan(replica)) {
replica_hash_remove(&replicaset.hash, replica);
replica_delete(replica);
diff --git a/src/box/replication.h b/src/box/replication.h
index 8a9d57543..e1d4aab6d 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -226,6 +226,21 @@ enum replica_state {
REPLICA_SYNCED,
};
+enum relay_state {
+ /**
+ * Relay to this replica has not been started or is not expected.
+ */
+ RELAY_NONE,
+ /**
+ * Relay to this replica has been started (replica is attached).
+ */
+ RELAY_CONNECTED,
+ /**
+ * Relay has been stopped (replica detached from the master).
+ */
+ RELAY_DISCONNECTED,
+};
+
/**
* Summary information about a replica in the replica set.
*/
@@ -256,6 +271,8 @@ struct replica {
struct trigger on_applier_state;
/** Replica sync state. */
enum replica_state state;
+ /** Relay sync state. */
+ enum relay_state relay_state;
};
enum {
diff --git a/test/replication/recovery_quorum.result b/test/replication/recovery_quorum.result
new file mode 100644
index 000000000..fef4df9de
--- /dev/null
+++ b/test/replication/recovery_quorum.result
@@ -0,0 +1,115 @@
+--
+-- gh-3365: display an error in upstream on downstream failure.
+-- Create a gap in LSN to cause replica's failure.
+--
+test_run = require('test_run').new()
+---
+...
+SERVERS = {'master_quorum1', 'master_quorum2'}
+---
+...
+-- Deploy a cluster.
+test_run:create_cluster(SERVERS)
+---
+...
+test_run:wait_fullmesh(SERVERS)
+---
+...
+test_run:cmd("switch master_quorum1")
+---
+- true
+...
+repl = box.cfg.replication
+---
+...
+box.cfg{replication = ""}
+---
+...
+test_run:cmd("switch master_quorum2")
+---
+- true
+...
+box.space.test:insert{1}
+---
+- [1]
+...
+box.snapshot()
+---
+- ok
+...
+box.space.test:insert{2}
+---
+- [2]
+...
+box.snapshot()
+---
+- ok
+...
+test_run:cmd("switch default")
+---
+- true
+...
+fio = require('fio')
+---
+...
+fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5)))
+---
+- true
+...
+test_run:cmd("switch master_quorum1")
+---
+- true
+...
+box.cfg{replication = repl}
+---
+...
+require('fiber').sleep(0.1)
+---
+...
+box.space.test:select()
+---
+- []
+...
+other_id = box.info.id % 2 + 1
+---
+...
+box.info.replication[other_id].upstream.status
+---
+- stopped
+...
+box.info.replication[other_id].upstream.message:match("Missing")
+---
+- Missing
+...
+test_run:cmd("switch master_quorum2")
+---
+- true
+...
+box.space.test:select()
+---
+- - [1]
+ - [2]
+...
+other_id = box.info.id % 2 + 1
+---
+...
+box.info.replication[other_id].upstream.status
+---
+- follow
+...
+box.info.replication[other_id].upstream.message
+---
+- null
+...
+box.info.replication[other_id].downstream
+---
+- status: disconnected
+...
+test_run:cmd("switch default")
+---
+- true
+...
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
+---
+...
diff --git a/test/replication/recovery_quorum.test.lua b/test/replication/recovery_quorum.test.lua
new file mode 100644
index 000000000..f91ca1ca2
--- /dev/null
+++ b/test/replication/recovery_quorum.test.lua
@@ -0,0 +1,36 @@
+--
+-- gh-3365: display an error in upstream on downstream failure.
+-- Create a gap in LSN to cause replica's failure.
+--
+test_run = require('test_run').new()
+SERVERS = {'master_quorum1', 'master_quorum2'}
+-- Deploy a cluster.
+test_run:create_cluster(SERVERS)
+test_run:wait_fullmesh(SERVERS)
+test_run:cmd("switch master_quorum1")
+repl = box.cfg.replication
+box.cfg{replication = ""}
+test_run:cmd("switch master_quorum2")
+box.space.test:insert{1}
+box.snapshot()
+box.space.test:insert{2}
+box.snapshot()
+test_run:cmd("switch default")
+fio = require('fio')
+fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5)))
+test_run:cmd("switch master_quorum1")
+box.cfg{replication = repl}
+require('fiber').sleep(0.1)
+box.space.test:select()
+other_id = box.info.id % 2 + 1
+box.info.replication[other_id].upstream.status
+box.info.replication[other_id].upstream.message:match("Missing")
+test_run:cmd("switch master_quorum2")
+box.space.test:select()
+other_id = box.info.id % 2 + 1
+box.info.replication[other_id].upstream.status
+box.info.replication[other_id].upstream.message
+box.info.replication[other_id].downstream
+test_run:cmd("switch default")
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
--
2.14.3 (Apple Git-98)
More information about the Tarantool-patches
mailing list