[Tarantool-patches] [PATCH v5] qsync: provide box.info.synchro interface for monitoring
    Cyrill Gorcunov 
    gorcunov at gmail.com
       
    Thu Apr  8 17:57:28 MSK 2021
    
    
  
In commit 14fa5fd82 (cfg: support symbolic evaluation of
replication_synchro_quorum) we implemented support of
symbolic evaluation of `replication_synchro_quorum` parameter
and there is no easy way to obtain it current run-time value,
ie evaluated number value.
Moreover we would like to fetch queue length on transaction
limbo for tests and extend this statistics in future. Thus
lets add them.
Closes #5191
Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
@TarantoolBot document
Title: Provide `box.info.synchro` interface
The `box.info.synchro` leaf provides information about details of
synchronous replication.
In particular `quorum` represent the current value of synchronous
replication quorum defined by `replication_synchro_quorum`
configuration parameter because it can be set as dynamic formula
such as `N/2+1` and the value depends on current number of replicas.
Since synchronous replication does not commit data immediately
but waits for its propagation to replicas the data sits in a queue
gathering `commit` responses from remote nodes. Current number of
entries waiting in the queue is shown via `queue.len` member.
A typical output is the following
``` Lua
tarantool> box.info.synchro
---
- queue:
    len: 0
  quorum: 1
...
```
The `len` member shows current number of entries in the queue.
And the `quorum` member shows an evaluated value of
`replication_synchro_quorum` parameter.
---
Guys, I pushed it into gorcunov/gh-5191-qsync-stat-5 because
the patch became bigger as I supposed.
 changelogs/unreleased/box-info-limbo.md       |  4 +
 src/box/lua/info.c                            | 22 ++++++
 src/box/txn_limbo.c                           |  5 +-
 src/box/txn_limbo.h                           |  4 +
 test/box/info.result                          |  1 +
 .../gh-5446-qsync-eval-quorum.result          | 74 ++++++++++++-------
 .../gh-5446-qsync-eval-quorum.test.lua        | 48 ++++++++----
 test/replication/qsync_basic.result           | 40 ++++++++--
 test/replication/qsync_basic.test.lua         | 21 +++++-
 9 files changed, 172 insertions(+), 47 deletions(-)
 create mode 100644 changelogs/unreleased/box-info-limbo.md
diff --git a/changelogs/unreleased/box-info-limbo.md b/changelogs/unreleased/box-info-limbo.md
new file mode 100644
index 000000000..0f75a911d
--- /dev/null
+++ b/changelogs/unreleased/box-info-limbo.md
@@ -0,0 +1,4 @@
+## feature/core
+
+* Provide information about state of synchronous replication via
+  `box.info.synchro` interface (gh-5191).
diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 8cd379756..0eb48b823 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -50,6 +50,7 @@
 #include "version.h"
 #include "box/box.h"
 #include "box/raft.h"
+#include "box/txn_limbo.h"
 #include "lua/utils.h"
 #include "fiber.h"
 #include "sio.h"
@@ -599,6 +600,26 @@ lbox_info_election(struct lua_State *L)
 	return 1;
 }
 
+static int
+lbox_info_synchro(struct lua_State *L)
+{
+	lua_createtable(L, 0, 2);
+
+	/* Quorum value may be evaluated via formula */
+	lua_pushinteger(L, replication_synchro_quorum);
+	lua_setfield(L, -2, "quorum");
+
+	/* Queue information. */
+	struct txn_limbo *queue = &txn_limbo;
+	lua_createtable(L, 0, 1);
+	lua_pushnumber(L, queue->len);
+	lua_setfield(L, -2, "len");
+	lua_setfield(L, -2, "queue");
+
+	return 1;
+}
+
+
 static const struct luaL_Reg lbox_info_dynamic_meta[] = {
 	{"id", lbox_info_id},
 	{"uuid", lbox_info_uuid},
@@ -618,6 +639,7 @@ static const struct luaL_Reg lbox_info_dynamic_meta[] = {
 	{"sql", lbox_info_sql},
 	{"listen", lbox_info_listen},
 	{"election", lbox_info_election},
+	{"synchro", lbox_info_synchro},
 	{NULL, NULL}
 };
 
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index cf0ad9350..a22e0861a 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -41,6 +41,7 @@ static inline void
 txn_limbo_create(struct txn_limbo *limbo)
 {
 	rlist_create(&limbo->queue);
+	limbo->len = 0;
 	limbo->owner_id = REPLICA_ID_NIL;
 	fiber_cond_create(&limbo->wait_cond);
 	vclock_create(&limbo->vclock);
@@ -118,6 +119,7 @@ txn_limbo_append(struct txn_limbo *limbo, uint32_t id, struct txn *txn)
 	e->is_commit = false;
 	e->is_rollback = false;
 	rlist_add_tail_entry(&limbo->queue, e, in_queue);
+	limbo->len++;
 	/*
 	 * We added new entries from a remote instance to an empty limbo.
 	 * Time to make this instance read-only.
@@ -132,8 +134,8 @@ txn_limbo_remove(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
 {
 	assert(!rlist_empty(&entry->in_queue));
 	assert(txn_limbo_first_entry(limbo) == entry);
-	(void) limbo;
 	rlist_del_entry(entry, in_queue);
+	limbo->len--;
 }
 
 static inline void
@@ -144,6 +146,7 @@ txn_limbo_pop(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
 	assert(entry->is_rollback);
 
 	rlist_del_entry(entry, in_queue);
+	limbo->len--;
 	++limbo->rollback_count;
 }
 
diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
index af0addf8d..f2a98c8bb 100644
--- a/src/box/txn_limbo.h
+++ b/src/box/txn_limbo.h
@@ -94,6 +94,10 @@ struct txn_limbo {
 	 * them LSNs in the same order.
 	 */
 	struct rlist queue;
+	/**
+	 * Number of entries in limbo queue.
+	 */
+	int64_t len;
 	/**
 	 * Instance ID of the owner of all the transactions in the
 	 * queue. Strictly speaking, nothing prevents to store not
diff --git a/test/box/info.result b/test/box/info.result
index c8037818b..3ee653773 100644
--- a/test/box/info.result
+++ b/test/box/info.result
@@ -89,6 +89,7 @@ t
   - signature
   - sql
   - status
+  - synchro
   - uptime
   - uuid
   - vclock
diff --git a/test/replication/gh-5446-qsync-eval-quorum.result b/test/replication/gh-5446-qsync-eval-quorum.result
index 1d13f26db..5f83b248c 100644
--- a/test/replication/gh-5446-qsync-eval-quorum.result
+++ b/test/replication/gh-5446-qsync-eval-quorum.result
@@ -72,6 +72,15 @@ cfg_set_pass_tmo()
  | ---
  | ...
 
+-- gh-5191: we may validate the evaluated number,
+-- we take a canonical formula here.
+function assert_quorum_value(nr_replicas)       \
+    local v = math.floor(nr_replicas / 2) + 1   \
+    assert(box.info.synchro.quorum == v)        \
+end
+ | ---
+ | ...
+
 -- Create a sync space we will operate on
 s = box.schema.space.create('sync', {is_sync = true, engine = engine})
  | ---
@@ -96,6 +105,9 @@ test_run:cmd('start server replica1 with wait=True, wait_load=True')
  | ---
  | - true
  | ...
+assert_quorum_value(2)
+ | ---
+ | ...
 s:insert{2} -- should pass
  | ---
  | - [2]
@@ -133,6 +145,9 @@ test_run:cmd('start server replica2 with wait=True, wait_load=True')
  | ---
  | - true
  | ...
+assert_quorum_value(3)
+ | ---
+ | ...
 
 test_run:cmd('create server replica3 with rpl_master=default,\
               script="replication/replica-quorum-3.lua"')
@@ -143,6 +158,9 @@ test_run:cmd('start server replica3 with wait=True, wait_load=True')
  | ---
  | - true
  | ...
+assert_quorum_value(4)
+ | ---
+ | ...
 
 test_run:cmd('create server replica4 with rpl_master=default,\
               script="replication/replica-quorum-4.lua"')
@@ -153,6 +171,9 @@ test_run:cmd('start server replica4 with wait=True, wait_load=True')
  | ---
  | - true
  | ...
+assert_quorum_value(5)
+ | ---
+ | ...
 
 test_run:cmd('create server replica5 with rpl_master=default,\
               script="replication/replica-quorum-5.lua"')
@@ -163,6 +184,9 @@ test_run:cmd('start server replica5 with wait=True, wait_load=True')
  | ---
  | - true
  | ...
+assert_quorum_value(6)
+ | ---
+ | ...
 
 test_run:cmd('create server replica6 with rpl_master=default,\
               script="replication/replica-quorum-6.lua"')
@@ -173,6 +197,9 @@ test_run:cmd('start server replica6 with wait=True, wait_load=True')
  | ---
  | - true
  | ...
+assert_quorum_value(7)
+ | ---
+ | ...
 
 -- All replicas are up and running
 s:insert{4} -- should pass
@@ -254,55 +281,52 @@ s:insert{11} -- should pass
  | - [11]
  | ...
 
--- cleanup
+-- Cleanup and test formula evaluation same time,
+-- which requires _cluster modification though.
+function delete_replica(name)                                   \
+    local id = test_run:eval(name, 'return box.info.id')[1]     \
+    test_run:cmd('stop server ' .. name)                        \
+    test_run:cmd('delete server ' .. name)                      \
+    box.space._cluster:delete(id)                               \
+end
+ | ---
+ | ...
 
-test_run:cmd('stop server replica1')
+delete_replica('replica1')
  | ---
- | - true
  | ...
-test_run:cmd('delete server replica1')
+assert_quorum_value(6)
  | ---
- | - true
  | ...
-test_run:cmd('stop server replica2')
+delete_replica('replica2')
  | ---
- | - true
  | ...
-test_run:cmd('delete server replica2')
+assert_quorum_value(5)
  | ---
- | - true
  | ...
-test_run:cmd('stop server replica3')
+delete_replica('replica3')
  | ---
- | - true
  | ...
-test_run:cmd('delete server replica3')
+assert_quorum_value(4)
  | ---
- | - true
  | ...
-test_run:cmd('stop server replica4')
+delete_replica('replica4')
  | ---
- | - true
  | ...
-test_run:cmd('delete server replica4')
+assert_quorum_value(3)
  | ---
- | - true
  | ...
-test_run:cmd('stop server replica5')
+delete_replica('replica5')
  | ---
- | - true
  | ...
-test_run:cmd('delete server replica5')
+assert_quorum_value(2)
  | ---
- | - true
  | ...
-test_run:cmd('stop server replica6')
+delete_replica('replica6')
  | ---
- | - true
  | ...
-test_run:cmd('delete server replica6')
+assert_quorum_value(1)
  | ---
- | - true
  | ...
 
 s:drop()
diff --git a/test/replication/gh-5446-qsync-eval-quorum.test.lua b/test/replication/gh-5446-qsync-eval-quorum.test.lua
index 62d87ddcb..6b9e324ed 100644
--- a/test/replication/gh-5446-qsync-eval-quorum.test.lua
+++ b/test/replication/gh-5446-qsync-eval-quorum.test.lua
@@ -27,6 +27,13 @@ function cfg_set_fail_tmo() box.cfg{replication_synchro_timeout = 0.5} end
 box.cfg{replication_synchro_quorum = "N/2+1"}
 cfg_set_pass_tmo()
 
+-- gh-5191: we may validate the evaluated number,
+-- we take a canonical formula here.
+function assert_quorum_value(nr_replicas)       \
+    local v = math.floor(nr_replicas / 2) + 1   \
+    assert(box.info.synchro.quorum == v)        \
+end
+
 -- Create a sync space we will operate on
 s = box.schema.space.create('sync', {is_sync = true, engine = engine})
 _ = s:create_index('pk')
@@ -38,6 +45,7 @@ s:insert{1} -- should pass
 test_run:cmd('create server replica1 with rpl_master=default,\
               script="replication/replica-quorum-1.lua"')
 test_run:cmd('start server replica1 with wait=True, wait_load=True')
+assert_quorum_value(2)
 s:insert{2} -- should pass
 cfg_set_fail_tmo()
 test_run:cmd('stop server replica1')
@@ -50,22 +58,27 @@ s:insert{3} -- should pass
 test_run:cmd('create server replica2 with rpl_master=default,\
               script="replication/replica-quorum-2.lua"')
 test_run:cmd('start server replica2 with wait=True, wait_load=True')
+assert_quorum_value(3)
 
 test_run:cmd('create server replica3 with rpl_master=default,\
               script="replication/replica-quorum-3.lua"')
 test_run:cmd('start server replica3 with wait=True, wait_load=True')
+assert_quorum_value(4)
 
 test_run:cmd('create server replica4 with rpl_master=default,\
               script="replication/replica-quorum-4.lua"')
 test_run:cmd('start server replica4 with wait=True, wait_load=True')
+assert_quorum_value(5)
 
 test_run:cmd('create server replica5 with rpl_master=default,\
               script="replication/replica-quorum-5.lua"')
 test_run:cmd('start server replica5 with wait=True, wait_load=True')
+assert_quorum_value(6)
 
 test_run:cmd('create server replica6 with rpl_master=default,\
               script="replication/replica-quorum-6.lua"')
 test_run:cmd('start server replica6 with wait=True, wait_load=True')
+assert_quorum_value(7)
 
 -- All replicas are up and running
 s:insert{4} -- should pass
@@ -92,20 +105,27 @@ s:insert{10} -- should pass
 test_run:cmd('start server replica1 with wait=True, wait_load=True')
 s:insert{11} -- should pass
 
--- cleanup
-
-test_run:cmd('stop server replica1')
-test_run:cmd('delete server replica1')
-test_run:cmd('stop server replica2')
-test_run:cmd('delete server replica2')
-test_run:cmd('stop server replica3')
-test_run:cmd('delete server replica3')
-test_run:cmd('stop server replica4')
-test_run:cmd('delete server replica4')
-test_run:cmd('stop server replica5')
-test_run:cmd('delete server replica5')
-test_run:cmd('stop server replica6')
-test_run:cmd('delete server replica6')
+-- Cleanup and test formula evaluation same time,
+-- which requires _cluster modification though.
+function delete_replica(name)                                   \
+    local id = test_run:eval(name, 'return box.info.id')[1]     \
+    test_run:cmd('stop server ' .. name)                        \
+    test_run:cmd('delete server ' .. name)                      \
+    box.space._cluster:delete(id)                               \
+end
+
+delete_replica('replica1')
+assert_quorum_value(6)
+delete_replica('replica2')
+assert_quorum_value(5)
+delete_replica('replica3')
+assert_quorum_value(4)
+delete_replica('replica4')
+assert_quorum_value(3)
+delete_replica('replica5')
+assert_quorum_value(2)
+delete_replica('replica6')
+assert_quorum_value(1)
 
 s:drop()
 
diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result
index bd3c3cce1..3457d2cc9 100644
--- a/test/replication/qsync_basic.result
+++ b/test/replication/qsync_basic.result
@@ -637,12 +637,46 @@ box.space.sync:count()
  | - 0
  | ...
 
--- Cleanup.
+--
+-- gh-5191: test box.info.synchro interface. For
+-- this sake we stop the replica and initiate data
+-- write in sync space which won't pass due to timeout.
+-- While we're sitting in a wait cycle the queue should
+-- not be empty.
+--
+-- Make sure this test is the *LAST* one since we stop
+-- the replica node and never restart it back before the
+-- cleanup procedure, also we're spinning on default node
+-- and do not switch to other nodes.
+--
 test_run:cmd('switch default')
  | ---
  | - true
  | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+assert(box.info.synchro.queue.len == 0)
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_timeout = 2}
+ | ---
+ | ...
+f = fiber.new(function() box.space.sync:insert{1024} end)
+ | ---
+ | ...
+test_run:wait_cond(function() return box.info.synchro.queue.len == 1 end)
+ | ---
+ | - true
+ | ...
+test_run:wait_cond(function() return box.info.synchro.queue.len == 0 end)
+ | ---
+ | - true
+ | ...
 
+-- Cleanup
 box.cfg{                                                                        \
     replication_synchro_quorum = old_synchro_quorum,                            \
     replication_synchro_timeout = old_synchro_timeout,                          \
@@ -650,10 +684,6 @@ box.cfg{
 }
  | ---
  | ...
-test_run:cmd('stop server replica')
- | ---
- | - true
- | ...
 test_run:cmd('delete server replica')
  | ---
  | - true
diff --git a/test/replication/qsync_basic.test.lua b/test/replication/qsync_basic.test.lua
index 94235547d..a604d80ee 100644
--- a/test/replication/qsync_basic.test.lua
+++ b/test/replication/qsync_basic.test.lua
@@ -248,15 +248,32 @@ for i = 1, 100 do box.space.sync:delete{i} end
 test_run:cmd('switch replica')
 box.space.sync:count()
 
--- Cleanup.
+--
+-- gh-5191: test box.info.synchro interface. For
+-- this sake we stop the replica and initiate data
+-- write in sync space which won't pass due to timeout.
+-- While we're sitting in a wait cycle the queue should
+-- not be empty.
+--
+-- Make sure this test is the *LAST* one since we stop
+-- the replica node and never restart it back before the
+-- cleanup procedure, also we're spinning on default node
+-- and do not switch to other nodes.
+--
 test_run:cmd('switch default')
+test_run:cmd('stop server replica')
+assert(box.info.synchro.queue.len == 0)
+box.cfg{replication_synchro_timeout = 2}
+f = fiber.new(function() box.space.sync:insert{1024} end)
+test_run:wait_cond(function() return box.info.synchro.queue.len == 1 end)
+test_run:wait_cond(function() return box.info.synchro.queue.len == 0 end)
 
+-- Cleanup
 box.cfg{                                                                        \
     replication_synchro_quorum = old_synchro_quorum,                            \
     replication_synchro_timeout = old_synchro_timeout,                          \
     replication_timeout = old_timeout,                                          \
 }
-test_run:cmd('stop server replica')
 test_run:cmd('delete server replica')
 box.space.test:drop()
 box.space.sync:drop()
-- 
2.30.2
    
    
More information about the Tarantool-patches
mailing list