Tarantool development patches archive
 help / color / mirror / Atom feed
* [Tarantool-patches] [PATCH v2] qsync: provide box.info interface for monitoring
@ 2021-04-05 15:58 Cyrill Gorcunov via Tarantool-patches
  2021-04-05 20:34 ` [Tarantool-patches] [PATCH v3] " Cyrill Gorcunov via Tarantool-patches
  2021-04-07 20:53 ` [Tarantool-patches] [PATCH v2] " Vladislav Shpilevoy via Tarantool-patches
  0 siblings, 2 replies; 14+ messages in thread
From: Cyrill Gorcunov via Tarantool-patches @ 2021-04-05 15:58 UTC (permalink / raw)
  To: tml; +Cc: Vladislav Shpilevoy

Since commit 14fa5fd82 we support symbolic evaluation of
`replication_synchro_quorum` parameter and there is no easy
way to obtain it current run-time value, ie evaluated one.
Moreover we would like to fetch queue length on transaction
limbo for tests and extend this statistics in future.

Thus we introduce the "synchro" leaf in box.info interface.
For now only a few entries are printed out

 | tarantool> box.info.synchro
 | ---
 | - queue:
 |     1:
 |       len: 0
 |   quorum: 1
 | ...

The `queue` represents limbo instances (since we support only
one limbo for now the sole entry is printed) and `len` member
shows the number of entries in the queue.

The `quorum` member shows the evaluated value of
`replication_synchro_quorum` parameter.

Closes #5191

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>

@TarantoolBot document
Title: Provice `box.info.synchro` interface

The `box.info.synchro` leaf provides information about details of
synchronous replication.

In particular `quorum` represent the current value of synchronous
replication quorum defined by `replication_synchro_quorum`
configuration parameter since it can be set as dynamic formula
such as `N/2+1` and the value depends on current number
of replicas.

Since synchronous replication does not commit data immediately
but waits for its propagation to replicas such data sits in
a queue gathering `commit` responses from remote nodes. Current
number of entries sitting in the queue is shown by `queue.len`
member.

A typical output is the following

``` Lua
tarantool> box.info.synchro
---
- queue:
    1:
      len: 0
  quorum: 1
...
```

For now only one `queue` is supported so the output is filled with
one array entry. In future the multiple queues might be implemented.
---

v2:
 - reformat output
 - update test

issue https://github.com/tarantool/tarantool/issues/5191
branch gorcunov/gh-5191-qsync-stat-2

 src/box/lua/info.c                            | 43 +++++++++++++
 src/box/txn_limbo.c                           |  5 +-
 src/box/txn_limbo.h                           |  4 ++
 .../gh-5446-qsync-eval-quorum.result          | 64 +++++++++++++++----
 .../gh-5446-qsync-eval-quorum.test.lua        | 33 ++++++++--
 5 files changed, 129 insertions(+), 20 deletions(-)

diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 8cd379756..18b18cc90 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -50,6 +50,7 @@
 #include "version.h"
 #include "box/box.h"
 #include "box/raft.h"
+#include "box/txn_limbo.h"
 #include "lua/utils.h"
 #include "fiber.h"
 #include "sio.h"
@@ -599,6 +600,47 @@ lbox_info_election(struct lua_State *L)
 	return 1;
 }
 
+static void
+lbox_push_synchro_queue(struct lua_State *L)
+{
+	/*
+	 * For fancy formatting.
+	 */
+	lua_newtable(L);
+	lua_pushliteral(L, "mapping");
+	lua_setfield(L, -2, "__serialize");
+	lua_setmetatable(L, -2);
+
+	struct txn_limbo *queue = &txn_limbo;
+	const int queue_nr = 1;
+
+	lua_createtable(L, 0, 1);
+	lua_pushstring(L, "len");
+	lua_pushnumber(L, queue->len);
+	lua_settable(L, -3);
+	lua_rawseti(L, -2, queue_nr);
+}
+
+static int
+lbox_info_synchro(struct lua_State *L)
+{
+	lua_newtable(L);
+
+	/* Quorum value may be evaluated via formula */
+	lua_pushinteger(L, replication_synchro_quorum);
+	lua_setfield(L, -2, "quorum");
+
+	/*
+	 * Queue information.
+	 */
+	lua_newtable(L);
+	lbox_push_synchro_queue(L);
+	lua_setfield(L, -2, "queue");
+
+	return 1;
+}
+
+
 static const struct luaL_Reg lbox_info_dynamic_meta[] = {
 	{"id", lbox_info_id},
 	{"uuid", lbox_info_uuid},
@@ -618,6 +660,7 @@ static const struct luaL_Reg lbox_info_dynamic_meta[] = {
 	{"sql", lbox_info_sql},
 	{"listen", lbox_info_listen},
 	{"election", lbox_info_election},
+	{"synchro", lbox_info_synchro},
 	{NULL, NULL}
 };
 
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index cf0ad9350..a22e0861a 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -41,6 +41,7 @@ static inline void
 txn_limbo_create(struct txn_limbo *limbo)
 {
 	rlist_create(&limbo->queue);
+	limbo->len = 0;
 	limbo->owner_id = REPLICA_ID_NIL;
 	fiber_cond_create(&limbo->wait_cond);
 	vclock_create(&limbo->vclock);
@@ -118,6 +119,7 @@ txn_limbo_append(struct txn_limbo *limbo, uint32_t id, struct txn *txn)
 	e->is_commit = false;
 	e->is_rollback = false;
 	rlist_add_tail_entry(&limbo->queue, e, in_queue);
+	limbo->len++;
 	/*
 	 * We added new entries from a remote instance to an empty limbo.
 	 * Time to make this instance read-only.
@@ -132,8 +134,8 @@ txn_limbo_remove(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
 {
 	assert(!rlist_empty(&entry->in_queue));
 	assert(txn_limbo_first_entry(limbo) == entry);
-	(void) limbo;
 	rlist_del_entry(entry, in_queue);
+	limbo->len--;
 }
 
 static inline void
@@ -144,6 +146,7 @@ txn_limbo_pop(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
 	assert(entry->is_rollback);
 
 	rlist_del_entry(entry, in_queue);
+	limbo->len--;
 	++limbo->rollback_count;
 }
 
diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
index af0addf8d..f2a98c8bb 100644
--- a/src/box/txn_limbo.h
+++ b/src/box/txn_limbo.h
@@ -94,6 +94,10 @@ struct txn_limbo {
 	 * them LSNs in the same order.
 	 */
 	struct rlist queue;
+	/**
+	 * Number of entries in limbo queue.
+	 */
+	int64_t len;
 	/**
 	 * Instance ID of the owner of all the transactions in the
 	 * queue. Strictly speaking, nothing prevents to store not
diff --git a/test/replication/gh-5446-qsync-eval-quorum.result b/test/replication/gh-5446-qsync-eval-quorum.result
index 1d13f26db..33d64b5b1 100644
--- a/test/replication/gh-5446-qsync-eval-quorum.result
+++ b/test/replication/gh-5446-qsync-eval-quorum.result
@@ -2,6 +2,9 @@
 test_run = require('test_run').new()
  | ---
  | ...
+fiber = require('fiber')
+ | ---
+ | ...
 engine = test_run:get_cfg('engine')
  | ---
  | ...
@@ -123,6 +126,11 @@ s:insert{3} -- should pass
  | - [3]
  | ...
 
+assert(box.info.synchro.quorum == 2)
+ | ---
+ | - true
+ | ...
+
 -- 6 replicas, 7 nodes -> replication_synchro_quorum = 7/2 + 1 = 4
 test_run:cmd('create server replica2 with rpl_master=default,\
               script="replication/replica-quorum-2.lua"')
@@ -174,6 +182,11 @@ test_run:cmd('start server replica6 with wait=True, wait_load=True')
  | - true
  | ...
 
+assert(box.info.synchro.quorum == 4)
+ | ---
+ | - true
+ | ...
+
 -- All replicas are up and running
 s:insert{4} -- should pass
  | ---
@@ -254,49 +267,76 @@ s:insert{11} -- should pass
  | - [11]
  | ...
 
--- cleanup
-
+-- To test queue lentgh tracking we should enter a state
+-- where attempt to write data stucks waiting for replication
+-- to timeout. For this sake we stop all replicas and try
+-- to insert a new record. This record queued into the limbo.
+-- Note the replication timeout set to a reasonably small value
+-- just to not wait too long and still be able to detect the length
+-- change.
 test_run:cmd('stop server replica1')
  | ---
  | - true
  | ...
-test_run:cmd('delete server replica1')
+test_run:cmd('stop server replica2')
  | ---
  | - true
  | ...
-test_run:cmd('stop server replica2')
+test_run:cmd('stop server replica3')
  | ---
  | - true
  | ...
-test_run:cmd('delete server replica2')
+test_run:cmd('stop server replica4')
  | ---
  | - true
  | ...
-test_run:cmd('stop server replica3')
+test_run:cmd('stop server replica5')
  | ---
  | - true
  | ...
-test_run:cmd('delete server replica3')
+test_run:cmd('stop server replica6')
  | ---
  | - true
  | ...
-test_run:cmd('stop server replica4')
+
+assert(box.info.synchro.queue[1].len == 0)
  | ---
  | - true
  | ...
-test_run:cmd('delete server replica4')
+box.cfg{replication_synchro_timeout = 2}
+ | ---
+ | ...
+f = fiber.new(function() s:insert{12} end)
+ | ---
+ | ...
+test_run:wait_cond(function() return box.info.synchro.queue[1].len == 1 end)
  | ---
  | - true
  | ...
-test_run:cmd('stop server replica5')
+test_run:wait_cond(function() return box.info.synchro.queue[1].len == 0 end)
  | ---
  | - true
  | ...
-test_run:cmd('delete server replica5')
+
+-- Cleanup
+
+test_run:cmd('delete server replica1')
  | ---
  | - true
  | ...
-test_run:cmd('stop server replica6')
+test_run:cmd('delete server replica2')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica3')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica4')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica5')
  | ---
  | - true
  | ...
diff --git a/test/replication/gh-5446-qsync-eval-quorum.test.lua b/test/replication/gh-5446-qsync-eval-quorum.test.lua
index 62d87ddcb..929571798 100644
--- a/test/replication/gh-5446-qsync-eval-quorum.test.lua
+++ b/test/replication/gh-5446-qsync-eval-quorum.test.lua
@@ -1,4 +1,5 @@
 test_run = require('test_run').new()
+fiber = require('fiber')
 engine = test_run:get_cfg('engine')
 
 box.schema.user.grant('guest', 'replication')
@@ -46,6 +47,8 @@ cfg_set_pass_tmo()
 test_run:cmd('start server replica1 with wait=True, wait_load=True')
 s:insert{3} -- should pass
 
+assert(box.info.synchro.quorum == 2)
+
 -- 6 replicas, 7 nodes -> replication_synchro_quorum = 7/2 + 1 = 4
 test_run:cmd('create server replica2 with rpl_master=default,\
               script="replication/replica-quorum-2.lua"')
@@ -67,6 +70,8 @@ test_run:cmd('create server replica6 with rpl_master=default,\
               script="replication/replica-quorum-6.lua"')
 test_run:cmd('start server replica6 with wait=True, wait_load=True')
 
+assert(box.info.synchro.quorum == 4)
+
 -- All replicas are up and running
 s:insert{4} -- should pass
 
@@ -92,19 +97,33 @@ s:insert{10} -- should pass
 test_run:cmd('start server replica1 with wait=True, wait_load=True')
 s:insert{11} -- should pass
 
--- cleanup
-
+-- To test queue lentgh tracking we should enter a state
+-- where attempt to write data stucks waiting for replication
+-- to timeout. For this sake we stop all replicas and try
+-- to insert a new record. This record queued into the limbo.
+-- Note the replication timeout set to a reasonably small value
+-- just to not wait too long and still be able to detect the length
+-- change.
 test_run:cmd('stop server replica1')
-test_run:cmd('delete server replica1')
 test_run:cmd('stop server replica2')
-test_run:cmd('delete server replica2')
 test_run:cmd('stop server replica3')
-test_run:cmd('delete server replica3')
 test_run:cmd('stop server replica4')
-test_run:cmd('delete server replica4')
 test_run:cmd('stop server replica5')
-test_run:cmd('delete server replica5')
 test_run:cmd('stop server replica6')
+
+assert(box.info.synchro.queue[1].len == 0)
+box.cfg{replication_synchro_timeout = 2}
+f = fiber.new(function() s:insert{12} end)
+test_run:wait_cond(function() return box.info.synchro.queue[1].len == 1 end)
+test_run:wait_cond(function() return box.info.synchro.queue[1].len == 0 end)
+
+-- Cleanup
+
+test_run:cmd('delete server replica1')
+test_run:cmd('delete server replica2')
+test_run:cmd('delete server replica3')
+test_run:cmd('delete server replica4')
+test_run:cmd('delete server replica5')
 test_run:cmd('delete server replica6')
 
 s:drop()
-- 
2.30.2


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-04-08 20:19 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-05 15:58 [Tarantool-patches] [PATCH v2] qsync: provide box.info interface for monitoring Cyrill Gorcunov via Tarantool-patches
2021-04-05 20:34 ` [Tarantool-patches] [PATCH v3] " Cyrill Gorcunov via Tarantool-patches
2021-04-08  7:34   ` Serge Petrenko via Tarantool-patches
2021-04-08  7:40     ` Cyrill Gorcunov via Tarantool-patches
2021-04-07 20:53 ` [Tarantool-patches] [PATCH v2] " Vladislav Shpilevoy via Tarantool-patches
2021-04-07 21:15   ` Cyrill Gorcunov via Tarantool-patches
2021-04-07 22:52     ` Vladislav Shpilevoy via Tarantool-patches
2021-04-08  7:11       ` Cyrill Gorcunov via Tarantool-patches
2021-04-08 19:37         ` Vladislav Shpilevoy via Tarantool-patches
2021-04-08 19:54           ` Cyrill Gorcunov via Tarantool-patches
2021-04-08 19:57             ` Vladislav Shpilevoy via Tarantool-patches
2021-04-08 20:04               ` Cyrill Gorcunov via Tarantool-patches
2021-04-08 20:10                 ` Vladislav Shpilevoy via Tarantool-patches
2021-04-08 20:19                   ` Cyrill Gorcunov via Tarantool-patches

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox