[Tarantool-patches] [PATCH v4] qsync: provide box.info.synchro interface for monitoring

Serge Petrenko sergepetrenko at tarantool.org
Thu Apr 8 15:58:38 MSK 2021



08.04.2021 15:18, Cyrill Gorcunov пишет:
> In commit 14fa5fd82 (cfg: support symbolic evaluation of
> replication_synchro_quorum) we implemented support of
> symbolic evaluation of `replication_synchro_quorum` parameter
> and there is no easy way to obtain it current run-time value,
> ie evaluated number value.
>
> Moreover we would like to fetch queue length on transaction
> limbo for tests and extend this statistics in future. Thus
> lets add them.
>
> Closes #5191

Thanks for the fixes!

Please return `box.info.synchro.quorum` assertions to the
test regarding quorum evaluation. Like you had it in the
previous patch version.

Other than that, LGTM.

>
> Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
>
> @TarantoolBot document
> Title: Provide `box.info.synchro` interface
>
> The `box.info.synchro` leaf provides information about details of
> synchronous replication.
>
> In particular `quorum` represent the current value of synchronous
> replication quorum defined by `replication_synchro_quorum`
> configuration parameter because it can be set as dynamic formula
> such as `N/2+1` and the value depends on current number of replicas.
>
> Since synchronous replication does not commit data immediately
> but waits for its propagation to replicas the data sits in a queue
> gathering `commit` responses from remote nodes. Current number of
> entries waiting in the queue is shown via `queue.len` member.
>
> A typical output is the following
>
> ``` Lua
> tarantool> box.info.synchro
> ---
> - queue:
>      len: 0
>    quorum: 1
> ...
> ```
>
> The `len` member shows current number of entries in the queue.
> And the `quorum` member shows an evaluated value of
> `replication_synchro_quorum` parameter.
> ---
> issue https://github.com/tarantool/tarantool/issues/5191
> branch gorcunov/gh-5191-qsync-stat-4
>
>   changelogs/unreleased/box-info-limbo.md |  4 +++
>   src/box/lua/info.c                      | 22 ++++++++++++++
>   src/box/txn_limbo.c                     |  5 +++-
>   src/box/txn_limbo.h                     |  4 +++
>   test/box/info.result                    |  1 +
>   test/replication/qsync_basic.result     | 40 +++++++++++++++++++++----
>   test/replication/qsync_basic.test.lua   | 21 +++++++++++--
>   7 files changed, 89 insertions(+), 8 deletions(-)
>   create mode 100644 changelogs/unreleased/box-info-limbo.md
>
> diff --git a/changelogs/unreleased/box-info-limbo.md b/changelogs/unreleased/box-info-limbo.md
> new file mode 100644
> index 000000000..0f75a911d
> --- /dev/null
> +++ b/changelogs/unreleased/box-info-limbo.md
> @@ -0,0 +1,4 @@
> +## feature/core
> +
> +* Provide information about state of synchronous replication via
> +  `box.info.synchro` interface (gh-5191).
> diff --git a/src/box/lua/info.c b/src/box/lua/info.c
> index 8cd379756..7c63c8f5e 100644
> --- a/src/box/lua/info.c
> +++ b/src/box/lua/info.c
> @@ -50,6 +50,7 @@
>   #include "version.h"
>   #include "box/box.h"
>   #include "box/raft.h"
> +#include "box/txn_limbo.h"
>   #include "lua/utils.h"
>   #include "fiber.h"
>   #include "sio.h"
> @@ -599,6 +600,26 @@ lbox_info_election(struct lua_State *L)
>   	return 1;
>   }
>   
> +static int
> +lbox_info_synchro(struct lua_State *L)
> +{
> +	lua_createtable(L, 0, 2);
> +
> +	/* Quorum value may be evaluated via formula */
> +	lua_pushinteger(L, replication_synchro_quorum);
> +	lua_setfield(L, -2, "quorum");
> +
> +	/*Queue information. */
> +	struct txn_limbo *queue = &txn_limbo;
> +	lua_createtable(L, 0, 1);
> +	lua_pushnumber(L, queue->len);
> +	lua_setfield(L, -2, "len");
> +	lua_setfield(L, -2, "queue");
> +
> +	return 1;
> +}
> +
> +
>   static const struct luaL_Reg lbox_info_dynamic_meta[] = {
>   	{"id", lbox_info_id},
>   	{"uuid", lbox_info_uuid},
> @@ -618,6 +639,7 @@ static const struct luaL_Reg lbox_info_dynamic_meta[] = {
>   	{"sql", lbox_info_sql},
>   	{"listen", lbox_info_listen},
>   	{"election", lbox_info_election},
> +	{"synchro", lbox_info_synchro},
>   	{NULL, NULL}
>   };
>   
> diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
> index cf0ad9350..a22e0861a 100644
> --- a/src/box/txn_limbo.c
> +++ b/src/box/txn_limbo.c
> @@ -41,6 +41,7 @@ static inline void
>   txn_limbo_create(struct txn_limbo *limbo)
>   {
>   	rlist_create(&limbo->queue);
> +	limbo->len = 0;
>   	limbo->owner_id = REPLICA_ID_NIL;
>   	fiber_cond_create(&limbo->wait_cond);
>   	vclock_create(&limbo->vclock);
> @@ -118,6 +119,7 @@ txn_limbo_append(struct txn_limbo *limbo, uint32_t id, struct txn *txn)
>   	e->is_commit = false;
>   	e->is_rollback = false;
>   	rlist_add_tail_entry(&limbo->queue, e, in_queue);
> +	limbo->len++;
>   	/*
>   	 * We added new entries from a remote instance to an empty limbo.
>   	 * Time to make this instance read-only.
> @@ -132,8 +134,8 @@ txn_limbo_remove(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
>   {
>   	assert(!rlist_empty(&entry->in_queue));
>   	assert(txn_limbo_first_entry(limbo) == entry);
> -	(void) limbo;
>   	rlist_del_entry(entry, in_queue);
> +	limbo->len--;
>   }
>   
>   static inline void
> @@ -144,6 +146,7 @@ txn_limbo_pop(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
>   	assert(entry->is_rollback);
>   
>   	rlist_del_entry(entry, in_queue);
> +	limbo->len--;
>   	++limbo->rollback_count;
>   }
>   
> diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
> index af0addf8d..f2a98c8bb 100644
> --- a/src/box/txn_limbo.h
> +++ b/src/box/txn_limbo.h
> @@ -94,6 +94,10 @@ struct txn_limbo {
>   	 * them LSNs in the same order.
>   	 */
>   	struct rlist queue;
> +	/**
> +	 * Number of entries in limbo queue.
> +	 */
> +	int64_t len;
>   	/**
>   	 * Instance ID of the owner of all the transactions in the
>   	 * queue. Strictly speaking, nothing prevents to store not
> diff --git a/test/box/info.result b/test/box/info.result
> index c8037818b..3ee653773 100644
> --- a/test/box/info.result
> +++ b/test/box/info.result
> @@ -89,6 +89,7 @@ t
>     - signature
>     - sql
>     - status
> +  - synchro
>     - uptime
>     - uuid
>     - vclock
> diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result
> index bd3c3cce1..3457d2cc9 100644
> --- a/test/replication/qsync_basic.result
> +++ b/test/replication/qsync_basic.result
> @@ -637,12 +637,46 @@ box.space.sync:count()
>    | - 0
>    | ...
>   
> --- Cleanup.
> +--
> +-- gh-5191: test box.info.synchro interface. For
> +-- this sake we stop the replica and initiate data
> +-- write in sync space which won't pass due to timeout.
> +-- While we're sitting in a wait cycle the queue should
> +-- not be empty.
> +--
> +-- Make sure this test is the *LAST* one since we stop
> +-- the replica node and never restart it back before the
> +-- cleanup procedure, also we're spinning on default node
> +-- and do not switch to other nodes.
> +--
>   test_run:cmd('switch default')
>    | ---
>    | - true
>    | ...
> +test_run:cmd('stop server replica')
> + | ---
> + | - true
> + | ...
> +assert(box.info.synchro.queue.len == 0)
> + | ---
> + | - true
> + | ...
> +box.cfg{replication_synchro_timeout = 2}
> + | ---
> + | ...
> +f = fiber.new(function() box.space.sync:insert{1024} end)
> + | ---
> + | ...
> +test_run:wait_cond(function() return box.info.synchro.queue.len == 1 end)
> + | ---
> + | - true
> + | ...
> +test_run:wait_cond(function() return box.info.synchro.queue.len == 0 end)
> + | ---
> + | - true
> + | ...
>   
> +-- Cleanup
>   box.cfg{                                                                        \
>       replication_synchro_quorum = old_synchro_quorum,                            \
>       replication_synchro_timeout = old_synchro_timeout,                          \
> @@ -650,10 +684,6 @@ box.cfg{
>   }
>    | ---
>    | ...
> -test_run:cmd('stop server replica')
> - | ---
> - | - true
> - | ...
>   test_run:cmd('delete server replica')
>    | ---
>    | - true
> diff --git a/test/replication/qsync_basic.test.lua b/test/replication/qsync_basic.test.lua
> index 94235547d..a604d80ee 100644
> --- a/test/replication/qsync_basic.test.lua
> +++ b/test/replication/qsync_basic.test.lua
> @@ -248,15 +248,32 @@ for i = 1, 100 do box.space.sync:delete{i} end
>   test_run:cmd('switch replica')
>   box.space.sync:count()
>   
> --- Cleanup.
> +--
> +-- gh-5191: test box.info.synchro interface. For
> +-- this sake we stop the replica and initiate data
> +-- write in sync space which won't pass due to timeout.
> +-- While we're sitting in a wait cycle the queue should
> +-- not be empty.
> +--
> +-- Make sure this test is the *LAST* one since we stop
> +-- the replica node and never restart it back before the
> +-- cleanup procedure, also we're spinning on default node
> +-- and do not switch to other nodes.
> +--
>   test_run:cmd('switch default')
> +test_run:cmd('stop server replica')
> +assert(box.info.synchro.queue.len == 0)
> +box.cfg{replication_synchro_timeout = 2}
> +f = fiber.new(function() box.space.sync:insert{1024} end)
> +test_run:wait_cond(function() return box.info.synchro.queue.len == 1 end)
> +test_run:wait_cond(function() return box.info.synchro.queue.len == 0 end)
>   
> +-- Cleanup
>   box.cfg{                                                                        \
>       replication_synchro_quorum = old_synchro_quorum,                            \
>       replication_synchro_timeout = old_synchro_timeout,                          \
>       replication_timeout = old_timeout,                                          \
>   }
> -test_run:cmd('stop server replica')
>   test_run:cmd('delete server replica')
>   box.space.test:drop()
>   box.space.sync:drop()

-- 
Serge Petrenko



More information about the Tarantool-patches mailing list