From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp39.i.mail.ru (smtp39.i.mail.ru [94.100.177.99]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id A7C6145C304 for ; Wed, 16 Dec 2020 16:21:33 +0300 (MSK) References: <20201214113935.1040421-1-gorcunov@gmail.com> <20201214113935.1040421-3-gorcunov@gmail.com> From: Serge Petrenko Message-ID: <51ea4672-6d4a-811d-160b-b92544745b10@tarantool.org> Date: Wed, 16 Dec 2020 16:21:31 +0300 MIME-Version: 1.0 In-Reply-To: <20201214113935.1040421-3-gorcunov@gmail.com> Content-Type: text/plain; charset="utf-8"; format="flowed" Content-Transfer-Encoding: 8bit Content-Language: en-GB Subject: Re: [Tarantool-patches] [PATCH v4 2/3] cfg: support symbolic evaluation of replication_synchro_quorum List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Cyrill Gorcunov , tml Cc: Mons Anderson , Vladislav Shpilevoy 14.12.2020 14:39, Cyrill Gorcunov пишет: > When synchronous replication is used we prefer a user to specify > a quorum number, ie the number of replicas where data must be > replicated before the master node continue accepting new > transactions. > > This is not very convenient since a user may not know initially > how many replicas will be used. Moreover the number of replicas > may vary dynamically. For this sake we allow to specify the > number of quorum in a symbolic way. > > For example > > box.cfg { > replication_synchro_quorum = "N/2+1", > } > > where `N` is a number of registered replicas in a cluster. > Once new replica attached or old one detached the number > is renewed and propagated. 
> > Internally on each replica_set_id() and replica_clear_id(), > i.e. at the moment when a replica gets registered or unregistered, > we call the box_update_replication_synchro_quorum() helper which > finds out if evaluation of replication_synchro_quorum is > needed and if so we calculate a new replication_synchro_quorum > value based on the number of currently registered replicas. Then > we notify dependent systems such as qsync and raft to update > their guts. > > Note: we do *not* change the default settings for this option, > it remains 1 by default for now. Changing the default option should > be done as a separate commit once we make sure that everything is > fine. > > Closes #5446 > > Signed-off-by: Cyrill Gorcunov > > @TarantoolBot document > Title: Support dynamic evaluation of synchronous replication quorum > > Setting the `replication_synchro_quorum` option to an explicit integer > value was introduced mostly for simplicity's sake. For example > if the cluster's size is not a constant value and new replicas are > connected dynamically then an administrator might need to increase > the option by hand or by some other external tool. > > Instead one can use a dynamic evaluation of a quorum value via a formal > representation using the symbol `N` as the current number of registered replicas > in a cluster. > > For example the canonical definition for a quorum (i.e. a majority > of members in a set) of `N` replicas is `N/2+1`. For such a configuration > define > > ``` > box.cfg {replication_synchro_quorum = "N/2+1"} > ``` > > The formal statement allows a flexible configuration but keep > in mind that only the canonical quorum (and bigger values, say `N` for all > replicas) guarantees data reliability, and various weird forms such as > `N/3+1`, while allowed, may lead to unexpected results.
> --- > src/box/box.cc | 147 +++++++++++++++++++++++++++++++++++++-- > src/box/box.h | 1 + > src/box/lua/load_cfg.lua | 2 +- > src/box/replication.cc | 4 +- > 4 files changed, 147 insertions(+), 7 deletions(-) > > diff --git a/src/box/box.cc b/src/box/box.cc > index a8bc3471d..b820af5d0 100644 > --- a/src/box/box.cc > +++ b/src/box/box.cc > @@ -554,10 +554,119 @@ box_check_replication_sync_lag(void) > return lag; > } > > +/** > + * Evaluate replication syncro quorum number from a formula. > + */ > +static int > +box_eval_replication_synchro_quorum(int nr_replicas) > +{ > + const char fmt[] = > + "local expr = [[%s]]\n" > + "local f, err = loadstring('return ('..expr..')')\n" > + "if not f then " > + "error(string.format('Failed to load \%\%s:" > + "\%\%s', expr, err)) " > + "end\n" > + "setfenv(f, {N = %d, math = {" > + "ceil = math.ceil," > + "floor = math.floor," > + "abs = math.abs," > + "random = math.random," > + "min = math.min," > + "max = math.abs," typo: math.max Other than that, LGTM. > + "sqrt = math.sqrt," > + "fmod = math.fmod," > + "}})\n" > + "local res = f()\n" > + "if type(res) ~= 'number' then\n" > + "error('Expression should return a number')\n" > + "end\n" > + "return math.floor(res)\n"; > + const char *expr = cfg_gets("replication_synchro_quorum"); > + int quorum = -1; > + > + /* > + * cfg_gets uses static buffer as well so we need a local > + * one, 1K should be enough to carry arbitrary but sane > + * formula. 
> + */ > + char buf[1024]; > + int len = snprintf(buf, sizeof(buf), fmt, expr, > + nr_replicas); > + if (len >= (int)sizeof(buf)) { > + diag_set(ClientError, ER_CFG, > + "replication_synchro_quorum", > + "the formula is too big"); > + return -1; > + } > + > + luaL_loadstring(tarantool_L, buf); > + if (lua_pcall(tarantool_L, 0, 1, 0) != 0) { > + diag_set(ClientError, ER_CFG, > + "replication_synchro_quorum", > + lua_tostring(tarantool_L, -1)); > + return -1; > + } > + > + if (lua_isnumber(tarantool_L, -1)) > + quorum = (int)lua_tonumber(tarantool_L, -1); > + lua_pop(tarantool_L, 1); > + > + /* > + * At least we should have 1 node to sync, the weird > + * formulas such as N-2 do not guarantee quorums thus > + * return an error. > + * > + * Since diag_set doesn't allow to show the valid range > + * lets print a warning too. > + */ > + if (quorum <= 0 || quorum >= VCLOCK_MAX) { > + say_warn("the replication_synchro_quorum formula " > + "is evaluated to the quorum %d for replica " > + "number %d, which is out of range [%d;%d]", > + quorum, nr_replicas, 1, VCLOCK_MAX - 1); > + diag_set(ClientError, ER_CFG, > + "replication_synchro_quorum", > + "evaluated value is out of range"); > + return -1; > + } > + > + return quorum; > +} > + > static int > box_check_replication_synchro_quorum(void) > { > - int quorum = cfg_geti("replication_synchro_quorum"); > + int quorum = 0; > + > + if (!cfg_isnumber("replication_synchro_quorum")) { > + /* > + * The formula uses symbolic name 'N' as > + * a number of currently registered replicas. > + * > + * When we're in "checking" mode we should walk > + * over all possible number of replicas to make > + * sure the formula is correct. > + * > + * Note that currently VCLOCK_MAX is pretty small > + * value but if we gonna increase this limit make > + * sure that the cycle won't take too much time. 
> + */ > + for (int i = 1; i < VCLOCK_MAX; i++) { > + quorum = box_eval_replication_synchro_quorum(i); > + if (quorum < 0) > + return -1; > + } > + /* > + * Just to make clear the number we return here doesn't > + * have any special meaning, only errors are matter. > + * The real value is dynamic and will be updated on demand. > + */ > + quorum = 1; > + } else { > + quorum = cfg_geti("replication_synchro_quorum"); > + } > + > if (quorum <= 0 || quorum >= VCLOCK_MAX) { > diag_set(ClientError, ER_CFG, "replication_synchro_quorum", > "the value must be greater than zero and less than " > @@ -910,15 +1019,45 @@ box_set_replication_sync_lag(void) > replication_sync_lag = box_check_replication_sync_lag(); > } > > +/** > + * Renew replication_synchro_quorum value if defined > + * as a formula and we need to recalculate it. > + */ > +void > +box_update_replication_synchro_quorum(void) > +{ > + int quorum = -1; > + > + if (!cfg_isnumber("replication_synchro_quorum")) { > + /* > + * The formula has been verified already. For bootstrap > + * stage pass 1 as a number of replicas to sync because > + * we're at early stage and registering a new replica. > + * > + * This should cover the valid case where formula is plain > + * "N", ie all replicas are to be synchro mode. 
> + */ > + int value = MAX(1, replicaset.registered_count); > + quorum = box_eval_replication_synchro_quorum(value); > + if (quorum <= 0 || quorum >= VCLOCK_MAX) > + panic("failed to eval replication_synchro_quorum"); > + say_info("update replication_synchro_quorum = %d", quorum); > + } else { > + quorum = cfg_geti("replication_synchro_quorum"); > + } > + > + replication_synchro_quorum = quorum; > + txn_limbo_on_parameters_change(&txn_limbo); > + box_raft_update_election_quorum(); > +} > + > int > box_set_replication_synchro_quorum(void) > { > int value = box_check_replication_synchro_quorum(); > if (value < 0) > return -1; > - replication_synchro_quorum = value; > - txn_limbo_on_parameters_change(&txn_limbo); > - box_raft_update_election_quorum(); > + box_update_replication_synchro_quorum(); > return 0; > } > > diff --git a/src/box/box.h b/src/box/box.h > index b47a220b7..c3e1a1276 100644 > --- a/src/box/box.h > +++ b/src/box/box.h > @@ -252,6 +252,7 @@ void box_set_replication_connect_timeout(void); > void box_set_replication_connect_quorum(void); > void box_set_replication_sync_lag(void); > int box_set_replication_synchro_quorum(void); > +void box_update_replication_synchro_quorum(void); > int box_set_replication_synchro_timeout(void); > void box_set_replication_sync_timeout(void); > void box_set_replication_skip_conflict(void); > diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua > index 770442052..2355dbcd2 100644 > --- a/src/box/lua/load_cfg.lua > +++ b/src/box/lua/load_cfg.lua > @@ -172,7 +172,7 @@ local template_cfg = { > replication_timeout = 'number', > replication_sync_lag = 'number', > replication_sync_timeout = 'number', > - replication_synchro_quorum = 'number', > + replication_synchro_quorum = 'string, number', > replication_synchro_timeout = 'number', > replication_connect_timeout = 'number', > replication_connect_quorum = 'number', > diff --git a/src/box/replication.cc b/src/box/replication.cc > index 931c73a37..3126d86ac 100644 > --- 
a/src/box/replication.cc > +++ b/src/box/replication.cc > @@ -251,7 +251,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id) > say_info("assigned id %d to replica %s", > replica->id, tt_uuid_str(&replica->uuid)); > replica->anon = false; > - box_raft_update_election_quorum(); > + box_update_replication_synchro_quorum(); > } > > void > @@ -300,7 +300,7 @@ replica_clear_id(struct replica *replica) > assert(!replica->anon); > replica_delete(replica); > } > - box_raft_update_election_quorum(); > + box_update_replication_synchro_quorum(); > } > > void -- Serge Petrenko