From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from mail-lf1-f68.google.com (mail-lf1-f68.google.com [209.85.167.68])
 (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
 (No client certificate requested)
 by dev.tarantool.org (Postfix) with ESMTPS id 9858945C305
 for ; Thu, 3 Dec 2020 17:05:15 +0300 (MSK)
Received: by mail-lf1-f68.google.com with SMTP id s27so2870290lfp.5
 for ; Thu, 03 Dec 2020 06:05:15 -0800 (PST)
From: Cyrill Gorcunov
Date: Thu, 3 Dec 2020 17:04:45 +0300
Message-Id: <20201203140446.66312-3-gorcunov@gmail.com>
In-Reply-To: <20201203140446.66312-1-gorcunov@gmail.com>
References: <20201203140446.66312-1-gorcunov@gmail.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Subject: [Tarantool-patches] [PATCH v3 2/3] cfg: support symbolic evaluation of replication_synchro_quorum
List-Id: Tarantool development patches
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
To: tml
Cc: Mons Anderson , Vladislav Shpilevoy

When synchronous replication is used we prefer a user to specify
a quorum number, i.e. the number of replicas where data must be
replicated before the master node continues accepting new
transactions.

This is not very convenient since a user may not know initially
how many replicas will be used. Moreover, the number of replicas
may vary dynamically. For this reason we allow specifying the
quorum in a symbolic way.

For example

box.cfg {
	replication_synchro_quorum = "N/2+1",
}

where `N` is the number of registered replicas in a cluster.
Once a new replica is attached or an old one is detached, the
number is renewed and propagated.

Internally on each replica_set_id() and replica_clear_id(),
i.e. at the moment when a replica gets registered or unregistered,
we call the box_update_replication_synchro_quorum() helper, which
finds out whether evaluation of replication_synchro_quorum is
needed and, if so, calculates the new replication_synchro_quorum
value based on the number of currently registered replicas.
Then we notify dependent systems such as qsync and raft to update
their guts.

Note: we do *not* change the default settings for this option,
it remains 1 by default for now. Changing the default should
be done as a separate commit once we make sure that everything is fine.

Closes #5446

Signed-off-by: Cyrill Gorcunov

@TarantoolBot document
Title: Synchronous replication

The `replication_synchro_quorum` parameter can be specified not only
as a plain integer number but also as a formula. The formula should
use the symbol `N` to represent the number of registered replicas.

For example, the canonical definition of a quorum (i.e. a majority of
members in a set) of `N` replicas is `N/2+1`. For such a configuration
one can define

```
box.cfg {replication_synchro_quorum = "N/2+1"}
```

Note that for the sake of simplicity quorum evaluation never returns
negative values, thus for a formula such as `N-2` the result will be 1
until the number of replicas becomes 4 or more.
---
 src/box/box.cc           | 142 +++++++++++++++++++++++++++++++++++++--
 src/box/box.h            |   1 +
 src/box/lua/load_cfg.lua |   2 +-
 src/box/replication.cc   |   4 +-
 4 files changed, 142 insertions(+), 7 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index a8bc3471d..b9d078de4 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -554,10 +554,101 @@ box_check_replication_sync_lag(void)
 	return lag;
 }
 
+/**
+ * Evaluate replication synchro quorum number from a formula.
+ */
+static int
+eval_replication_synchro_quorum(int nr_replicas)
+{
+	const char fmt[] =
+		"local expr = [[%s]]\n"
+		"local f, err = loadstring('return ('..expr..')')\n"
+		"if not f then "
+			"error(string.format('Failed to load %%s:"
+			"%%s', expr, err)) "
+		"end\n"
+		"setfenv(f, {N = %d, math = {"
+			"ceil = math.ceil,"
+			"floor = math.floor,"
+			"abs = math.abs,"
+			"random = math.random,"
+			"min = math.min,"
+			"max = math.max,"
+			"sqrt = math.sqrt,"
+			"fmod = math.fmod,"
+		"}})\n"
+		"return math.floor(f())\n";
+	char buf[1024];
+	int value = -1;
+
+	const char *expr = cfg_gets("replication_synchro_quorum");
+	size_t ret = snprintf(buf, sizeof(buf), fmt, expr, nr_replicas);
+	if (ret >= sizeof(buf)) {
+		diag_set(ClientError, ER_CFG,
+			 "replication_synchro_quorum",
+			 "the expression is too big");
+		return -1;
+	}
+
+	luaL_loadstring(tarantool_L, buf);
+	if (lua_pcall(tarantool_L, 0, 1, 0) != 0) {
+		diag_set(ClientError, ER_CFG,
+			 "replication_synchro_quorum",
+			 lua_tostring(tarantool_L, -1));
+		return -1;
+	}
+
+	if (lua_isnumber(tarantool_L, -1))
+		value = (int)lua_tonumber(tarantool_L, -1);
+	lua_pop(tarantool_L, 1);
+
+	/*
+	 * At least we should have 1 node to sync, thus
+	 * if the formula has evaluated to some negative
+	 * value (say it was N-2) do not treat it as an
+	 * error but just yield a minimum valid magnitude.
+	 */
+	if (value < 0) {
+		const int value_min = 1;
+		say_warn_ratelimited("replication_synchro_quorum evaluated "
+				     "to the negative value %d, set to %d",
+				     value, value_min);
+		value = value_min;
+	} else if (value >= VCLOCK_MAX) {
+		const int value_max = VCLOCK_MAX - 1;
+		say_warn_ratelimited("replication_synchro_quorum evaluated "
+				     "to value %d, set to %d",
+				     value, value_max);
+		value = value_max;
+	}
+
+	/*
+	 * We never return 0, even if we're in bootstrap
+	 * stage where number of replicas equals zero we
+	 * should consider the node itself as a minimum
+	 * quorum number.
+	 */
+	return MAX(1, value);
+}
+
 static int
 box_check_replication_synchro_quorum(void)
 {
-	int quorum = cfg_geti("replication_synchro_quorum");
+	int quorum = 0;
+
+	if (!cfg_isnumber("replication_synchro_quorum")) {
+		/*
+		 * The formula uses symbolic name 'N' as
+		 * a number of currently registered replicas.
+		 */
+		int value = replicaset.registered_count;
+		quorum = eval_replication_synchro_quorum(value);
+		if (quorum < 0)
+			return -1;
+	} else {
+		quorum = cfg_geti("replication_synchro_quorum");
+	}
+
 	if (quorum <= 0 || quorum >= VCLOCK_MAX) {
 		diag_set(ClientError, ER_CFG, "replication_synchro_quorum",
 			 "the value must be greater than zero and less than "
@@ -910,18 +1001,61 @@ box_set_replication_sync_lag(void)
 	replication_sync_lag = box_check_replication_sync_lag();
 }
 
+/**
+ * Assign new replication_synchro_quorum value
+ * and notify dependent subsystems.
+ */
+static void
+set_replication_synchro_quorum(int quorum)
+{
+	assert(quorum > 0 && quorum < VCLOCK_MAX);
+
+	replication_synchro_quorum = quorum;
+	txn_limbo_on_parameters_change(&txn_limbo);
+	box_raft_update_election_quorum();
+}
+
 int
 box_set_replication_synchro_quorum(void)
 {
 	int value = box_check_replication_synchro_quorum();
 	if (value < 0)
 		return -1;
-	replication_synchro_quorum = value;
-	txn_limbo_on_parameters_change(&txn_limbo);
-	box_raft_update_election_quorum();
+	set_replication_synchro_quorum(value);
 	return 0;
 }
 
+/**
+ * Renew replication_synchro_quorum value if defined
+ * as a formula and we need to recalculate it.
+ */
+void
+box_update_replication_synchro_quorum(void)
+{
+	if (cfg_isnumber("replication_synchro_quorum")) {
+		/*
+		 * Even if replication_synchro_quorum is a constant
+		 * number the RAFT engine should be notified on
+		 * change of replicas amount.
+		 */
+		box_raft_update_election_quorum();
+		return;
+	}
+
+	/*
+	 * The formula has been verified already on the bootstrap
+	 * stage (and on dynamic reconfig as well), still there
+	 * is a Lua call inside, heck knows what could go wrong
+	 * there thus panic if we're screwed.
+	 */
+	int value = replicaset.registered_count;
+	int quorum = eval_replication_synchro_quorum(value);
+	if (quorum < 0 || quorum >= VCLOCK_MAX)
+		panic("failed to eval replication_synchro_quorum");
+	say_info("update replication_synchro_quorum = %d", quorum);
+	set_replication_synchro_quorum(quorum);
+}
+
 int
 box_set_replication_synchro_timeout(void)
 {
diff --git a/src/box/box.h b/src/box/box.h
index b47a220b7..c3e1a1276 100644
--- a/src/box/box.h
+++ b/src/box/box.h
@@ -252,6 +252,7 @@ void box_set_replication_connect_timeout(void);
 void box_set_replication_connect_quorum(void);
 void box_set_replication_sync_lag(void);
 int box_set_replication_synchro_quorum(void);
+void box_update_replication_synchro_quorum(void);
 int box_set_replication_synchro_timeout(void);
 void box_set_replication_sync_timeout(void);
 void box_set_replication_skip_conflict(void);
diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua
index 76e2e92c2..af66c0e46 100644
--- a/src/box/lua/load_cfg.lua
+++ b/src/box/lua/load_cfg.lua
@@ -172,7 +172,7 @@ local template_cfg = {
     replication_timeout = 'number',
     replication_sync_lag = 'number',
     replication_sync_timeout = 'number',
-    replication_synchro_quorum = 'number',
+    replication_synchro_quorum = 'string, number',
     replication_synchro_timeout = 'number',
     replication_connect_timeout = 'number',
     replication_connect_quorum = 'number',
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 931c73a37..3126d86ac 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -251,7 +251,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
 	say_info("assigned id %d to replica %s",
 		 replica->id, tt_uuid_str(&replica->uuid));
 	replica->anon = false;
-	box_raft_update_election_quorum();
+	box_update_replication_synchro_quorum();
 }
 
 void
@@ -300,7 +300,7 @@ replica_clear_id(struct replica *replica)
 		assert(!replica->anon);
 		replica_delete(replica);
 	}
-	box_raft_update_election_quorum();
+	box_update_replication_synchro_quorum();
 }
 
 void
-- 
2.26.2