[Tarantool-patches] [PATCH v6 3/5] cfg: support symbolic evaluation of replication_synchro_quorum

Cyrill Gorcunov gorcunov at gmail.com
Tue Dec 22 14:14:06 MSK 2020


When synchronous replication is used we prefer a user to specify
a quorum number, i.e. the number of replicas where data must be
replicated before the master node continues accepting new
transactions.

This is not very convenient since a user may not know initially
how many replicas will be used. Moreover the number of replicas
may vary dynamically. For this reason we allow the quorum to be
specified in a symbolic way.

For example

box.cfg {
	replication_synchro_quorum = "N/2+1",
}

where `N` is the number of registered replicas in a cluster.
Once a new replica is attached or an old one is detached the
number is re-evaluated and propagated.

Internally on each replica_set_id() and replica_clear_id(),
i.e. at the moment when a replica gets registered or unregistered,
we call the box_update_replication_synchro_quorum() helper, which
finds out if evaluation of replication_synchro_quorum is
needed and if so calculates the new replication_synchro_quorum
value based on the number of currently registered replicas. Then
we notify dependent subsystems such as qsync and raft so that
they update their state.

Note: we do *not* change the default settings for this option,
it remains 1 by default for now. Changing the default should
be done in a separate commit once we make sure that everything
is fine.

Closes #5446

Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>

@TarantoolBot document
Title: Support dynamic evaluation of synchronous replication quorum

Setting the `replication_synchro_quorum` option to an explicit integer
value was introduced mostly for the sake of simplicity. For example
if the cluster's size is not constant and new replicas are connected
dynamically, then an administrator might need to increase the option
by hand or with some external tool.

Instead one can use dynamic evaluation of the quorum value via a formal
representation that uses the symbol `N` for the current number of
registered replicas in a cluster.

For example the canonical definition of a quorum (i.e. a majority
of members in a set) of `N` replicas is `N/2+1`. For such a configuration
define

```
box.cfg {replication_synchro_quorum = "N/2+1"}
```

The formal statement allows for a flexible configuration, but keep
in mind that only the canonical quorum (and bigger values, say `N` for all
replicas) guarantees data reliability; various weird forms such as
`N/3+1`, while allowed, may lead to unexpected results.
---
 src/box/box.cc           | 127 ++++++++++++++++++++++++++++++++++++++-
 src/box/lua/load_cfg.lua |   2 +-
 src/box/replication.cc   |   4 +-
 3 files changed, 129 insertions(+), 4 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 630f579df..68579c254 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -554,9 +554,109 @@ box_check_replication_sync_lag(void)
 	return lag;
 }
 
+/**
+ * Evaluate the replication synchro quorum from the formula stored in
+ * the replication_synchro_quorum option, with N = @a nr_replicas.
+ *
+ * The formula runs as a Lua chunk whose environment holds only N and
+ * a few math functions. NOTE(review): the text is pasted into a Lua
+ * [[...]] literal, so a "]]" inside it ends the literal early.
+ *
+ * @param nr_replicas Value substituted for N.
+ * @retval >0 Quorum, within [1; VCLOCK_MAX - 1].
+ * @retval -1 Error, the diag is set.
+ */
+static int
+box_eval_replication_synchro_quorum(int nr_replicas)
+{
+	assert(nr_replicas > 0 && nr_replicas < VCLOCK_MAX);
+
+	const char fmt[] =
+		"local expr = [[%s]]\n"
+		"local f, err = loadstring('return ('..expr..')')\n"
+		"if not f then "
+			"error(string.format('Failed to load %%s:"
+			"%%s', expr, err)) "
+		"end\n"
+		"setfenv(f, {N = %d, math = {"
+			"ceil = math.ceil," "floor = math.floor,"
+			"abs = math.abs," "random = math.random,"
+			"min = math.min," "max = math.max,"
+			"sqrt = math.sqrt," "fmod = math.fmod,"
+		"}})\n"
+		"local res = f()\n"
+		"if type(res) ~= 'number' then\n"
+			"error('Expression should return a number')\n"
+		"end\n"
+		"return math.floor(res)\n";
+	const char *expr = cfg_gets("replication_synchro_quorum");
+
+	/* cfg_gets uses a static buffer too, so format into a local one. */
+	char buf[1024];
+	int len = snprintf(buf, sizeof(buf), fmt, expr, nr_replicas);
+	if (len < 0 || len >= (int)sizeof(buf)) {
+		diag_set(ClientError, ER_CFG, "replication_synchro_quorum",
+			 "the formula is too big");
+		return -1;
+	}
+
+	/* On failure either call leaves an error message on the stack. */
+	if (luaL_loadstring(tarantool_L, buf) != 0 ||
+	    lua_pcall(tarantool_L, 0, 1, 0) != 0) {
+		diag_set(ClientError, ER_CFG, "replication_synchro_quorum",
+			 lua_tostring(tarantool_L, -1));
+		/* Pop the message only after it has been reported. */
+		lua_pop(tarantool_L, 1);
+		return -1;
+	}
+
+	/* Keep the double: casting inf, nan or a huge value is UB. */
+	double res = -1;
+	if (lua_isnumber(tarantool_L, -1))
+		res = lua_tonumber(tarantool_L, -1);
+	lua_pop(tarantool_L, 1);
+
+	/*
+	 * At least one node must confirm; formulas such as N-2 do
+	 * not guarantee a quorum, thus are rejected (nan included).
+	 */
+	if (!(res >= 1 && res < VCLOCK_MAX)) {
+		const char *msg =
+			tt_sprintf("the formula is evaluated to the "
+				   "quorum %.0f for replica number %d, "
+				   "which is out of range [%d;%d]",
+				   res, nr_replicas, 1, VCLOCK_MAX - 1);
+		diag_set(ClientError, ER_CFG,
+			 "replication_synchro_quorum", msg);
+		return -1;
+	}
+
+	return (int)res;
+}
+
 static int
 box_check_replication_synchro_quorum(void)
 {
+	if (!cfg_isnumber("replication_synchro_quorum")) {
+		/*
+		 * The formula uses symbolic name 'N' as
+		 * a number of currently registered replicas.
+		 *
+		 * When we're in "checking" mode we should walk
+		 * over all possible number of replicas to make
+		 * sure the formula is correct.
+		 *
+		 * Note that currently VCLOCK_MAX is pretty small
+		 * value but if we gonna increase this limit make
+		 * sure that the cycle won't take too much time.
+		 */
+		for (int i = 1; i < VCLOCK_MAX; i++) {
+			if (box_eval_replication_synchro_quorum(i) < 0)
+				return -1;
+		}
+		return 0;
+	}
+
 	int quorum = cfg_geti("replication_synchro_quorum");
 	if (quorum <= 0 || quorum >= VCLOCK_MAX) {
 		diag_set(ClientError, ER_CFG, "replication_synchro_quorum",
@@ -913,7 +1013,32 @@ box_set_replication_sync_lag(void)
 void
 box_update_replication_synchro_quorum(void)
 {
-	int quorum = cfg_geti("replication_synchro_quorum");
+	int quorum = -1;
+
+	if (!cfg_isnumber("replication_synchro_quorum")) {
+		/*
+		 * The formula has been verified already. For bootstrap
+		 * stage pass 1 as a number of replicas to sync because
+		 * we're at early stage and registering a new replica.
+		 *
+		 * This should cover the valid case where formula is plain
+		 * "N", ie all replicas are to be synchro mode.
+		 */
+		int value = MAX(1, replicaset.registered_count);
+		quorum = box_eval_replication_synchro_quorum(value);
+		say_info("update replication_synchro_quorum = %d", quorum);
+	} else {
+		quorum = cfg_geti("replication_synchro_quorum");
+	}
+
+	/*
+	 * This should never happen because the values were
+	 * validated already but just to prevent from
+	 * unexpected changes and because the value is too
+	 * important for qsync, lets re-check (this is cheap).
+	 */
+	if (quorum <= 0 || quorum >= VCLOCK_MAX)
+		panic("failed to eval/fetch replication_synchro_quorum");
 
 	replication_synchro_quorum = quorum;
 	txn_limbo_on_parameters_change(&txn_limbo);
diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua
index 770442052..2355dbcd2 100644
--- a/src/box/lua/load_cfg.lua
+++ b/src/box/lua/load_cfg.lua
@@ -172,7 +172,7 @@ local template_cfg = {
     replication_timeout = 'number',
     replication_sync_lag = 'number',
     replication_sync_timeout = 'number',
-    replication_synchro_quorum = 'number',
+    replication_synchro_quorum = 'string, number',
     replication_synchro_timeout = 'number',
     replication_connect_timeout = 'number',
     replication_connect_quorum = 'number',
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 931c73a37..3126d86ac 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -251,7 +251,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
 	say_info("assigned id %d to replica %s",
 		 replica->id, tt_uuid_str(&replica->uuid));
 	replica->anon = false;
-	box_raft_update_election_quorum();
+	box_update_replication_synchro_quorum();
 }
 
 void
@@ -300,7 +300,7 @@ replica_clear_id(struct replica *replica)
 		assert(!replica->anon);
 		replica_delete(replica);
 	}
-	box_raft_update_election_quorum();
+	box_update_replication_synchro_quorum();
 }
 
 void
-- 
2.26.2



More information about the Tarantool-patches mailing list