[Tarantool-patches] [PATCH v4 2/3] cfg: support symbolic evaluation of replication_synchro_quorum

Cyrill Gorcunov gorcunov at gmail.com
Mon Dec 14 14:39:34 MSK 2020


When synchronous replication is used we prefer a user to specify
a quorum number, i.e. the number of replicas where data must be
replicated before the master node continues accepting new
transactions.

This is not very convenient since a user may not know initially
how many replicas will be used. Moreover the number of replicas
may vary dynamically. For this reason we allow specifying the
quorum in a symbolic way.

For example

box.cfg {
	replication_synchro_quorum = "N/2+1",
}

where `N` is the number of registered replicas in a cluster.
Once a new replica is attached or an old one is detached the
number is renewed and propagated.

Internally on each replica_set_id() and replica_clear_id(),
i.e. at the moment when a replica gets registered or unregistered,
we call the box_update_replication_synchro_quorum() helper, which
finds out if evaluation of replication_synchro_quorum is
needed and if so calculates a new replication_synchro_quorum
value based on the number of currently registered replicas. Then
we notify dependent systems such as qsync and raft to update
their internal state.

Note: we do *not* change the default setting for this option,
it remains 1 by default for now. Changing the default should
be done in a separate commit once we make sure that everything
is fine.

Closes #5446

Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>

@TarantoolBot document
Title: Support dynamic evaluation of synchronous replication quorum

Setting the `replication_synchro_quorum` option to an explicit integer
value was introduced mostly for the sake of simplicity. For example,
if the cluster size is not constant and new replicas are connected
dynamically, then an administrator might need to increase the option
by hand or with some external tool.

Instead one can use dynamic evaluation of the quorum value via a formula
in which the symbol `N` stands for the current number of registered replicas
in a cluster.

For example the canonical definition for a quorum (ie majority
of members in a set) of `N` replicas is `N/2+1`. For such configuration
define

```
box.cfg {replication_synchro_quorum = "N/2+1"}
```

The formula allows for a flexible configuration, but keep
in mind that only the canonical quorum (and bigger values, say `N` for all
replicas) guarantees data reliability; various weird forms such as
`N/3+1`, while allowed, may lead to unexpected results.
---
 src/box/box.cc           | 147 +++++++++++++++++++++++++++++++++++++--
 src/box/box.h            |   1 +
 src/box/lua/load_cfg.lua |   2 +-
 src/box/replication.cc   |   4 +-
 4 files changed, 147 insertions(+), 7 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index a8bc3471d..b820af5d0 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -554,10 +554,119 @@ box_check_replication_sync_lag(void)
 	return lag;
 }
 
+/**
+ * Evaluate the replication_synchro_quorum formula with `N` set
+ * to @a nr_replicas. The formula runs as a Lua chunk whose
+ * environment is limited to `N` and a few math helpers.
+ *
+ * @param nr_replicas number of replicas to substitute for `N`.
+ * @retval quorum in range [1; VCLOCK_MAX - 1] on success.
+ * @retval -1 on error, diag is set.
+ */
+static int
+box_eval_replication_synchro_quorum(int nr_replicas)
+{
+	const char fmt[] =
+		"local expr = [[%s]]\n"
+		"local f, err = loadstring('return ('..expr..')')\n"
+		"if not f then "
+			"error(string.format('Failed to load %%s:"
+			"%%s', expr, err)) "
+		"end\n"
+		"setfenv(f, {N = %d, math = {"
+			"ceil = math.ceil,"
+			"floor = math.floor,"
+			"abs = math.abs,"
+			"random = math.random,"
+			"min = math.min,"
+			"max = math.max,"
+			"sqrt = math.sqrt,"
+			"fmod = math.fmod,"
+		"}})\n"
+		"local res = f()\n"
+		"if type(res) ~= 'number' then\n"
+			"error('Expression should return a number')\n"
+		"end\n"
+		"return math.floor(res)\n";
+	const char *expr = cfg_gets("replication_synchro_quorum");
+	int quorum = -1;
+
+	/*
+	 * cfg_gets uses a static buffer, so format into a local one.
+	 * 1K should be enough for an arbitrary but sane formula.
+	 */
+	char buf[1024];
+	int len = snprintf(buf, sizeof(buf), fmt, expr, nr_replicas);
+	if (len < 0 || len >= (int)sizeof(buf)) {
+		diag_set(ClientError, ER_CFG, "replication_synchro_quorum",
+			 "the formula is too big");
+		return -1;
+	}
+
+	/* On failure both calls leave one error message on the stack. */
+	if (luaL_loadstring(tarantool_L, buf) != 0 ||
+	    lua_pcall(tarantool_L, 0, 1, 0) != 0) {
+		diag_set(ClientError, ER_CFG, "replication_synchro_quorum",
+			 lua_tostring(tarantool_L, -1));
+		lua_pop(tarantool_L, 1);
+		return -1;
+	}
+
+	if (lua_isnumber(tarantool_L, -1))
+		quorum = (int)lua_tonumber(tarantool_L, -1);
+	lua_pop(tarantool_L, 1);
+
+	/*
+	 * At least one node must confirm, so formulas such as N-2
+	 * that can yield a non-positive quorum are rejected. diag_set
+	 * can't show the valid range, so log a warning as well.
+	 */
+	if (quorum <= 0 || quorum >= VCLOCK_MAX) {
+		say_warn("the replication_synchro_quorum formula "
+			 "is evaluated to the quorum %d for replica "
+			 "number %d, which is out of range [%d;%d]",
+			 quorum, nr_replicas, 1, VCLOCK_MAX - 1);
+		diag_set(ClientError, ER_CFG, "replication_synchro_quorum",
+			 "evaluated value is out of range");
+		return -1;
+	}
+
+	return quorum;
+}
+
 static int
 box_check_replication_synchro_quorum(void)
 {
-	int quorum = cfg_geti("replication_synchro_quorum");
+	int quorum = 0;
+
+	if (!cfg_isnumber("replication_synchro_quorum")) {
+		/*
+		 * The formula uses symbolic name 'N' as
+		 * a number of currently registered replicas.
+		 *
+		 * When we're in "checking" mode we should walk
+		 * over all possible number of replicas to make
+		 * sure the formula is correct.
+		 *
+		 * Note that currently VCLOCK_MAX is pretty small
+		 * value but if we gonna increase this limit make
+		 * sure that the cycle won't take too much time.
+		 */
+		for (int i = 1; i < VCLOCK_MAX; i++) {
+			quorum = box_eval_replication_synchro_quorum(i);
+			if (quorum < 0)
+				return -1;
+		}
+		/*
+		 * Just to make clear the number we return here doesn't
+		 * have any special meaning, only errors are matter.
+		 * The real value is dynamic and will be updated on demand.
+		 */
+		quorum = 1;
+	} else {
+		quorum = cfg_geti("replication_synchro_quorum");
+	}
+
 	if (quorum <= 0 || quorum >= VCLOCK_MAX) {
 		diag_set(ClientError, ER_CFG, "replication_synchro_quorum",
 			 "the value must be greater than zero and less than "
@@ -910,15 +1019,45 @@ box_set_replication_sync_lag(void)
 	replication_sync_lag = box_check_replication_sync_lag();
 }
 
+/**
+ * Recompute replication_synchro_quorum (evaluating the formula if
+ * the option is not a plain number) and notify limbo and raft.
+ */
+void
+box_update_replication_synchro_quorum(void)
+{
+	int quorum = -1;
+
+	if (!cfg_isnumber("replication_synchro_quorum")) {
+		/*
+		 * The formula has been verified already. During
+		 * bootstrap no replica may be registered yet, so
+		 * evaluate it with at least 1 replica.
+		 *
+		 * This covers the valid case where the formula is
+		 * plain "N", i.e. all replicas take part in the quorum.
+		 */
+		int value = MAX(1, replicaset.registered_count);
+		quorum = box_eval_replication_synchro_quorum(value);
+		if (quorum <= 0 || quorum >= VCLOCK_MAX)
+			panic("failed to eval replication_synchro_quorum");
+		say_info("update replication_synchro_quorum = %d", quorum);
+	} else {
+		quorum = cfg_geti("replication_synchro_quorum");
+	}
+
+	replication_synchro_quorum = quorum;
+	txn_limbo_on_parameters_change(&txn_limbo);
+	box_raft_update_election_quorum();
+}
+
 int
 box_set_replication_synchro_quorum(void)
 {
 	int value = box_check_replication_synchro_quorum();
 	if (value < 0)
 		return -1;
-	replication_synchro_quorum = value;
-	txn_limbo_on_parameters_change(&txn_limbo);
-	box_raft_update_election_quorum();
+	box_update_replication_synchro_quorum();
 	return 0;
 }
 
diff --git a/src/box/box.h b/src/box/box.h
index b47a220b7..c3e1a1276 100644
--- a/src/box/box.h
+++ b/src/box/box.h
@@ -252,6 +252,7 @@ void box_set_replication_connect_timeout(void);
 void box_set_replication_connect_quorum(void);
 void box_set_replication_sync_lag(void);
 int box_set_replication_synchro_quorum(void);
+void box_update_replication_synchro_quorum(void);
 int box_set_replication_synchro_timeout(void);
 void box_set_replication_sync_timeout(void);
 void box_set_replication_skip_conflict(void);
diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua
index 770442052..2355dbcd2 100644
--- a/src/box/lua/load_cfg.lua
+++ b/src/box/lua/load_cfg.lua
@@ -172,7 +172,7 @@ local template_cfg = {
     replication_timeout = 'number',
     replication_sync_lag = 'number',
     replication_sync_timeout = 'number',
-    replication_synchro_quorum = 'number',
+    replication_synchro_quorum = 'string, number',
     replication_synchro_timeout = 'number',
     replication_connect_timeout = 'number',
     replication_connect_quorum = 'number',
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 931c73a37..3126d86ac 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -251,7 +251,7 @@ replica_set_id(struct replica *replica, uint32_t replica_id)
 	say_info("assigned id %d to replica %s",
 		 replica->id, tt_uuid_str(&replica->uuid));
 	replica->anon = false;
-	box_raft_update_election_quorum();
+	box_update_replication_synchro_quorum();
 }
 
 void
@@ -300,7 +300,7 @@ replica_clear_id(struct replica *replica)
 		assert(!replica->anon);
 		replica_delete(replica);
 	}
-	box_raft_update_election_quorum();
+	box_update_replication_synchro_quorum();
 }
 
 void
-- 
2.26.2



More information about the Tarantool-patches mailing list