[Tarantool-patches] [RFC 4/4] qsync: allow to specify replication_synchro_quorum as a formula
Serge Petrenko
sergepetrenko at tarantool.org
Fri Nov 20 13:50:50 MSK 2020
19.11.2020 22:41, Cyrill Gorcunov пишет:
> When synchronous replication is used we prefer a user to specify
> a quorum number, i.e. the number of replicas where data must be
> replicated before the master node continues accepting new operations.
>
> This is not that convenient since a user may not know initially
> how many replicas will be used. Moreover the number of replicas
> may vary dynamically. For this reason we allow the quorum to be
> specified in a symbolic way.
>
> For example
>
> box.cfg {
> replication_synchro_quorum = "n/2+1",
> }
>
> where n is the number of registered replicas in the cluster.
> Once a new replica is attached or an old one is detached the number
> is evaluated and propagated.
>
> Internally on each "_cluster" system space update the trigger
> runs the replication_on_cluster_update() helper which counts the
> number of registered replicas and updates the quorum value, notifying
> limbo and raft about the change.
>
> Closes #5446
>
> Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
> ---
> src/box/alter.cc | 2 ++
> src/box/box.cc | 11 +++--------
> src/box/replication.cc | 31 +++++++++++++++++++++++++++++++
> src/box/replication.h | 9 +++++++++
> 4 files changed, 45 insertions(+), 8 deletions(-)
>
> diff --git a/src/box/alter.cc b/src/box/alter.cc
> index 075b79d33..bd291ad4f 100644
> --- a/src/box/alter.cc
> +++ b/src/box/alter.cc
> @@ -4196,6 +4196,7 @@ register_replica(struct trigger *trigger, void * /* event */)
> } else {
> try {
> replica = replicaset_add(id, &uuid);
> + replication_on_cluster_update();
> /* Can't throw exceptions from on_commit trigger */
> } catch(Exception *e) {
> panic("Can't register replica: %s", e->errmsg);
> @@ -4216,6 +4217,7 @@ unregister_replica(struct trigger *trigger, void * /* event */)
> struct replica *replica = replica_by_uuid(&old_uuid);
> assert(replica != NULL);
> replica_clear_id(replica);
> + replication_on_cluster_update();
> return 0;
> }
>
We usually perform all the work related to replica registration and
unregistration directly in replica_set_id and replica_clear_id.
Besides, these are the places where replicaset.registered_count is updated,
so it'd be nice to call replication_on_cluster_update from there.
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 5f7ddfa99..558a71468 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -634,14 +634,6 @@ box_check_replication_synchro_quorum(void)
> */
> int value = replication_synchro_quorum;
> quorum = eval_replication_synchro_quorum(value);
Pass replicaset.registered_count instead of replication_synchro_quorum here.
> - /*
> - * FIXME: Until we get full support.
> - */
> - diag_set(ClientError, ER_CFG,
> - "replication_synchro_quorum",
> - "symbolic evaluation is not yet supported");
> - diag_log();
> - quorum = -1;
> } else {
> quorum = cfg_geti("replication_synchro_quorum");
> }
> @@ -1004,6 +996,9 @@ box_set_replication_synchro_quorum(void)
> int value = box_check_replication_synchro_quorum();
> if (value < 0)
> return -1;
> +
> + bool isnumber = cfg_isnumber("replication_synchro_quorum");
> + replication_synchro_quorum_eval = !isnumber;
> replication_synchro_quorum_update(value);
> return 0;
> }
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index c83392f81..bde850c1c 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -53,6 +53,7 @@ double replication_connect_timeout = 30.0; /* seconds */
> int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL;
> double replication_sync_lag = 10.0; /* seconds */
> int replication_synchro_quorum = 1;
> +bool replication_synchro_quorum_eval = false;
> double replication_synchro_timeout = 5.0; /* seconds */
> double replication_sync_timeout = 300.0; /* seconds */
> bool replication_skip_conflict = false;
> @@ -100,6 +101,36 @@ replication_synchro_quorum_update(int value)
> raft_cfg_election_quorum(box_raft());
> }
>
> +/**
> + * Evaluate the new synchro quorum number when replica
> + * get registered/unregistered and the quorum depends on
> + * their amount via formula in config.
> + */
> +void
> +replication_on_cluster_update(void)
> +{
> + if (!replication_synchro_quorum_eval)
> + return;
> +
> + /*
> + * Account only registered replicas when evaluating
> + * quorum number from a fromula present in config.
> + */
> + int value = replicaset.registered_count - replicaset.anon_count;
registered_count already stands for the 'normal' (non-anonymous) replica
count, so there is no need to subtract anon_count from it.
> + int quorum = eval_replication_synchro_quorum(value);
> +
> + /*
> + * Upon node bootstrap we verify configuration so there
> + * must never be a value out of bounds, still better to
> + * be sure since evaluation code lays far from here.
> + */
> + if (quorum <= 0 || quorum >= VCLOCK_MAX)
> + panic("Unexpected result for replication_synchro_quorum eval");
> +
> + say_info("replication: evaluated quorum is %d", quorum);
Nit: in the log message, quorum -> replication_synchro_quorum
> + replication_synchro_quorum_update(quorum);
> +}
> +
> void
> replication_init(void)
> {
> diff --git a/src/box/replication.h b/src/box/replication.h
> index ced519612..fa651d1c5 100644
> --- a/src/box/replication.h
> +++ b/src/box/replication.h
> @@ -131,6 +131,12 @@ extern double replication_sync_lag;
> */
> extern int replication_synchro_quorum;
>
> +/**
> + * A flag to point that replication_synchro_quorum needs
> + * to be evaluated as a formula.
> + */
> +extern bool replication_synchro_quorum_eval;
> +
> /**
> * Time in seconds which the master node is able to wait for ACKs
> * for a synchronous transaction until it is rolled back.
> @@ -178,6 +184,9 @@ replication_disconnect_timeout(void)
> void
> replication_synchro_quorum_update(int value);
>
> +void
> +replication_on_cluster_update(void);
> +
> void
> replication_init(void);
>
--
Serge Petrenko
More information about the Tarantool-patches
mailing list