[Tarantool-patches] [RFC 4/4] qsync: allow to specify replication_synchro_quorum as a formula

Serge Petrenko sergepetrenko at tarantool.org
Fri Nov 20 13:50:50 MSK 2020


19.11.2020 22:41, Cyrill Gorcunov пишет:
> When synchronous replication is used we prefer a user to specify
> a quorum number, i.e. the number of replicas where data must be
> replicated before the master node continues accepting new operations.
>
> This is not that convenient since a user may not know initially
> how many replicas will be used. Moreover, the number of replicas
> may vary dynamically. For this reason we allow specifying the
> quorum number in a symbolic way.
>
> For example
>
> box.cfg {
> 	replication_synchro_quorum = "n/2+1",
> }
>
> where n is the number of registered replicas in the cluster.
> Once a new replica is attached or an old one is detached, the number
> is re-evaluated and propagated.
>
> Internally, on each "_cluster" system space update the trigger
> runs the replication_on_cluster_update() helper, which counts the
> number of registered replicas and updates the quorum value, notifying
> limbo and raft about the change.
>
> Closes #5446
>
> Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
> ---
>   src/box/alter.cc       |  2 ++
>   src/box/box.cc         | 11 +++--------
>   src/box/replication.cc | 31 +++++++++++++++++++++++++++++++
>   src/box/replication.h  |  9 +++++++++
>   4 files changed, 45 insertions(+), 8 deletions(-)
>
> diff --git a/src/box/alter.cc b/src/box/alter.cc
> index 075b79d33..bd291ad4f 100644
> --- a/src/box/alter.cc
> +++ b/src/box/alter.cc
> @@ -4196,6 +4196,7 @@ register_replica(struct trigger *trigger, void * /* event */)
>   	} else {
>   		try {
>   			replica = replicaset_add(id, &uuid);
> +			replication_on_cluster_update();
>   			/* Can't throw exceptions from on_commit trigger */
>   		} catch(Exception *e) {
>   			panic("Can't register replica: %s", e->errmsg);
> @@ -4216,6 +4217,7 @@ unregister_replica(struct trigger *trigger, void * /* event */)
>   	struct replica *replica = replica_by_uuid(&old_uuid);
>   	assert(replica != NULL);
>   	replica_clear_id(replica);
> +	replication_on_cluster_update();
>   	return 0;
>   }
>   

We usually perform all the work related to registering/unregistering
a replica directly in replica_set_id() and replica_clear_id().

Besides, these are the places where replicaset.registered_count is updated,
so it'd be nice to call replication_on_cluster_update from there.


> diff --git a/src/box/box.cc b/src/box/box.cc
> index 5f7ddfa99..558a71468 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -634,14 +634,6 @@ box_check_replication_synchro_quorum(void)
>   		 */
>   		int value = replication_synchro_quorum;
>   		quorum = eval_replication_synchro_quorum(value);


Pass replicaset.registered_count instead of replication_synchro_quorum here:
`n` in the formula is the number of registered replicas, not the current
quorum value.


> -		/*
> -		 * FIXME: Until we get full support.
> -		 */
> -		diag_set(ClientError, ER_CFG,
> -			 "replication_synchro_quorum",
> -			 "symbolic evaluation is not yet supported");
> -		diag_log();
> -		quorum = -1;
>   	} else {
>   		quorum = cfg_geti("replication_synchro_quorum");
>   	}
> @@ -1004,6 +996,9 @@ box_set_replication_synchro_quorum(void)
>   	int value = box_check_replication_synchro_quorum();
>   	if (value < 0)
>   		return -1;
> +
> +	bool isnumber = cfg_isnumber("replication_synchro_quorum");
> +	replication_synchro_quorum_eval = !isnumber;
>   	replication_synchro_quorum_update(value);
>   	return 0;
>   }
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index c83392f81..bde850c1c 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -53,6 +53,7 @@ double replication_connect_timeout = 30.0; /* seconds */
>   int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL;
>   double replication_sync_lag = 10.0; /* seconds */
>   int replication_synchro_quorum = 1;
> +bool replication_synchro_quorum_eval = false;
>   double replication_synchro_timeout = 5.0; /* seconds */
>   double replication_sync_timeout = 300.0; /* seconds */
>   bool replication_skip_conflict = false;
> @@ -100,6 +101,36 @@ replication_synchro_quorum_update(int value)
>   	raft_cfg_election_quorum(box_raft());
>   }
>   
> +/**
> + * Evaluate the new synchro quorum number when replica
> + * get registered/unregistered and the quorum depends on
> + * their amount via formula in config.
> + */
> +void
> +replication_on_cluster_update(void)
> +{
> +	if (!replication_synchro_quorum_eval)
> +		return;
> +
> +	/*
> +	 * Account only registered replicas when evaluating
> +	 * quorum number from a fromula present in config.
> +	 */
> +	int value = replicaset.registered_count - replicaset.anon_count;


registered_count already counts only 'normal' (non-anonymous) replicas,
so there is no need to subtract anon_count from it.


> +	int quorum = eval_replication_synchro_quorum(value);
> +
> +	/*
> +	 * Upon node bootstrap we verify configuration so there
> +	 * must never be a value out of bounds, still better to
> +	 * be sure since evaluation code lays far from here.
> +	 */
> +	if (quorum <= 0 || quorum >= VCLOCK_MAX)
> +		panic("Unexpected result for replication_synchro_quorum eval");
> +
> +	say_info("replication: evaluated quorum is %d", quorum);


In the log message: "quorum" -> "replication_synchro_quorum", so that the
message names the option being evaluated.


> +	replication_synchro_quorum_update(quorum);
> +}
> +
>   void
>   replication_init(void)
>   {
> diff --git a/src/box/replication.h b/src/box/replication.h
> index ced519612..fa651d1c5 100644
> --- a/src/box/replication.h
> +++ b/src/box/replication.h
> @@ -131,6 +131,12 @@ extern double replication_sync_lag;
>    */
>   extern int replication_synchro_quorum;
>   
> +/**
> + * A flag to point that replication_synchro_quorum needs
> + * to be evaluated as a formula.
> + */
> +extern bool replication_synchro_quorum_eval;
> +
>   /**
>    * Time in seconds which the master node is able to wait for ACKs
>    * for a synchronous transaction until it is rolled back.
> @@ -178,6 +184,9 @@ replication_disconnect_timeout(void)
>   void
>   replication_synchro_quorum_update(int value);
>   
> +void
> +replication_on_cluster_update(void);
> +
>   void
>   replication_init(void);
>   

-- 
Serge Petrenko



More information about the Tarantool-patches mailing list