[PATCH v2] replication: enter orphan mode on every erroneous config change

Vladimir Davydov vdavydov.dev at gmail.com
Tue Aug 27 15:38:32 MSK 2019


On Fri, Aug 23, 2019 at 06:33:19PM +0300, Serge Petrenko wrote:
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 66cd6d3a4..272d9addb 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -679,7 +679,10 @@ box_sync_replication(bool connect_quorum)
>  			applier_delete(appliers[i]); /* doesn't affect diag */
>  	});
>  
> -	replicaset_connect(appliers, count, connect_quorum);
> +	replicaset_connect(appliers, count);
> +
> +	if (connect_quorum)
> +		replicaset_check_quorum();

This function (box_sync_replication) is also called on bootstrap, when
we do want to throw an error in case we fail to connect to 'quorum'
replicas.

Anyway, why do you need to call this function? Wouldn't replicaset_sync
switch the instance to ro in this case? In other words, why can't we
simply pass connect_quorum=false to box_sync_replication called from
box_set_replication?

>  
>  	guard.is_active = false;
>  }
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 28f7acedc..0f6e0ff83 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -599,8 +599,7 @@ applier_on_connect_f(struct trigger *trigger, void *event)
>  }
>  
>  void
> -replicaset_connect(struct applier **appliers, int count,
> -		   bool connect_quorum)
> +replicaset_connect(struct applier **appliers, int count)
>  {
>  	if (count == 0) {
>  		/* Cleanup the replica set. */
> @@ -658,12 +657,6 @@ replicaset_connect(struct applier **appliers, int count,
>  	if (state.connected < count) {
>  		say_crit("failed to connect to %d out of %d replicas",
>  			 count - state.connected, count);
> -		/* Timeout or connection failure. */
> -		if (connect_quorum && state.connected < quorum) {
> -			diag_set(ClientError, ER_CFG, "replication",
> -				 "failed to connect to one or more replicas");
> -			goto error;
> -		}
>  	} else {
>  		say_info("connected to %d replicas", state.connected);
>  	}
> @@ -685,16 +678,14 @@ replicaset_connect(struct applier **appliers, int count,
>  	try {
>  		replicaset_update(appliers, count);
>  	} catch (Exception *e) {
> -		goto error;
> +		/* Destroy appliers */
> +		for (int i = 0; i < count; i++) {
> +			trigger_clear(&triggers[i].base);
> +			applier_stop(appliers[i]);
> +		}
> +		diag_raise();
>  	}
>  	return;
> -error:
> -	/* Destroy appliers */
> -	for (int i = 0; i < count; i++) {
> -		trigger_clear(&triggers[i].base);
> -		applier_stop(appliers[i]);
> -	}
> -	diag_raise();
>  }
>  
>  bool
> @@ -772,7 +763,6 @@ void
>  replicaset_sync(void)
>  {
>  	int quorum = replicaset_quorum();
> -
>  	if (quorum == 0) {
>  		/*
>  		 * Quorum is 0 or replication is not configured.
> @@ -816,8 +806,11 @@ replicaset_sync(void)
>  void
>  replicaset_check_quorum(void)
>  {
> -	if (replicaset.applier.synced >= replicaset_quorum())
> +	if (replicaset.applier.synced >= replicaset_quorum()) {
>  		box_set_orphan(false);
> +	} else if (replicaset.applier.connected < replicaset_quorum()) {
> +		box_set_orphan(true);
> +	}

AFAICS this isn't quite consistent with the comment to the function
prototype, which says:

  Check whether a replication quorum is formed.
  If it is, switch to write mode. Switch to readonly
  mode otherwise.

>  }
>  
>  void
> diff --git a/src/box/replication.h b/src/box/replication.h
> index 19f283c7d..999c8acd2 100644
> --- a/src/box/replication.h
> +++ b/src/box/replication.h
> @@ -379,8 +379,7 @@ replicaset_add(uint32_t replica_id, const struct tt_uuid *instance_uuid);
>   *                       appliers have successfully connected.
>   */
>  void
> -replicaset_connect(struct applier **appliers, int count,
> -		   bool connect_quorum);
> +replicaset_connect(struct applier **appliers, int count);

This change obsoletes the comment to this function.

>  
>  /**
>   * Check if the current instance fell too much behind its
> @@ -406,8 +405,9 @@ void
>  replicaset_sync(void);
>  
>  /**
> - * Check if a replication quorum has been formed and
> - * switch the server to the write mode if so.
> + * Check whether a replication quorum is formed.
> + * If it is, switch to write mode. Switch to readonly
> + * mode otherwise.
>   */
>  void
>  replicaset_check_quorum(void);



More information about the Tarantool-patches mailing list