Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: Serge Petrenko <sergepetrenko@tarantool.org>
Cc: tarantool-patches@freelists.org
Subject: Re: [PATCH v2] replication: enter orphan mode on every erroneous config change
Date: Tue, 27 Aug 2019 15:38:32 +0300	[thread overview]
Message-ID: <20190827123832.GP13834@esperanza> (raw)
In-Reply-To: <20190823153319.11958-1-sergepetrenko@tarantool.org>

On Fri, Aug 23, 2019 at 06:33:19PM +0300, Serge Petrenko wrote:
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 66cd6d3a4..272d9addb 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -679,7 +679,10 @@ box_sync_replication(bool connect_quorum)
>  			applier_delete(appliers[i]); /* doesn't affect diag */
>  	});
>  
> -	replicaset_connect(appliers, count, connect_quorum);
> +	replicaset_connect(appliers, count);
> +
> +	if (connect_quorum)
> +		replicaset_check_quorum();

This function (box_sync_replication) is also called on bootstrap, when
we do want to throw an error in case we fail to connect to 'quorum'
replicas.

Anyway, why do you need to call this function? Wouldn't replicaset_sync
switch the instance to ro in this case? In other words, why can't we
simply pass connect_quorum=false to box_sync_replication called from
box_set_replication?

>  
>  	guard.is_active = false;
>  }
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 28f7acedc..0f6e0ff83 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -599,8 +599,7 @@ applier_on_connect_f(struct trigger *trigger, void *event)
>  }
>  
>  void
> -replicaset_connect(struct applier **appliers, int count,
> -		   bool connect_quorum)
> +replicaset_connect(struct applier **appliers, int count)
>  {
>  	if (count == 0) {
>  		/* Cleanup the replica set. */
> @@ -658,12 +657,6 @@ replicaset_connect(struct applier **appliers, int count,
>  	if (state.connected < count) {
>  		say_crit("failed to connect to %d out of %d replicas",
>  			 count - state.connected, count);
> -		/* Timeout or connection failure. */
> -		if (connect_quorum && state.connected < quorum) {
> -			diag_set(ClientError, ER_CFG, "replication",
> -				 "failed to connect to one or more replicas");
> -			goto error;
> -		}
>  	} else {
>  		say_info("connected to %d replicas", state.connected);
>  	}
> @@ -685,16 +678,14 @@ replicaset_connect(struct applier **appliers, int count,
>  	try {
>  		replicaset_update(appliers, count);
>  	} catch (Exception *e) {
> -		goto error;
> +		/* Destroy appliers */
> +		for (int i = 0; i < count; i++) {
> +			trigger_clear(&triggers[i].base);
> +			applier_stop(appliers[i]);
> +		}
> +		diag_raise();
>  	}
>  	return;
> -error:
> -	/* Destroy appliers */
> -	for (int i = 0; i < count; i++) {
> -		trigger_clear(&triggers[i].base);
> -		applier_stop(appliers[i]);
> -	}
> -	diag_raise();
>  }
>  
>  bool
> @@ -772,7 +763,6 @@ void
>  replicaset_sync(void)
>  {
>  	int quorum = replicaset_quorum();
> -
>  	if (quorum == 0) {
>  		/*
>  		 * Quorum is 0 or replication is not configured.
> @@ -816,8 +806,11 @@ replicaset_sync(void)
>  void
>  replicaset_check_quorum(void)
>  {
> -	if (replicaset.applier.synced >= replicaset_quorum())
> +	if (replicaset.applier.synced >= replicaset_quorum()) {
>  		box_set_orphan(false);
> +	} else if (replicaset.applier.connected < replicaset_quorum()) {
> +		box_set_orphan(true);
> +	}

AFAICS this isn't quite consistent with the comment to the function
prototype, which says:

  Check whether a replication quorum is formed.
  If it is, switch to write mode. Switch to readonly
  mode otherwise.

>  }
>  
>  void
> diff --git a/src/box/replication.h b/src/box/replication.h
> index 19f283c7d..999c8acd2 100644
> --- a/src/box/replication.h
> +++ b/src/box/replication.h
> @@ -379,8 +379,7 @@ replicaset_add(uint32_t replica_id, const struct tt_uuid *instance_uuid);
>   *                       appliers have successfully connected.
>   */
>  void
> -replicaset_connect(struct applier **appliers, int count,
> -		   bool connect_quorum);
> +replicaset_connect(struct applier **appliers, int count);

This change obsoletes the comment to this function.

>  
>  /**
>   * Check if the current instance fell too much behind its
> @@ -406,8 +405,9 @@ void
>  replicaset_sync(void);
>  
>  /**
> - * Check if a replication quorum has been formed and
> - * switch the server to the write mode if so.
> + * Check whether a replication quorum is formed.
> + * If it is, switch to write mode. Switch to readonly
> + * mode otherwise.
>   */
>  void
>  replicaset_check_quorum(void);

      reply	other threads:[~2019-08-27 12:38 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-23 15:33 Serge Petrenko
2019-08-27 12:38 ` Vladimir Davydov [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190827123832.GP13834@esperanza \
    --to=vdavydov.dev@gmail.com \
    --cc=sergepetrenko@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [PATCH v2] replication: enter orphan mode on every erroneous config change' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox