Tarantool development patches archive
 help / color / mirror / Atom feed
From: Georgy Kirichenko <georgy@tarantool.org>
To: tarantool-patches@freelists.org
Cc: Serge Petrenko <sergepetrenko@tarantool.org>
Subject: [tarantool-patches] Re: [PATCH v2] replication: do not ignore replication_connect_quorum.
Date: Tue, 07 Aug 2018 17:18:09 +0300	[thread overview]
Message-ID: <1749394.pfGv1nC5fM@localhost> (raw)
In-Reply-To: <20180807124413.14947-1-sergepetrenko@tarantool.org>

[-- Attachment #1: Type: text/plain, Size: 15463 bytes --]

LGTM

On Tuesday, August 7, 2018 3:44:13 PM MSK Serge Petrenko wrote:
> On bootstrap and after replication reconfiguration
> replication_connect_quorum was ignored. The instance tried to connect to
> every replica listed in replication parameter, and errored if it wasn't
> possible.
> The patch alters this behaviour. The instance still tries to connect to
> every node listed in replication, but does not raise an error if it was
> able to connect to at least replication_connect_quorum instances.
> 
> Closes #3428
> ---
> https://github.com/tarantool/tarantool/issues/3428
> https://github.com/tarantool/tarantool/tree/sergepetrenko/gh-3428-replicatio
> n-connect-quorum
> 
> Changes in v2:
>   - change test/replication/ddl.lua instance file to fix
>     test failure on Travis.
> 
>  src/box/box.cc                           | 6 +++---
>  src/box/replication.cc                   | 8 +++-----
>  src/box/replication.h                    | 7 ++++---
>  test/replication-py/init_storage.test.py | 2 +-
>  test/replication-py/master.lua           | 2 ++
>  test/replication-py/replica.lua          | 2 ++
>  test/replication/autobootstrap.lua       | 3 ++-
>  test/replication/autobootstrap_guest.lua | 2 +-
>  test/replication/ddl.lua                 | 3 ++-
>  test/replication/errinj.result           | 6 +++---
>  test/replication/errinj.test.lua         | 6 +++---
>  test/replication/master.lua              | 1 +
>  test/replication/master_quorum.lua       | 3 ++-
>  test/replication/on_replace.lua          | 3 ++-
>  test/replication/quorum.lua              | 4 ++--
>  test/replication/rebootstrap.lua         | 2 +-
>  test/replication/replica_no_quorum.lua   | 3 ++-
>  test/replication/replica_timeout.lua     | 3 ++-
>  test/replication/replica_uuid_ro.lua     | 2 +-
>  19 files changed, 39 insertions(+), 29 deletions(-)
> 
> diff --git a/src/box/box.cc b/src/box/box.cc
> index e3eb2738f..f8731f464 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -595,7 +595,7 @@ cfg_get_replication(int *p_count)
>   * don't start appliers.
>   */
>  static void
> -box_sync_replication(double timeout, bool connect_all)
> +box_sync_replication(double timeout, bool reach_quorum)
>  {
>  	int count = 0;
>  	struct applier **appliers = cfg_get_replication(&count);
> @@ -607,7 +607,7 @@ box_sync_replication(double timeout, bool connect_all)
>  			applier_delete(appliers[i]); /* doesn't affect diag */
>  	});
> 
> -	replicaset_connect(appliers, count, timeout, connect_all);
> +	replicaset_connect(appliers, count, timeout, reach_quorum);
> 
>  	guard.is_active = false;
>  }
> @@ -1888,7 +1888,7 @@ box_cfg_xc(void)
>  		 * receive the same replica set UUID when a new cluster
>  		 * is deployed.
>  		 */
> -		box_sync_replication(TIMEOUT_INFINITY, true);
> +		box_sync_replication(replication_connect_timeout, true);
>  		/* Bootstrap a new master */
>  		bootstrap(&replicaset_uuid, &is_bootstrap_leader);
>  	}
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 528fe4459..a6c60220f 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -46,7 +46,7 @@ struct tt_uuid INSTANCE_UUID;
>  struct tt_uuid REPLICASET_UUID;
> 
>  double replication_timeout = 1.0; /* seconds */
> -double replication_connect_timeout = 4.0; /* seconds */
> +double replication_connect_timeout = 10.0; /* seconds */
>  int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL;
>  double replication_sync_lag = 10.0; /* seconds */
> 
> @@ -540,7 +540,7 @@ applier_on_connect_f(struct trigger *trigger, void
> *event)
> 
>  void
>  replicaset_connect(struct applier **appliers, int count,
> -		   double timeout, bool connect_all)
> +		   double timeout, bool reach_quorum)
>  {
>  	if (count == 0) {
>  		/* Cleanup the replica set. */
> @@ -587,15 +587,13 @@ replicaset_connect(struct applier **appliers, int
> count, double wait_start = ev_monotonic_now(loop());
>  		if (fiber_cond_wait_timeout(&state.wakeup, timeout) != 0)
>  			break;
> -		if (state.failed > 0 && connect_all)
> -			break;
>  		timeout -= ev_monotonic_now(loop()) - wait_start;
>  	}
>  	if (state.connected < count) {
>  		say_crit("failed to connect to %d out of %d replicas",
>  			 count - state.connected, count);
>  		/* Timeout or connection failure. */
> -		if (connect_all)
> +		if (reach_quorum && state.connected < replication_connect_quorum)
>  			goto error;
>  	} else {
>  		say_verbose("connected to %d replicas", state.connected);
> diff --git a/src/box/replication.h b/src/box/replication.h
> index 95122eb45..c16c8b56c 100644
> --- a/src/box/replication.h
> +++ b/src/box/replication.h
> @@ -357,12 +357,13 @@ replicaset_add(uint32_t replica_id, const struct
> tt_uuid *instance_uuid); * \param appliers the array of appliers
>   * \param count size of appliers array
>   * \param timeout connection timeout
> - * \param connect_all if this flag is set, fail unless all
> - *                    appliers have successfully connected
> + * \param reach_quorum if this flag is set, fail unless at
> + *		       least replication_connect_quorum
> + *		       appliers have successfully connected.
>   */
>  void
>  replicaset_connect(struct applier **appliers, int count,
> -		   double timeout, bool connect_all);
> +		   double timeout, bool reach_quorum);
> 
>  /**
>   * Resume all appliers registered with the replica set.
> diff --git a/test/replication-py/init_storage.test.py
> b/test/replication-py/init_storage.test.py index 0911a02c0..32b4639f1
> 100644
> --- a/test/replication-py/init_storage.test.py
> +++ b/test/replication-py/init_storage.test.py
> @@ -57,7 +57,7 @@ print
> '-------------------------------------------------------------'
> 
>  server.stop()
>  replica = TarantoolServer(server.ini)
> -replica.script = 'replication/replica.lua'
> +replica.script = 'replication-py/replica.lua'
>  replica.vardir = server.vardir #os.path.join(server.vardir, 'replica')
>  replica.rpl_master = master
>  replica.deploy(wait=False)
> diff --git a/test/replication-py/master.lua b/test/replication-py/master.lua
> index 0f9f7a6f0..51283efdf 100644
> --- a/test/replication-py/master.lua
> +++ b/test/replication-py/master.lua
> @@ -3,6 +3,8 @@ os = require('os')
>  box.cfg({
>      listen              = os.getenv("LISTEN"),
>      memtx_memory        = 107374182,
> +    replication_connect_timeout = 1.0,
> +    replication_timeout = 0.3
>  })
> 
>  require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication-py/replica.lua
> b/test/replication-py/replica.lua index 278291bba..b9d193b70 100644
> --- a/test/replication-py/replica.lua
> +++ b/test/replication-py/replica.lua
> @@ -7,6 +7,8 @@ box.cfg({
>      listen              = os.getenv("LISTEN"),
>      replication         = os.getenv("MASTER"),
>      memtx_memory        = 107374182,
> +    replication_connect_timeout = 1.0,
> +    replication_timeout = 0.3
>  })
> 
>  box_cfg_done = true
> diff --git a/test/replication/autobootstrap.lua
> b/test/replication/autobootstrap.lua index 4f55417ae..8fc6809de 100644
> --- a/test/replication/autobootstrap.lua
> +++ b/test/replication/autobootstrap.lua
> @@ -21,7 +21,8 @@ box.cfg({
>          USER..':'..PASSWORD..'@'..instance_uri(2);
>          USER..':'..PASSWORD..'@'..instance_uri(3);
>      };
> -    replication_connect_timeout = 0.5,
> +    replication_connect_timeout = 3.0;
> +    replication_timeout = 0.5;
>  })
> 
>  box.once("bootstrap", function()
> diff --git a/test/replication/autobootstrap_guest.lua
> b/test/replication/autobootstrap_guest.lua index 40fef2c7a..7cd921e3c
> 100644
> --- a/test/replication/autobootstrap_guest.lua
> +++ b/test/replication/autobootstrap_guest.lua
> @@ -20,7 +20,7 @@ box.cfg({
>          instance_uri(2);
>          instance_uri(3);
>      };
> -    replication_connect_timeout = 0.5,
> +    replication_connect_timeout = 5,
>  })
> 
>  box.once("bootstrap", function()
> diff --git a/test/replication/ddl.lua b/test/replication/ddl.lua
> index 694f40eac..85403e35b 100644
> --- a/test/replication/ddl.lua
> +++ b/test/replication/ddl.lua
> @@ -22,7 +22,8 @@ box.cfg({
>          USER..':'..PASSWORD..'@'..instance_uri(3);
>          USER..':'..PASSWORD..'@'..instance_uri(4);
>      };
> -    replication_connect_timeout = 0.5,
> +    replication_timeout = 0.1,
> +    replication_connect_timeout = 2.0,
>  })
> 
>  box.once("bootstrap", function()
> diff --git a/test/replication/errinj.result b/test/replication/errinj.result
> index ca8af2988..19d7d9a05 100644
> --- a/test/replication/errinj.result
> +++ b/test/replication/errinj.result
> @@ -418,7 +418,7 @@ test_run:cmd("create server replica_timeout with
> rpl_master=default, script='rep ---
>  - true
>  ...
> -test_run:cmd("start server replica_timeout with args='0.01'")
> +test_run:cmd("start server replica_timeout with args='0.1, 0.5'")
>  ---
>  - true
>  ...
> @@ -474,7 +474,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0)
>  ...
>  -- Check replica's ACKs don't prevent the master from sending
>  -- heartbeat messages (gh-3160).
> -test_run:cmd("start server replica_timeout with args='0.009'")
> +test_run:cmd("start server replica_timeout with args='0.009, 0.5'")
>  ---
>  - true
>  ...
> @@ -522,7 +522,7 @@ for i = 0, 9999 do box.space.test:replace({i, 4, 5,
> 'test'}) end -- during the join stage, i.e. a replica with a minuscule
>  -- timeout successfully bootstraps and breaks connection only
>  -- after subscribe.
> -test_run:cmd("start server replica_timeout with args='0.00001'")
> +test_run:cmd("start server replica_timeout with args='0.00001, 0.5'")
>  ---
>  - true
>  ...
> diff --git a/test/replication/errinj.test.lua
> b/test/replication/errinj.test.lua index 463d89a8f..f00b98eed 100644
> --- a/test/replication/errinj.test.lua
> +++ b/test/replication/errinj.test.lua
> @@ -173,7 +173,7 @@ errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0)
>  box.cfg{replication_timeout = 0.01}
> 
>  test_run:cmd("create server replica_timeout with rpl_master=default,
> script='replication/replica_timeout.lua'") -test_run:cmd("start server
> replica_timeout with args='0.01'")
> +test_run:cmd("start server replica_timeout with args='0.1, 0.5'")
>  test_run:cmd("switch replica_timeout")
> 
>  fiber = require('fiber')
> @@ -199,7 +199,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0)
>  -- Check replica's ACKs don't prevent the master from sending
>  -- heartbeat messages (gh-3160).
> 
> -test_run:cmd("start server replica_timeout with args='0.009'")
> +test_run:cmd("start server replica_timeout with args='0.009, 0.5'")
>  test_run:cmd("switch replica_timeout")
> 
>  fiber = require('fiber')
> @@ -219,7 +219,7 @@ for i = 0, 9999 do box.space.test:replace({i, 4, 5,
> 'test'}) end -- during the join stage, i.e. a replica with a minuscule
>  -- timeout successfully bootstraps and breaks connection only
>  -- after subscribe.
> -test_run:cmd("start server replica_timeout with args='0.00001'")
> +test_run:cmd("start server replica_timeout with args='0.00001, 0.5'")
>  test_run:cmd("switch replica_timeout")
>  fiber = require('fiber')
>  while box.info.replication[1].upstream.message ~= 'timed out' do
> fiber.sleep(0.0001) end diff --git a/test/replication/master.lua
> b/test/replication/master.lua index 6d431aaeb..9b96b7891 100644
> --- a/test/replication/master.lua
> +++ b/test/replication/master.lua
> @@ -4,6 +4,7 @@ box.cfg({
>      listen              = os.getenv("LISTEN"),
>      memtx_memory        = 107374182,
>      replication_connect_timeout = 0.5,
> +    replication_timeout = 0.1
>  })
> 
>  require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication/master_quorum.lua
> b/test/replication/master_quorum.lua index fb5f7ec2b..6e0429f65 100644
> --- a/test/replication/master_quorum.lua
> +++ b/test/replication/master_quorum.lua
> @@ -20,7 +20,8 @@ box.cfg({
>          instance_uri(2);
>      };
>      replication_connect_quorum = 0;
> -    replication_connect_timeout = 0.1;
> +    replication_timeout = 0.5;
> +    replication_connect_timeout = 2.0;
>  })
> 
>  test_run = require('test_run').new()
> diff --git a/test/replication/on_replace.lua
> b/test/replication/on_replace.lua index 03f15d94c..bafead48d 100644
> --- a/test/replication/on_replace.lua
> +++ b/test/replication/on_replace.lua
> @@ -20,7 +20,8 @@ box.cfg({
>          USER..':'..PASSWORD..'@'..instance_uri(1);
>          USER..':'..PASSWORD..'@'..instance_uri(2);
>      };
> -    replication_connect_timeout = 0.5,
> +    replication_timeout = 0.5,
> +    replication_connect_timeout = 1.0,
>  })
> 
>  env = require('test_run')
> diff --git a/test/replication/quorum.lua b/test/replication/quorum.lua
> index 9c7bf5c93..7f85d7b13 100644
> --- a/test/replication/quorum.lua
> +++ b/test/replication/quorum.lua
> @@ -15,8 +15,8 @@ require('console').listen(os.getenv('ADMIN'))
>  box.cfg({
>      listen = instance_uri(INSTANCE_ID);
>      replication_timeout = 0.05;
> -    replication_sync_lag = 0.01;
> -    replication_connect_timeout = 0.1;
> +    replication_sync_lag = 0.1;
> +    replication_connect_timeout = 3.0;
>      replication_connect_quorum = 3;
>      replication = {
>          instance_uri(1);
> diff --git a/test/replication/rebootstrap.lua
> b/test/replication/rebootstrap.lua index e743577e4..f1e8d69e9 100644
> --- a/test/replication/rebootstrap.lua
> +++ b/test/replication/rebootstrap.lua
> @@ -15,7 +15,7 @@ box.cfg({
>      listen = instance_uri(INSTANCE_ID),
>      instance_uuid = '12345678-abcd-1234-abcd-123456789ef' .. INSTANCE_ID,
>      replication_timeout = 0.1,
> -    replication_connect_timeout = 0.5,
> +    replication_connect_timeout = 2.0,
>      replication = {
>          instance_uri(1);
>          instance_uri(2);
> diff --git a/test/replication/replica_no_quorum.lua
> b/test/replication/replica_no_quorum.lua index b9edeea94..c30c043cc 100644
> --- a/test/replication/replica_no_quorum.lua
> +++ b/test/replication/replica_no_quorum.lua
> @@ -5,7 +5,8 @@ box.cfg({
>      replication         = os.getenv("MASTER"),
>      memtx_memory        = 107374182,
>      replication_connect_quorum = 0,
> -    replication_connect_timeout = 0.1,
> +    replication_timeout = 0.1,
> +    replication_connect_timeout = 0.5,
>  })
> 
>  require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication/replica_timeout.lua
> b/test/replication/replica_timeout.lua index 64f119763..51c718360 100644
> --- a/test/replication/replica_timeout.lua
> +++ b/test/replication/replica_timeout.lua
> @@ -1,13 +1,14 @@
>  #!/usr/bin/env tarantool
> 
>  local TIMEOUT = tonumber(arg[1])
> +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or TIMEOUT * 3
> 
>  box.cfg({
>      listen              = os.getenv("LISTEN"),
>      replication         = os.getenv("MASTER"),
>      memtx_memory        = 107374182,
>      replication_timeout = TIMEOUT,
> -    replication_connect_timeout = TIMEOUT * 3,
> +    replication_connect_timeout = CON_TIMEOUT,
>  })
> 
>  require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication/replica_uuid_ro.lua
> b/test/replication/replica_uuid_ro.lua index 8e1c6cc47..ff70da144 100644
> --- a/test/replication/replica_uuid_ro.lua
> +++ b/test/replication/replica_uuid_ro.lua
> @@ -22,7 +22,7 @@ box.cfg({
>          USER..':'..PASSWORD..'@'..instance_uri(2);
>      };
>      read_only = (INSTANCE_ID ~= '1' and true or false);
> -    replication_connect_timeout = 0.5,
> +    replication_connect_timeout = 5,
>  })
> 
>  box.once("bootstrap", function()


[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

  reply	other threads:[~2018-08-07 14:18 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-08-07 12:44 [tarantool-patches] " Serge Petrenko
2018-08-07 14:18 ` Georgy Kirichenko [this message]
2018-08-07 17:28 ` Vladimir Davydov
2018-08-09  7:50   ` [tarantool-patches] " Sergey Petrenko
2018-08-09  9:11     ` Vladimir Davydov
2018-08-09 12:20       ` Vladimir Davydov
2018-08-09 12:39         ` Serge Petrenko
2018-08-10  9:14           ` [PATCH v3] " Serge Petrenko
2018-08-10 10:48             ` Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1749394.pfGv1nC5fM@localhost \
    --to=georgy@tarantool.org \
    --cc=sergepetrenko@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='[tarantool-patches] Re: [PATCH v2] replication: do not ignore replication_connect_quorum.' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox