From: Georgy Kirichenko <georgy@tarantool.org>
To: tarantool-patches@freelists.org
Cc: Serge Petrenko <sergepetrenko@tarantool.org>
Subject: [tarantool-patches] Re: [PATCH v2] replication: do not ignore replication_connect_quorum.
Date: Tue, 07 Aug 2018 17:18:09 +0300 [thread overview]
Message-ID: <1749394.pfGv1nC5fM@localhost> (raw)
In-Reply-To: <20180807124413.14947-1-sergepetrenko@tarantool.org>
[-- Attachment #1: Type: text/plain, Size: 15463 bytes --]
LGTM
On Tuesday, August 7, 2018 3:44:13 PM MSK Serge Petrenko wrote:
> On bootstrap and after replication reconfiguration
> replication_connect_quorum was ignored. The instance tried to connect to
> every replica listed in replication parameter, and errored if it wasn't
> possible.
> The patch alters this behaviour. The instance still tries to connect to
> every node listed in replication, but does not raise an error if it was
> able to connect to at least replication_connect_quorum instances.
>
> Closes #3428
> ---
> https://github.com/tarantool/tarantool/issues/3428
> https://github.com/tarantool/tarantool/tree/sergepetrenko/gh-3428-replicatio
> n-connect-quorum
>
> Changes in v2:
> - change test/replication/ddl.lua instance file to fix
> test failure on Travis.
>
> src/box/box.cc | 6 +++---
> src/box/replication.cc | 8 +++-----
> src/box/replication.h | 7 ++++---
> test/replication-py/init_storage.test.py | 2 +-
> test/replication-py/master.lua | 2 ++
> test/replication-py/replica.lua | 2 ++
> test/replication/autobootstrap.lua | 3 ++-
> test/replication/autobootstrap_guest.lua | 2 +-
> test/replication/ddl.lua | 3 ++-
> test/replication/errinj.result | 6 +++---
> test/replication/errinj.test.lua | 6 +++---
> test/replication/master.lua | 1 +
> test/replication/master_quorum.lua | 3 ++-
> test/replication/on_replace.lua | 3 ++-
> test/replication/quorum.lua | 4 ++--
> test/replication/rebootstrap.lua | 2 +-
> test/replication/replica_no_quorum.lua | 3 ++-
> test/replication/replica_timeout.lua | 3 ++-
> test/replication/replica_uuid_ro.lua | 2 +-
> 19 files changed, 39 insertions(+), 29 deletions(-)
>
> diff --git a/src/box/box.cc b/src/box/box.cc
> index e3eb2738f..f8731f464 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -595,7 +595,7 @@ cfg_get_replication(int *p_count)
> * don't start appliers.
> */
> static void
> -box_sync_replication(double timeout, bool connect_all)
> +box_sync_replication(double timeout, bool reach_quorum)
> {
> int count = 0;
> struct applier **appliers = cfg_get_replication(&count);
> @@ -607,7 +607,7 @@ box_sync_replication(double timeout, bool connect_all)
> applier_delete(appliers[i]); /* doesn't affect diag */
> });
>
> - replicaset_connect(appliers, count, timeout, connect_all);
> + replicaset_connect(appliers, count, timeout, reach_quorum);
>
> guard.is_active = false;
> }
> @@ -1888,7 +1888,7 @@ box_cfg_xc(void)
> * receive the same replica set UUID when a new cluster
> * is deployed.
> */
> - box_sync_replication(TIMEOUT_INFINITY, true);
> + box_sync_replication(replication_connect_timeout, true);
> /* Bootstrap a new master */
> bootstrap(&replicaset_uuid, &is_bootstrap_leader);
> }
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 528fe4459..a6c60220f 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -46,7 +46,7 @@ struct tt_uuid INSTANCE_UUID;
> struct tt_uuid REPLICASET_UUID;
>
> double replication_timeout = 1.0; /* seconds */
> -double replication_connect_timeout = 4.0; /* seconds */
> +double replication_connect_timeout = 10.0; /* seconds */
> int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL;
> double replication_sync_lag = 10.0; /* seconds */
>
> @@ -540,7 +540,7 @@ applier_on_connect_f(struct trigger *trigger, void
> *event)
>
> void
> replicaset_connect(struct applier **appliers, int count,
> - double timeout, bool connect_all)
> + double timeout, bool reach_quorum)
> {
> if (count == 0) {
> /* Cleanup the replica set. */
> @@ -587,15 +587,13 @@ replicaset_connect(struct applier **appliers, int
> count, double wait_start = ev_monotonic_now(loop());
> if (fiber_cond_wait_timeout(&state.wakeup, timeout) != 0)
> break;
> - if (state.failed > 0 && connect_all)
> - break;
> timeout -= ev_monotonic_now(loop()) - wait_start;
> }
> if (state.connected < count) {
> say_crit("failed to connect to %d out of %d replicas",
> count - state.connected, count);
> /* Timeout or connection failure. */
> - if (connect_all)
> + if (reach_quorum && state.connected < replication_connect_quorum)
> goto error;
> } else {
> say_verbose("connected to %d replicas", state.connected);
> diff --git a/src/box/replication.h b/src/box/replication.h
> index 95122eb45..c16c8b56c 100644
> --- a/src/box/replication.h
> +++ b/src/box/replication.h
> @@ -357,12 +357,13 @@ replicaset_add(uint32_t replica_id, const struct
> tt_uuid *instance_uuid); * \param appliers the array of appliers
> * \param count size of appliers array
> * \param timeout connection timeout
> - * \param connect_all if this flag is set, fail unless all
> - * appliers have successfully connected
> + * \param reach_quorum if this flag is set, fail unless at
> + * least replication_connect_quorum
> + * appliers have successfully connected.
> */
> void
> replicaset_connect(struct applier **appliers, int count,
> - double timeout, bool connect_all);
> + double timeout, bool reach_quorum);
>
> /**
> * Resume all appliers registered with the replica set.
> diff --git a/test/replication-py/init_storage.test.py
> b/test/replication-py/init_storage.test.py index 0911a02c0..32b4639f1
> 100644
> --- a/test/replication-py/init_storage.test.py
> +++ b/test/replication-py/init_storage.test.py
> @@ -57,7 +57,7 @@ print
> '-------------------------------------------------------------'
>
> server.stop()
> replica = TarantoolServer(server.ini)
> -replica.script = 'replication/replica.lua'
> +replica.script = 'replication-py/replica.lua'
> replica.vardir = server.vardir #os.path.join(server.vardir, 'replica')
> replica.rpl_master = master
> replica.deploy(wait=False)
> diff --git a/test/replication-py/master.lua b/test/replication-py/master.lua
> index 0f9f7a6f0..51283efdf 100644
> --- a/test/replication-py/master.lua
> +++ b/test/replication-py/master.lua
> @@ -3,6 +3,8 @@ os = require('os')
> box.cfg({
> listen = os.getenv("LISTEN"),
> memtx_memory = 107374182,
> + replication_connect_timeout = 1.0,
> + replication_timeout = 0.3
> })
>
> require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication-py/replica.lua
> b/test/replication-py/replica.lua index 278291bba..b9d193b70 100644
> --- a/test/replication-py/replica.lua
> +++ b/test/replication-py/replica.lua
> @@ -7,6 +7,8 @@ box.cfg({
> listen = os.getenv("LISTEN"),
> replication = os.getenv("MASTER"),
> memtx_memory = 107374182,
> + replication_connect_timeout = 1.0,
> + replication_timeout = 0.3
> })
>
> box_cfg_done = true
> diff --git a/test/replication/autobootstrap.lua
> b/test/replication/autobootstrap.lua index 4f55417ae..8fc6809de 100644
> --- a/test/replication/autobootstrap.lua
> +++ b/test/replication/autobootstrap.lua
> @@ -21,7 +21,8 @@ box.cfg({
> USER..':'..PASSWORD..'@'..instance_uri(2);
> USER..':'..PASSWORD..'@'..instance_uri(3);
> };
> - replication_connect_timeout = 0.5,
> + replication_connect_timeout = 3.0;
> + replication_timeout = 0.5;
> })
>
> box.once("bootstrap", function()
> diff --git a/test/replication/autobootstrap_guest.lua
> b/test/replication/autobootstrap_guest.lua index 40fef2c7a..7cd921e3c
> 100644
> --- a/test/replication/autobootstrap_guest.lua
> +++ b/test/replication/autobootstrap_guest.lua
> @@ -20,7 +20,7 @@ box.cfg({
> instance_uri(2);
> instance_uri(3);
> };
> - replication_connect_timeout = 0.5,
> + replication_connect_timeout = 5,
> })
>
> box.once("bootstrap", function()
> diff --git a/test/replication/ddl.lua b/test/replication/ddl.lua
> index 694f40eac..85403e35b 100644
> --- a/test/replication/ddl.lua
> +++ b/test/replication/ddl.lua
> @@ -22,7 +22,8 @@ box.cfg({
> USER..':'..PASSWORD..'@'..instance_uri(3);
> USER..':'..PASSWORD..'@'..instance_uri(4);
> };
> - replication_connect_timeout = 0.5,
> + replication_timeout = 0.1,
> + replication_connect_timeout = 2.0,
> })
>
> box.once("bootstrap", function()
> diff --git a/test/replication/errinj.result b/test/replication/errinj.result
> index ca8af2988..19d7d9a05 100644
> --- a/test/replication/errinj.result
> +++ b/test/replication/errinj.result
> @@ -418,7 +418,7 @@ test_run:cmd("create server replica_timeout with
> rpl_master=default, script='rep ---
> - true
> ...
> -test_run:cmd("start server replica_timeout with args='0.01'")
> +test_run:cmd("start server replica_timeout with args='0.1, 0.5'")
> ---
> - true
> ...
> @@ -474,7 +474,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0)
> ...
> -- Check replica's ACKs don't prevent the master from sending
> -- heartbeat messages (gh-3160).
> -test_run:cmd("start server replica_timeout with args='0.009'")
> +test_run:cmd("start server replica_timeout with args='0.009, 0.5'")
> ---
> - true
> ...
> @@ -522,7 +522,7 @@ for i = 0, 9999 do box.space.test:replace({i, 4, 5,
> 'test'}) end -- during the join stage, i.e. a replica with a minuscule
> -- timeout successfully bootstraps and breaks connection only
> -- after subscribe.
> -test_run:cmd("start server replica_timeout with args='0.00001'")
> +test_run:cmd("start server replica_timeout with args='0.00001, 0.5'")
> ---
> - true
> ...
> diff --git a/test/replication/errinj.test.lua
> b/test/replication/errinj.test.lua index 463d89a8f..f00b98eed 100644
> --- a/test/replication/errinj.test.lua
> +++ b/test/replication/errinj.test.lua
> @@ -173,7 +173,7 @@ errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0)
> box.cfg{replication_timeout = 0.01}
>
> test_run:cmd("create server replica_timeout with rpl_master=default,
> script='replication/replica_timeout.lua'") -test_run:cmd("start server
> replica_timeout with args='0.01'")
> +test_run:cmd("start server replica_timeout with args='0.1, 0.5'")
> test_run:cmd("switch replica_timeout")
>
> fiber = require('fiber')
> @@ -199,7 +199,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0)
> -- Check replica's ACKs don't prevent the master from sending
> -- heartbeat messages (gh-3160).
>
> -test_run:cmd("start server replica_timeout with args='0.009'")
> +test_run:cmd("start server replica_timeout with args='0.009, 0.5'")
> test_run:cmd("switch replica_timeout")
>
> fiber = require('fiber')
> @@ -219,7 +219,7 @@ for i = 0, 9999 do box.space.test:replace({i, 4, 5,
> 'test'}) end -- during the join stage, i.e. a replica with a minuscule
> -- timeout successfully bootstraps and breaks connection only
> -- after subscribe.
> -test_run:cmd("start server replica_timeout with args='0.00001'")
> +test_run:cmd("start server replica_timeout with args='0.00001, 0.5'")
> test_run:cmd("switch replica_timeout")
> fiber = require('fiber')
> while box.info.replication[1].upstream.message ~= 'timed out' do
> fiber.sleep(0.0001) end diff --git a/test/replication/master.lua
> b/test/replication/master.lua index 6d431aaeb..9b96b7891 100644
> --- a/test/replication/master.lua
> +++ b/test/replication/master.lua
> @@ -4,6 +4,7 @@ box.cfg({
> listen = os.getenv("LISTEN"),
> memtx_memory = 107374182,
> replication_connect_timeout = 0.5,
> + replication_timeout = 0.1
> })
>
> require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication/master_quorum.lua
> b/test/replication/master_quorum.lua index fb5f7ec2b..6e0429f65 100644
> --- a/test/replication/master_quorum.lua
> +++ b/test/replication/master_quorum.lua
> @@ -20,7 +20,8 @@ box.cfg({
> instance_uri(2);
> };
> replication_connect_quorum = 0;
> - replication_connect_timeout = 0.1;
> + replication_timeout = 0.5;
> + replication_connect_timeout = 2.0;
> })
>
> test_run = require('test_run').new()
> diff --git a/test/replication/on_replace.lua
> b/test/replication/on_replace.lua index 03f15d94c..bafead48d 100644
> --- a/test/replication/on_replace.lua
> +++ b/test/replication/on_replace.lua
> @@ -20,7 +20,8 @@ box.cfg({
> USER..':'..PASSWORD..'@'..instance_uri(1);
> USER..':'..PASSWORD..'@'..instance_uri(2);
> };
> - replication_connect_timeout = 0.5,
> + replication_timeout = 0.5,
> + replication_connect_timeout = 1.0,
> })
>
> env = require('test_run')
> diff --git a/test/replication/quorum.lua b/test/replication/quorum.lua
> index 9c7bf5c93..7f85d7b13 100644
> --- a/test/replication/quorum.lua
> +++ b/test/replication/quorum.lua
> @@ -15,8 +15,8 @@ require('console').listen(os.getenv('ADMIN'))
> box.cfg({
> listen = instance_uri(INSTANCE_ID);
> replication_timeout = 0.05;
> - replication_sync_lag = 0.01;
> - replication_connect_timeout = 0.1;
> + replication_sync_lag = 0.1;
> + replication_connect_timeout = 3.0;
> replication_connect_quorum = 3;
> replication = {
> instance_uri(1);
> diff --git a/test/replication/rebootstrap.lua
> b/test/replication/rebootstrap.lua index e743577e4..f1e8d69e9 100644
> --- a/test/replication/rebootstrap.lua
> +++ b/test/replication/rebootstrap.lua
> @@ -15,7 +15,7 @@ box.cfg({
> listen = instance_uri(INSTANCE_ID),
> instance_uuid = '12345678-abcd-1234-abcd-123456789ef' .. INSTANCE_ID,
> replication_timeout = 0.1,
> - replication_connect_timeout = 0.5,
> + replication_connect_timeout = 2.0,
> replication = {
> instance_uri(1);
> instance_uri(2);
> diff --git a/test/replication/replica_no_quorum.lua
> b/test/replication/replica_no_quorum.lua index b9edeea94..c30c043cc 100644
> --- a/test/replication/replica_no_quorum.lua
> +++ b/test/replication/replica_no_quorum.lua
> @@ -5,7 +5,8 @@ box.cfg({
> replication = os.getenv("MASTER"),
> memtx_memory = 107374182,
> replication_connect_quorum = 0,
> - replication_connect_timeout = 0.1,
> + replication_timeout = 0.1,
> + replication_connect_timeout = 0.5,
> })
>
> require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication/replica_timeout.lua
> b/test/replication/replica_timeout.lua index 64f119763..51c718360 100644
> --- a/test/replication/replica_timeout.lua
> +++ b/test/replication/replica_timeout.lua
> @@ -1,13 +1,14 @@
> #!/usr/bin/env tarantool
>
> local TIMEOUT = tonumber(arg[1])
> +local CON_TIMEOUT = arg[2] and tonumber(arg[2]) or TIMEOUT * 3
>
> box.cfg({
> listen = os.getenv("LISTEN"),
> replication = os.getenv("MASTER"),
> memtx_memory = 107374182,
> replication_timeout = TIMEOUT,
> - replication_connect_timeout = TIMEOUT * 3,
> + replication_connect_timeout = CON_TIMEOUT,
> })
>
> require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication/replica_uuid_ro.lua
> b/test/replication/replica_uuid_ro.lua index 8e1c6cc47..ff70da144 100644
> --- a/test/replication/replica_uuid_ro.lua
> +++ b/test/replication/replica_uuid_ro.lua
> @@ -22,7 +22,7 @@ box.cfg({
> USER..':'..PASSWORD..'@'..instance_uri(2);
> };
> read_only = (INSTANCE_ID ~= '1' and true or false);
> - replication_connect_timeout = 0.5,
> + replication_connect_timeout = 5,
> })
>
> box.once("bootstrap", function()
[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
next prev parent reply other threads:[~2018-08-07 14:18 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-08-07 12:44 [tarantool-patches] " Serge Petrenko
2018-08-07 14:18 ` Georgy Kirichenko [this message]
2018-08-07 17:28 ` Vladimir Davydov
2018-08-09 7:50 ` [tarantool-patches] " Sergey Petrenko
2018-08-09 9:11 ` Vladimir Davydov
2018-08-09 12:20 ` Vladimir Davydov
2018-08-09 12:39 ` Serge Petrenko
2018-08-10 9:14 ` [PATCH v3] " Serge Petrenko
2018-08-10 10:48 ` Vladimir Davydov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1749394.pfGv1nC5fM@localhost \
--to=georgy@tarantool.org \
--cc=sergepetrenko@tarantool.org \
--cc=tarantool-patches@freelists.org \
--subject='[tarantool-patches] Re: [PATCH v2] replication: do not ignore replication_connect_quorum.' \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox