From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Sergey Petrenko Message-Id: <34F1BCA9-2C61-4C3A-939D-F90E745B349D@tarantool.org> Content-Type: multipart/alternative; boundary="Apple-Mail=_D0D63893-4442-4794-9FE2-F00A1CCCBF72" Mime-Version: 1.0 (Mac OS X Mail 11.5 \(3445.9.1\)) Subject: Re: [tarantool-patches] Re: [PATCH v2] replication: do not ignore replication_connect_quorum. Date: Thu, 9 Aug 2018 10:50:46 +0300 In-Reply-To: <20180807172847.z7h7476jedxktwno@esperanza> References: <20180807124413.14947-1-sergepetrenko@tarantool.org> <20180807172847.z7h7476jedxktwno@esperanza> To: Vladimir Davydov Cc: Georgy Kirichenko , tarantool-patches@freelists.org List-ID: --Apple-Mail=_D0D63893-4442-4794-9FE2-F00A1CCCBF72 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset=utf-8 Hi! Thank you for the review! I fixed your remarks. The new diff is below. Also please see my = comments. > 7 =D0=B0=D0=B2=D0=B3. 2018 =D0=B3., =D0=B2 20:28, Vladimir Davydov = =D0=BD=D0=B0=D0=BF=D0=B8=D1=81=D0=B0=D0=BB(=D0=B0= ): >=20 > On Tue, Aug 07, 2018 at 03:44:13PM +0300, Serge Petrenko wrote: >> On bootstrap and after replication reconfiguration >> replication_connect_quorum was ignored. The instance tried to connect = to >> every replica listed in replication parameter, and errored if it = wasn't >> possible. >> The patch alters this behaviour. The instance still tries to connect = to >> every node listed in replication, but does not raise an error if it = was >> able to connect to at least replication_connect_quorum instances. >=20 > Please append a documentation request (@TarantoolBot) to the commit > message. >=20 Done. >>=20 >> Closes #3428 >> --- >> https://github.com/tarantool/tarantool/issues/3428 >> = https://github.com/tarantool/tarantool/tree/sergepetrenko/gh-3428-replicat= ion-connect-quorum >>=20 >> Changes in v2: >> - change test/replication/ddl.lua instance file to fix >> test failure on Travis. 
>>=20 >> src/box/box.cc | 6 +++--- >> src/box/replication.cc | 8 +++----- >> src/box/replication.h | 7 ++++--- >> test/replication-py/init_storage.test.py | 2 +- >> test/replication-py/master.lua | 2 ++ >> test/replication-py/replica.lua | 2 ++ >> test/replication/autobootstrap.lua | 3 ++- >> test/replication/autobootstrap_guest.lua | 2 +- >> test/replication/ddl.lua | 3 ++- >> test/replication/errinj.result | 6 +++--- >> test/replication/errinj.test.lua | 6 +++--- >> test/replication/master.lua | 1 + >> test/replication/master_quorum.lua | 3 ++- >> test/replication/on_replace.lua | 3 ++- >> test/replication/quorum.lua | 4 ++-- >> test/replication/rebootstrap.lua | 2 +- >> test/replication/replica_no_quorum.lua | 3 ++- >> test/replication/replica_timeout.lua | 3 ++- >> test/replication/replica_uuid_ro.lua | 2 +- >> 19 files changed, 39 insertions(+), 29 deletions(-) >>=20 >> diff --git a/src/box/box.cc b/src/box/box.cc >> index e3eb2738f..f8731f464 100644 >> --- a/src/box/box.cc >> +++ b/src/box/box.cc >> @@ -595,7 +595,7 @@ cfg_get_replication(int *p_count) >> * don't start appliers. >> */ >> static void >> -box_sync_replication(double timeout, bool connect_all) >> +box_sync_replication(double timeout, bool reach_quorum) >=20 > After this patch, 'timeout' always equals replication_connect_timeout > so you don't need to pass it explicitly anymore. Please remove it from > this function and from replicaset_connect. Done. >=20 > Also, I don't like 'reach_quorum' name. Would 'connect_quorum' sound > better? Or may be we should pass the minimal number of masters to > connect instead? Let it be connect_quorum. I don=E2=80=99t like the idea to pass a = minimal number of replicas to connect. Looks like we would always pass either replication_connect_quorum or count. 
>> { >> int count =3D 0; >> struct applier **appliers =3D cfg_get_replication(&count); >> @@ -607,7 +607,7 @@ box_sync_replication(double timeout, bool = connect_all) >> applier_delete(appliers[i]); /* doesn't affect = diag */ >> }); >>=20 >> - replicaset_connect(appliers, count, timeout, connect_all); >> + replicaset_connect(appliers, count, timeout, reach_quorum); >>=20 >> guard.is_active =3D false; >> } >> @@ -1888,7 +1888,7 @@ box_cfg_xc(void) >> * receive the same replica set UUID when a new cluster >> * is deployed. >> */ >> - box_sync_replication(TIMEOUT_INFINITY, true); >> + box_sync_replication(replication_connect_timeout, true); >> /* Bootstrap a new master */ >> bootstrap(&replicaset_uuid, &is_bootstrap_leader); >> } >> diff --git a/src/box/replication.cc b/src/box/replication.cc >> index 528fe4459..a6c60220f 100644 >> --- a/src/box/replication.cc >> +++ b/src/box/replication.cc >> @@ -46,7 +46,7 @@ struct tt_uuid INSTANCE_UUID; >> struct tt_uuid REPLICASET_UUID; >>=20 >> double replication_timeout =3D 1.0; /* seconds */ >> -double replication_connect_timeout =3D 4.0; /* seconds */ >> +double replication_connect_timeout =3D 10.0; /* seconds */ >=20 > Why? BTW, this isn't enough - replication_connect_timeout is actually > set in load_cfg.lua (I've no idea why they differ). Changed the default value to 30 seconds, to match the one in = load_cfg.lua Don=E2=80=99t see a reason for them to differ either, >=20 >> int replication_connect_quorum =3D REPLICATION_CONNECT_QUORUM_ALL; >> double replication_sync_lag =3D 10.0; /* seconds */ >>=20 >> @@ -540,7 +540,7 @@ applier_on_connect_f(struct trigger *trigger, = void *event) >>=20 >> void >> replicaset_connect(struct applier **appliers, int count, >> - double timeout, bool connect_all) >> + double timeout, bool reach_quorum) >> { >> if (count =3D=3D 0) { >> /* Cleanup the replica set. 
*/ >> @@ -587,15 +587,13 @@ replicaset_connect(struct applier **appliers, = int count, >> double wait_start =3D ev_monotonic_now(loop()); >> if (fiber_cond_wait_timeout(&state.wakeup, timeout) !=3D = 0) >> break; >> - if (state.failed > 0 && connect_all) >> - break; >=20 > I guess you should break the loop if >=20 > state.failed > count - min_count_to_connect AFAIU, if replication_timeout is less than replication_connect_timeout, = the appliers, which have failed, will have time to try and reconnect during replicaset_connect(). = So failing here is essentially ignoring replication_connect_timeout. >=20 >> timeout -=3D ev_monotonic_now(loop()) - wait_start; >> } >> if (state.connected < count) { >> say_crit("failed to connect to %d out of %d replicas", >> count - state.connected, count); >> /* Timeout or connection failure. */ >> - if (connect_all) >> + if (reach_quorum && state.connected < = replication_connect_quorum) >=20 > replication_connect_quorum can be greater than the number of = configured > replicas. I think you should use MIN(count, = replication_connect_quorum). Fixed. >=20 >> goto error; >> } else { >> say_verbose("connected to %d replicas", = state.connected); >> diff --git a/src/box/replication.h b/src/box/replication.h >> index 95122eb45..c16c8b56c 100644 >> --- a/src/box/replication.h >> +++ b/src/box/replication.h >> @@ -357,12 +357,13 @@ replicaset_add(uint32_t replica_id, const = struct tt_uuid *instance_uuid); >> * \param appliers the array of appliers >> * \param count size of appliers array >> * \param timeout connection timeout >> - * \param connect_all if this flag is set, fail unless all >> - * appliers have successfully connected >> + * \param reach_quorum if this flag is set, fail unless at >> + * least replication_connect_quorum >> + * appliers have successfully connected. 
>> */ >> void >> replicaset_connect(struct applier **appliers, int count, >> - double timeout, bool connect_all); >> + double timeout, bool reach_quorum); >>=20 >> /** >> * Resume all appliers registered with the replica set. >> diff --git a/test/replication-py/init_storage.test.py = b/test/replication-py/init_storage.test.py >> index 0911a02c0..32b4639f1 100644 >> --- a/test/replication-py/init_storage.test.py >> +++ b/test/replication-py/init_storage.test.py >> @@ -57,7 +57,7 @@ print = '-------------------------------------------------------------' >>=20 >> server.stop() >> replica =3D TarantoolServer(server.ini) >> -replica.script =3D 'replication/replica.lua' >> +replica.script =3D 'replication-py/replica.lua' >> replica.vardir =3D server.vardir #os.path.join(server.vardir, = 'replica') >> replica.rpl_master =3D master >> replica.deploy(wait=3DFalse) >> diff --git a/test/replication-py/master.lua = b/test/replication-py/master.lua >> index 0f9f7a6f0..51283efdf 100644 >> --- a/test/replication-py/master.lua >> +++ b/test/replication-py/master.lua >> @@ -3,6 +3,8 @@ os =3D require('os') >> box.cfg({ >> listen =3D os.getenv("LISTEN"), >> memtx_memory =3D 107374182, >> + replication_connect_timeout =3D 1.0, >> + replication_timeout =3D 0.3 >=20 > Why do you need to adjust the timeouts? The timeouts set in all cfg files in all the tests had no effect, just = like the replication_connect_quorum option, cos both of the options were ignored During bootstrap. We wanted every replica to connect, and the timeout = was Set to TIMEOUT_INFINITY. Now when we actually start passing replication_connect_timeout, all these timeouts become too small. 
>=20 >> }) >>=20 >> require('console').listen(os.getenv('ADMIN')) >> diff --git a/test/replication-py/replica.lua = b/test/replication-py/replica.lua >> index 278291bba..b9d193b70 100644 >> --- a/test/replication-py/replica.lua >> +++ b/test/replication-py/replica.lua >> @@ -7,6 +7,8 @@ box.cfg({ >> listen =3D os.getenv("LISTEN"), >> replication =3D os.getenv("MASTER"), >> memtx_memory =3D 107374182, >> + replication_connect_timeout =3D 1.0, >> + replication_timeout =3D 0.3 >> }) >>=20 >> box_cfg_done =3D true >> diff --git a/test/replication/autobootstrap.lua = b/test/replication/autobootstrap.lua >> index 4f55417ae..8fc6809de 100644 >> --- a/test/replication/autobootstrap.lua >> +++ b/test/replication/autobootstrap.lua >> @@ -21,7 +21,8 @@ box.cfg({ >> USER..':'..PASSWORD..'@'..instance_uri(2); >> USER..':'..PASSWORD..'@'..instance_uri(3); >> }; >> - replication_connect_timeout =3D 0.5, >> + replication_connect_timeout =3D 3.0; >> + replication_timeout =3D 0.5; >> }) >>=20 >> box.once("bootstrap", function() >> diff --git a/test/replication/autobootstrap_guest.lua = b/test/replication/autobootstrap_guest.lua >> index 40fef2c7a..7cd921e3c 100644 >> --- a/test/replication/autobootstrap_guest.lua >> +++ b/test/replication/autobootstrap_guest.lua >> @@ -20,7 +20,7 @@ box.cfg({ >> instance_uri(2); >> instance_uri(3); >> }; >> - replication_connect_timeout =3D 0.5, >> + replication_connect_timeout =3D 5, >=20 > Why do you use different timeouts in different tests? I tried to find the lowest possible boundary in every test. It happens = so that they differ. I believe, any finite timeout is better that the infinite one. 
Here=E2=80=99s the new diff: src/box/box.cc | 10 +++++----- src/box/replication.cc | 13 +++++++------ src/box/replication.h | 8 ++++---- test/replication-py/init_storage.test.py | 2 +- test/replication-py/master.lua | 2 ++ test/replication-py/replica.lua | 2 ++ test/replication/autobootstrap.lua | 3 ++- test/replication/autobootstrap_guest.lua | 2 +- test/replication/ddl.lua | 3 ++- test/replication/errinj.result | 6 +++--- test/replication/errinj.test.lua | 6 +++--- test/replication/master.lua | 1 + test/replication/master_quorum.lua | 3 ++- test/replication/on_replace.lua | 3 ++- test/replication/quorum.lua | 4 ++-- test/replication/rebootstrap.lua | 2 +- test/replication/replica_no_quorum.lua | 3 ++- test/replication/replica_timeout.lua | 3 ++- test/replication/replica_uuid_ro.lua | 2 +- 19 files changed, 45 insertions(+), 33 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index e3eb2738f..8cf43a6ad 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -595,7 +595,7 @@ cfg_get_replication(int *p_count) * don't start appliers. 
*/ static void -box_sync_replication(double timeout, bool connect_all) +box_sync_replication(bool connect_quorum) { int count =3D 0; struct applier **appliers =3D cfg_get_replication(&count); @@ -607,7 +607,7 @@ box_sync_replication(double timeout, bool = connect_all) applier_delete(appliers[i]); /* doesn't affect = diag */ }); =20 - replicaset_connect(appliers, count, timeout, connect_all); + replicaset_connect(appliers, count, connect_quorum); =20 guard.is_active =3D false; } @@ -626,7 +626,7 @@ box_set_replication(void) =20 box_check_replication(); /* Try to connect to all replicas within the timeout period */ - box_sync_replication(replication_connect_timeout, true); + box_sync_replication(true); /* Follow replica */ replicaset_follow(); } @@ -1866,7 +1866,7 @@ box_cfg_xc(void) title("orphan"); =20 /* Wait for the cluster to start up */ - box_sync_replication(replication_connect_timeout, = false); + box_sync_replication(false); } else { if (!tt_uuid_is_nil(&instance_uuid)) INSTANCE_UUID =3D instance_uuid; @@ -1888,7 +1888,7 @@ box_cfg_xc(void) * receive the same replica set UUID when a new cluster * is deployed. 
*/ - box_sync_replication(TIMEOUT_INFINITY, true); + box_sync_replication(true); /* Bootstrap a new master */ bootstrap(&replicaset_uuid, &is_bootstrap_leader); } diff --git a/src/box/replication.cc b/src/box/replication.cc index 528fe4459..fa3b6afb4 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -46,7 +46,7 @@ struct tt_uuid INSTANCE_UUID; struct tt_uuid REPLICASET_UUID; =20 double replication_timeout =3D 1.0; /* seconds */ -double replication_connect_timeout =3D 4.0; /* seconds */ +double replication_connect_timeout =3D 30.0; /* seconds */ int replication_connect_quorum =3D REPLICATION_CONNECT_QUORUM_ALL; double replication_sync_lag =3D 10.0; /* seconds */ =20 @@ -540,7 +540,7 @@ applier_on_connect_f(struct trigger *trigger, void = *event) =20 void replicaset_connect(struct applier **appliers, int count, - double timeout, bool connect_all) + bool connect_quorum) { if (count =3D=3D 0) { /* Cleanup the replica set. */ @@ -557,7 +557,7 @@ replicaset_connect(struct applier **appliers, int = count, * - register a trigger in each applier to wake up our * fiber via this channel when the remote peer becomes * connected and a UUID is received; - * - wait up to CONNECT_TIMEOUT seconds for `count` messages; + * - wait up to REPLICATION_CONNECT_TIMEOUT seconds for `count` = messages; * - on timeout, raise a CFG error, cancel and destroy * the freshly created appliers (done in a guard); * - an success, unregister the trigger, check the UUID set @@ -571,6 +571,8 @@ replicaset_connect(struct applier **appliers, int = count, state.connected =3D state.failed =3D 0; fiber_cond_create(&state.wakeup); =20 + double timeout =3D replication_connect_timeout; + /* Add triggers and start simulations connection to remote peers = */ for (int i =3D 0; i < count; i++) { struct applier *applier =3D appliers[i]; @@ -587,15 +589,14 @@ replicaset_connect(struct applier **appliers, int = count, double wait_start =3D ev_monotonic_now(loop()); if 
(fiber_cond_wait_timeout(&state.wakeup, timeout) !=3D = 0) break; - if (state.failed > 0 && connect_all) - break; timeout -=3D ev_monotonic_now(loop()) - wait_start; } if (state.connected < count) { say_crit("failed to connect to %d out of %d replicas", count - state.connected, count); /* Timeout or connection failure. */ - if (connect_all) + if (connect_quorum && state.connected < + MIN(count, replication_connect_quorum)) goto error; } else { say_verbose("connected to %d replicas", = state.connected); diff --git a/src/box/replication.h b/src/box/replication.h index 95122eb45..9ce9910f8 100644 --- a/src/box/replication.h +++ b/src/box/replication.h @@ -356,13 +356,13 @@ replicaset_add(uint32_t replica_id, const struct = tt_uuid *instance_uuid); * * \param appliers the array of appliers * \param count size of appliers array - * \param timeout connection timeout - * \param connect_all if this flag is set, fail unless all - * appliers have successfully connected + * \param connect_quorum if this flag is set, fail unless at + * least replication_connect_quorum + * appliers have successfully connected. */ void replicaset_connect(struct applier **appliers, int count, - double timeout, bool connect_all); + bool connect_quorum); =20 /** * Resume all appliers registered with the replica set. 
diff --git a/test/replication-py/init_storage.test.py = b/test/replication-py/init_storage.test.py index 0911a02c0..32b4639f1 100644 --- a/test/replication-py/init_storage.test.py +++ b/test/replication-py/init_storage.test.py @@ -57,7 +57,7 @@ print = '-------------------------------------------------------------' =20 server.stop() replica =3D TarantoolServer(server.ini) -replica.script =3D 'replication/replica.lua' +replica.script =3D 'replication-py/replica.lua' replica.vardir =3D server.vardir #os.path.join(server.vardir, = 'replica') replica.rpl_master =3D master replica.deploy(wait=3DFalse) diff --git a/test/replication-py/master.lua = b/test/replication-py/master.lua index 0f9f7a6f0..51283efdf 100644 --- a/test/replication-py/master.lua +++ b/test/replication-py/master.lua @@ -3,6 +3,8 @@ os =3D require('os') box.cfg({ listen =3D os.getenv("LISTEN"), memtx_memory =3D 107374182, + replication_connect_timeout =3D 1.0, + replication_timeout =3D 0.3 }) =20 require('console').listen(os.getenv('ADMIN')) diff --git a/test/replication-py/replica.lua = b/test/replication-py/replica.lua index 278291bba..b9d193b70 100644 --- a/test/replication-py/replica.lua +++ b/test/replication-py/replica.lua @@ -7,6 +7,8 @@ box.cfg({ listen =3D os.getenv("LISTEN"), replication =3D os.getenv("MASTER"), memtx_memory =3D 107374182, + replication_connect_timeout =3D 1.0, + replication_timeout =3D 0.3 }) =20 box_cfg_done =3D true diff --git a/test/replication/autobootstrap.lua = b/test/replication/autobootstrap.lua index 4f55417ae..8fc6809de 100644 --- a/test/replication/autobootstrap.lua +++ b/test/replication/autobootstrap.lua @@ -21,7 +21,8 @@ box.cfg({ USER..':'..PASSWORD..'@'..instance_uri(2); USER..':'..PASSWORD..'@'..instance_uri(3); }; - replication_connect_timeout =3D 0.5, + replication_connect_timeout =3D 3.0; + replication_timeout =3D 0.5; }) =20 box.once("bootstrap", function() diff --git a/test/replication/autobootstrap_guest.lua = b/test/replication/autobootstrap_guest.lua 
index 40fef2c7a..7cd921e3c 100644 --- a/test/replication/autobootstrap_guest.lua +++ b/test/replication/autobootstrap_guest.lua @@ -20,7 +20,7 @@ box.cfg({ instance_uri(2); instance_uri(3); }; - replication_connect_timeout =3D 0.5, + replication_connect_timeout =3D 5, }) =20 box.once("bootstrap", function() diff --git a/test/replication/ddl.lua b/test/replication/ddl.lua index 694f40eac..85403e35b 100644 --- a/test/replication/ddl.lua +++ b/test/replication/ddl.lua @@ -22,7 +22,8 @@ box.cfg({ USER..':'..PASSWORD..'@'..instance_uri(3); USER..':'..PASSWORD..'@'..instance_uri(4); }; - replication_connect_timeout =3D 0.5, + replication_timeout =3D 0.1, + replication_connect_timeout =3D 2.0, }) =20 box.once("bootstrap", function() diff --git a/test/replication/errinj.result = b/test/replication/errinj.result index ca8af2988..19d7d9a05 100644 --- a/test/replication/errinj.result +++ b/test/replication/errinj.result @@ -418,7 +418,7 @@ test_run:cmd("create server replica_timeout with = rpl_master=3Ddefault, script=3D'rep --- - true ... -test_run:cmd("start server replica_timeout with args=3D'0.01'") +test_run:cmd("start server replica_timeout with args=3D'0.1, 0.5'") --- - true ... @@ -474,7 +474,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) ... -- Check replica's ACKs don't prevent the master from sending -- heartbeat messages (gh-3160). -test_run:cmd("start server replica_timeout with args=3D'0.009'") +test_run:cmd("start server replica_timeout with args=3D'0.009, 0.5'") --- - true ... @@ -522,7 +522,7 @@ for i =3D 0, 9999 do box.space.test:replace({i, 4, = 5, 'test'}) end -- during the join stage, i.e. a replica with a minuscule -- timeout successfully bootstraps and breaks connection only -- after subscribe. -test_run:cmd("start server replica_timeout with args=3D'0.00001'") +test_run:cmd("start server replica_timeout with args=3D'0.00001, 0.5'") --- - true ... 
diff --git a/test/replication/errinj.test.lua = b/test/replication/errinj.test.lua index 463d89a8f..f00b98eed 100644 --- a/test/replication/errinj.test.lua +++ b/test/replication/errinj.test.lua @@ -173,7 +173,7 @@ errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0) box.cfg{replication_timeout =3D 0.01} =20 test_run:cmd("create server replica_timeout with rpl_master=3Ddefault, = script=3D'replication/replica_timeout.lua'") -test_run:cmd("start server replica_timeout with args=3D'0.01'") +test_run:cmd("start server replica_timeout with args=3D'0.1, 0.5'") test_run:cmd("switch replica_timeout") =20 fiber =3D require('fiber') @@ -199,7 +199,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) -- Check replica's ACKs don't prevent the master from sending -- heartbeat messages (gh-3160). =20 -test_run:cmd("start server replica_timeout with args=3D'0.009'") +test_run:cmd("start server replica_timeout with args=3D'0.009, 0.5'") test_run:cmd("switch replica_timeout") =20 fiber =3D require('fiber') @@ -219,7 +219,7 @@ for i =3D 0, 9999 do box.space.test:replace({i, 4, = 5, 'test'}) end -- during the join stage, i.e. a replica with a minuscule -- timeout successfully bootstraps and breaks connection only -- after subscribe. 
-test_run:cmd("start server replica_timeout with args=3D'0.00001'") +test_run:cmd("start server replica_timeout with args=3D'0.00001, 0.5'") test_run:cmd("switch replica_timeout") fiber =3D require('fiber') while box.info.replication[1].upstream.message ~=3D 'timed out' do = fiber.sleep(0.0001) end diff --git a/test/replication/master.lua b/test/replication/master.lua index 6d431aaeb..9b96b7891 100644 --- a/test/replication/master.lua +++ b/test/replication/master.lua @@ -4,6 +4,7 @@ box.cfg({ listen =3D os.getenv("LISTEN"), memtx_memory =3D 107374182, replication_connect_timeout =3D 0.5, + replication_timeout =3D 0.1 }) =20 require('console').listen(os.getenv('ADMIN')) diff --git a/test/replication/master_quorum.lua = b/test/replication/master_quorum.lua index fb5f7ec2b..6e0429f65 100644 --- a/test/replication/master_quorum.lua +++ b/test/replication/master_quorum.lua @@ -20,7 +20,8 @@ box.cfg({ instance_uri(2); }; replication_connect_quorum =3D 0; - replication_connect_timeout =3D 0.1; + replication_timeout =3D 0.5; + replication_connect_timeout =3D 2.0; }) =20 test_run =3D require('test_run').new() diff --git a/test/replication/on_replace.lua = b/test/replication/on_replace.lua index 03f15d94c..bafead48d 100644 --- a/test/replication/on_replace.lua +++ b/test/replication/on_replace.lua @@ -20,7 +20,8 @@ box.cfg({ USER..':'..PASSWORD..'@'..instance_uri(1); USER..':'..PASSWORD..'@'..instance_uri(2); }; - replication_connect_timeout =3D 0.5, + replication_timeout =3D 0.5, + replication_connect_timeout =3D 1.0, }) =20 env =3D require('test_run') diff --git a/test/replication/quorum.lua b/test/replication/quorum.lua index 9c7bf5c93..7f85d7b13 100644 --- a/test/replication/quorum.lua +++ b/test/replication/quorum.lua @@ -15,8 +15,8 @@ require('console').listen(os.getenv('ADMIN')) box.cfg({ listen =3D instance_uri(INSTANCE_ID); replication_timeout =3D 0.05; - replication_sync_lag =3D 0.01; - replication_connect_timeout =3D 0.1; + replication_sync_lag =3D 0.1; + 
replication_connect_timeout =3D 3.0; replication_connect_quorum =3D 3; replication =3D { instance_uri(1); diff --git a/test/replication/rebootstrap.lua = b/test/replication/rebootstrap.lua index e743577e4..f1e8d69e9 100644 --- a/test/replication/rebootstrap.lua +++ b/test/replication/rebootstrap.lua @@ -15,7 +15,7 @@ box.cfg({ listen =3D instance_uri(INSTANCE_ID), instance_uuid =3D '12345678-abcd-1234-abcd-123456789ef' .. = INSTANCE_ID, replication_timeout =3D 0.1, - replication_connect_timeout =3D 0.5, + replication_connect_timeout =3D 2.0, replication =3D { instance_uri(1); instance_uri(2); diff --git a/test/replication/replica_no_quorum.lua = b/test/replication/replica_no_quorum.lua index b9edeea94..c30c043cc 100644 --- a/test/replication/replica_no_quorum.lua +++ b/test/replication/replica_no_quorum.lua @@ -5,7 +5,8 @@ box.cfg({ replication =3D os.getenv("MASTER"), memtx_memory =3D 107374182, replication_connect_quorum =3D 0, - replication_connect_timeout =3D 0.1, + replication_timeout =3D 0.1, + replication_connect_timeout =3D 0.5, }) =20 require('console').listen(os.getenv('ADMIN')) diff --git a/test/replication/replica_timeout.lua = b/test/replication/replica_timeout.lua index 64f119763..51c718360 100644 --- a/test/replication/replica_timeout.lua +++ b/test/replication/replica_timeout.lua @@ -1,13 +1,14 @@ #!/usr/bin/env tarantool =20 local TIMEOUT =3D tonumber(arg[1]) +local CON_TIMEOUT =3D arg[2] and tonumber(arg[2]) or TIMEOUT * 3 =20 box.cfg({ listen =3D os.getenv("LISTEN"), replication =3D os.getenv("MASTER"), memtx_memory =3D 107374182, replication_timeout =3D TIMEOUT, - replication_connect_timeout =3D TIMEOUT * 3, + replication_connect_timeout =3D CON_TIMEOUT, }) =20 require('console').listen(os.getenv('ADMIN')) diff --git a/test/replication/replica_uuid_ro.lua = b/test/replication/replica_uuid_ro.lua index 8e1c6cc47..ff70da144 100644 --- a/test/replication/replica_uuid_ro.lua +++ b/test/replication/replica_uuid_ro.lua @@ -22,7 +22,7 @@ box.cfg({ 
USER..':'..PASSWORD..'@'..instance_uri(2); }; read_only =3D (INSTANCE_ID ~=3D '1' and true or false); - replication_connect_timeout =3D 0.5, + replication_connect_timeout =3D 5, }) =20 box.once("bootstrap", function() --=20 2.15.2 (Apple Git-101.1) --Apple-Mail=_D0D63893-4442-4794-9FE2-F00A1CCCBF72 Content-Transfer-Encoding: quoted-printable Content-Type: text/html; charset=utf-8 Hi! Thank you for the = review!

I fixed your remarks. The new diff is below. Please also see my comments.

7 =D0=B0=D0=B2=D0=B3. = 2018 =D0=B3., =D0=B2 20:28, Vladimir Davydov <vdavydov.dev@gmail.com> =D0=BD=D0=B0=D0=BF=D0=B8=D1=81=D0= =B0=D0=BB(=D0=B0):

On Tue, Aug 07, 2018 at = 03:44:13PM +0300, Serge Petrenko wrote:
On = bootstrap and after replication reconfiguration
replication_connect_quorum was ignored. The instance tried to = connect to
every replica listed in replication parameter, = and errored if it wasn't
possible.
The patch = alters this behaviour. The instance still tries to connect to
every node listed in replication, but does not raise an error = if it was
able to connect to at least = replication_connect_quorum instances.

Please append a documentation request (@TarantoolBot) to the = commit
message.

Done.


Closes #3428
---
https://github.com/tarantool/tarantool/issues/3428
https://github.com/tarantool/tarantool/tree/sergepetrenko/gh-34= 28-replication-connect-quorum

Changes in = v2:
 - change test/replication/ddl.lua instance file = to fix
   test failure on Travis.

src/box/box.cc         =                   | 6 = +++---
src/box/replication.cc         =           | 8 +++-----
src/box/replication.h           =          | 7 ++++---
test/replication-py/init_storage.test.py | 2 +-
test/replication-py/master.lua         =   | 2 ++
test/replication-py/replica.lua   =        | 2 ++
test/replication/autobootstrap.lua       | 3 = ++-
test/replication/autobootstrap_guest.lua | 2 +-
test/replication/ddl.lua           =       | 3 ++-
test/replication/errinj.result =           | 6 +++---
test/replication/errinj.test.lua         = | 6 +++---
test/replication/master.lua     =          | 1 +
test/replication/master_quorum.lua       | 3 = ++-
test/replication/on_replace.lua       =    | 3 ++-
test/replication/quorum.lua   =            | 4 ++--
test/replication/rebootstrap.lua         = | 2 +-
test/replication/replica_no_quorum.lua   | 3 = ++-
test/replication/replica_timeout.lua     | 3 = ++-
test/replication/replica_uuid_ro.lua     | 2 = +-
19 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/src/box/box.cc = b/src/box/box.cc
index e3eb2738f..f8731f464 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -595,7 +595,7 @@ cfg_get_replication(int *p_count)
 * don't start appliers.
 */
static void
-box_sync_replication(double = timeout, bool connect_all)
+box_sync_replication(double = timeout, bool reach_quorum)

After this patch, 'timeout' always equals = replication_connect_timeout
so you don't need to pass it = explicitly anymore. Please remove it from
this function = and from replicaset_connect.

Done.


Also, I don't like 'reach_quorum' name. Would = 'connect_quorum' sound
better? Or may be we should pass = the minimal number of masters to
connect instead?

Let it be connect_quorum. I don't like the idea of passing a minimum number of
replicas to connect: it looks like we would always pass either
replication_connect_quorum or count.

{
= int count =3D 0;
struct applier **appliers =3D = cfg_get_replication(&count);
@@ -607,7 +607,7 @@ = box_sync_replication(double timeout, bool connect_all)
= applier_delete(appliers[i]); /* doesn't affect diag */
= });

- replicaset_connect(appliers, = count, timeout, connect_all);
+ = replicaset_connect(appliers, count, timeout, reach_quorum);

guard.is_active =3D false;
}
@@ -1888,7 +1888,7 @@ box_cfg_xc(void)
= =  * receive the same replica set UUID when a new cluster
= =  * is deployed.
 */
- = box_sync_replication(TIMEOUT_INFINITY, true);
+ = box_sync_replication(replication_connect_timeout, true);
= = /* Bootstrap a new master */
= bootstrap(&replicaset_uuid, &is_bootstrap_leader);
= }
diff --git a/src/box/replication.cc = b/src/box/replication.cc
index 528fe4459..a6c60220f = 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -46,7 +46,7 @@ struct tt_uuid INSTANCE_UUID;
struct tt_uuid REPLICASET_UUID;

double replication_timeout =3D 1.0; /* seconds */
-double replication_connect_timeout =3D 4.0; /* seconds */
+double replication_connect_timeout =3D 10.0; /* seconds = */

Why? BTW, this isn't enough = - replication_connect_timeout is actually
set in = load_cfg.lua (I've no idea why they differ).


Changed the default value to 30 seconds to match the one in load_cfg.lua.
I don't see a reason for them to differ either.

int = replication_connect_quorum =3D REPLICATION_CONNECT_QUORUM_ALL;
double replication_sync_lag =3D 10.0; /* seconds */

@@ -540,7 +540,7 @@ = applier_on_connect_f(struct trigger *trigger, void *event)

void
replicaset_connect(struct = applier **appliers, int count,
- =    double timeout, bool connect_all)
+ =    double timeout, bool reach_quorum)
{
= if (count =3D=3D 0) {
/* Cleanup the replica set. */
@@ -587,15 +587,13 @@ replicaset_connect(struct applier = **appliers, int count,
double wait_start =3D = ev_monotonic_now(loop());
if = (fiber_cond_wait_timeout(&state.wakeup, timeout) !=3D 0)
= = = break;
- if (state.failed > 0 = && connect_all)
- break;

I guess you should break the loop = if

 state.failed > count - = min_count_to_connect

AFAIU, if replication_timeout is less than replication_connect_timeout, the appliers that have
failed will have time to try to reconnect during replicaset_connect(). So failing here would essentially mean ignoring
replication_connect_timeout.


timeout = -=3D ev_monotonic_now(loop()) - wait_start;
}
= if (state.connected < count) {
= say_crit("failed to connect to %d out of %d replicas",
= = =  count - state.connected, count);
/* = Timeout or connection failure. */
- if = (connect_all)
+ if (reach_quorum && = state.connected < replication_connect_quorum)

replication_connect_quorum can be = greater than the number of configured
replicas. I think = you should use MIN(count, replication_connect_quorum).

Fixed.


goto = error;
} else {
= say_verbose("connected to %d replicas", state.connected);
diff --git a/src/box/replication.h b/src/box/replication.h
index 95122eb45..c16c8b56c 100644
--- = a/src/box/replication.h
+++ b/src/box/replication.h
@@ -357,12 +357,13 @@ replicaset_add(uint32_t replica_id, = const struct tt_uuid *instance_uuid);
 * \param = appliers the array of appliers
 * \param count size = of appliers array
 * \param timeout connection = timeout
- * \param connect_all if this flag is set, fail = unless all
- *             =        appliers have successfully connected
+ * \param reach_quorum if this flag is set, fail unless = at
+ *        least = replication_connect_quorum
+ *        = appliers have successfully connected.
 */
void
replicaset_connect(struct applier = **appliers, int count,
-    double timeout, bool = connect_all);
+    double timeout, bool = reach_quorum);

/**
 * = Resume all appliers registered with the replica set.
diff = --git a/test/replication-py/init_storage.test.py = b/test/replication-py/init_storage.test.py
index = 0911a02c0..32b4639f1 100644
--- = a/test/replication-py/init_storage.test.py
+++ = b/test/replication-py/init_storage.test.py
@@ -57,7 +57,7 = @@ print = '-------------------------------------------------------------'

server.stop()
replica =3D = TarantoolServer(server.ini)
-replica.script =3D = 'replication/replica.lua'
+replica.script =3D = 'replication-py/replica.lua'
replica.vardir =3D = server.vardir #os.path.join(server.vardir, 'replica')
replica.rpl_master =3D master
replica.deploy(wait=3DFalse)
diff --git = a/test/replication-py/master.lua b/test/replication-py/master.lua
index 0f9f7a6f0..51283efdf 100644
--- = a/test/replication-py/master.lua
+++ = b/test/replication-py/master.lua
@@ -3,6 +3,8 @@ os =3D = require('os')
box.cfg({
    listen =              =3D = os.getenv("LISTEN"),
    memtx_memory   =      =3D 107374182,
+   =  replication_connect_timeout =3D 1.0,
+   =  replication_timeout =3D 0.3

Why do you need to adjust the timeouts?

The timeouts set in all cfg files in all the tests had no effect, just like the
replication_connect_quorum option, because both of the options were ignored
during bootstrap. We wanted every replica to connect, and the timeout was
set to TIMEOUT_INFINITY. Now that we actually start passing
replication_connect_timeout, all these timeouts become too small.


})

require('console').listen(os.getenv('ADMIN'))diff --git a/test/replication-py/replica.lua = b/test/replication-py/replica.lua
index = 278291bba..b9d193b70 100644
--- = a/test/replication-py/replica.lua
+++ = b/test/replication-py/replica.lua
@@ -7,6 +7,8 @@ = box.cfg({
    listen         =      =3D os.getenv("LISTEN"),
    = replication         =3D os.getenv("MASTER"),
    memtx_memory        =3D = 107374182,
+    replication_connect_timeout =3D = 1.0,
+    replication_timeout =3D 0.3
})

box_cfg_done =3D true
diff --git a/test/replication/autobootstrap.lua = b/test/replication/autobootstrap.lua
index = 4f55417ae..8fc6809de 100644
--- = a/test/replication/autobootstrap.lua
+++ = b/test/replication/autobootstrap.lua
@@ -21,7 +21,8 @@ = box.cfg({
        = USER..':'..PASSWORD..'@'..instance_uri(2);
    =     USER..':'..PASSWORD..'@'..instance_uri(3);
    };
-   =  replication_connect_timeout =3D 0.5,
+   =  replication_connect_timeout =3D 3.0;
+   =  replication_timeout =3D 0.5;
})

box.once("bootstrap", function()
diff --git = a/test/replication/autobootstrap_guest.lua = b/test/replication/autobootstrap_guest.lua
index = 40fef2c7a..7cd921e3c 100644
--- = a/test/replication/autobootstrap_guest.lua
+++ = b/test/replication/autobootstrap_guest.lua
@@ -20,7 +20,7 = @@ box.cfg({
        = instance_uri(2);
        = instance_uri(3);
    };
-   =  replication_connect_timeout =3D 0.5,
+   =  replication_connect_timeout =3D 5,

Why do you use different timeouts in different tests?

I tried to find the lowest possible boundary in every test. It so happens that they differ.
I believe any finite timeout is better than the infinite one.

Here's the new diff:

 src/box/box.cc             =               | 10 +++++-----
 src/box/replication.cc           =         | 13 +++++++------
 src/box/replication.h          =           |  8 ++++----
 test/replication-py/init_storage.test.py |  2 = +-
 test/replication-py/master.lua     =       |  2 ++
 test/replication-py/replica.lua      =     |  2 ++
 test/replication/autobootstrap.lua     =   |  3 ++-
 test/replication/autobootstrap_guest.lua |  2 = +-
 test/replication/ddl.lua     =             |  3 ++-
 test/replication/errinj.result     =       |  6 +++---
 test/replication/errinj.test.lua     =     |  6 +++---
 test/replication/master.lua        =       |  1 +
 test/replication/master_quorum.lua     =   |  3 ++-
 test/replication/on_replace.lua      =     |  3 ++-
 test/replication/quorum.lua        =       |  4 ++--
 test/replication/rebootstrap.lua     =     |  2 +-
 test/replication/replica_no_quorum.lua   |=   3 ++-
 test/replication/replica_timeout.lua   =   |  3 ++-
 test/replication/replica_uuid_ro.lua   =   |  2 +-
 19 files changed, 45 = insertions(+), 33 deletions(-)

diff --git = a/src/box/box.cc b/src/box/box.cc
index = e3eb2738f..8cf43a6ad 100644
--- a/src/box/box.cc
+++ = b/src/box/box.cc
@@= -595,7 +595,7 @@ cfg_get_replication(int *p_count)
  * don't start appliers.
  */
 static void
-box_sync_replication(double timeout, bool connect_all)
+box_sync_replication(bool connect_quorum)
 {
  int count =3D 0;
 = struct applier **appliers =3D = cfg_get_replication(&count);
@@ -607,7 +607,7 @@ = box_sync_replication(double timeout, bool connect_all)
 = applier_delete(appliers[i]); /* doesn't = affect diag */
  });
 
- = replicaset_connect(appliers, count, timeout, connect_all);
+ = replicaset_connect(appliers, count, connect_quorum);
 
  guard.is_active =3D false;
 }
@@ -626,7 +626,7 @@ = box_set_replication(void)
 
  = box_check_replication();
  /* Try to = connect to all replicas within the timeout period */
- = box_sync_replication(replication_connect_timeout, true);
+ = box_sync_replication(true);
  /* Follow = replica */
  replicaset_follow();
 }
@@ -1866,7 +1866,7 @@ = box_cfg_xc(void)
  title("orphan");
 
  /* Wait for the cluster = to start up */
- = box_sync_replication(replication_connect_timeout, false);
+ = box_sync_replication(false);
  } else = {
  if = (!tt_uuid_is_nil(&instance_uuid))
  = INSTANCE_UUID =3D instance_uuid;
@@ -1888,7 +1888,7 = @@ box_cfg_xc(void)
   * receive the same = replica set UUID when a new cluster
  =  * is deployed.
  =  */
- = box_sync_replication(TIMEOUT_INFINITY, true);
+ = box_sync_replication(true);
  = /* Bootstrap a new master */
  = bootstrap(&replicaset_uuid, &is_bootstrap_leader);
 = }
diff --git a/src/box/replication.cc = b/src/box/replication.cc
index 528fe4459..fa3b6afb4 = 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -46,7 +46,7 @@ struct tt_uuid INSTANCE_UUID;
 struct tt_uuid REPLICASET_UUID;
 
 double replication_timeout =3D 1.0; /* seconds */
-double replication_connect_timeout =3D 4.0; /* seconds */
+double replication_connect_timeout =3D 30.0; /* seconds = */
 int replication_connect_quorum =3D = REPLICATION_CONNECT_QUORUM_ALL;
 double = replication_sync_lag =3D 10.0; /* seconds */
 
@@ -540,7 +540,7 @@ applier_on_connect_f(struct trigger = *trigger, void *event)
 
 void
 replicaset_connect(struct applier **appliers, int = count,
-    double = timeout, bool connect_all)
+    bool = connect_quorum)
 {
  if (count = =3D=3D 0) {
  /* Cleanup the replica = set. */
@@ -557,7 +557,7 @@ replicaset_connect(struct = applier **appliers, int count,
   * - = register a trigger in each applier to wake up our
 =  *   fiber via this channel when the = remote peer becomes
  =  *   connected and a UUID is received;
- =  * - wait up to CONNECT_TIMEOUT seconds for `count` = messages;
+  * - wait up to = REPLICATION_CONNECT_TIMEOUT seconds for `count` messages;
 =  * - on timeout, raise a CFG error, cancel and = destroy
   *   the = freshly created appliers (done in a guard);
   * - = an success, unregister the trigger, check the UUID set
@@ = -571,6 +571,8 @@ replicaset_connect(struct applier **appliers, int = count,
  state.connected =3D state.failed = =3D 0;
  = fiber_cond_create(&state.wakeup);
 
+ = double timeout =3D replication_connect_timeout;
+
 = /* Add triggers and start simulations connection to = remote peers */
  for (int i =3D 0; i < count; = i++) {
  struct applier *applier =3D= appliers[i];
@@ -587,15 +589,14 @@ = replicaset_connect(struct applier **appliers, int count,
 = double wait_start =3D = ev_monotonic_now(loop());
  = if (fiber_cond_wait_timeout(&state.wakeup, timeout) !=3D = 0)
  break;
- = if (state.failed > 0 && connect_all)
- = break;
  timeout -=3D = ev_monotonic_now(loop()) - wait_start;
  }
 = if (state.connected < count) {
  = say_crit("failed to connect to %d out of %d replicas",
 =  count - state.connected, count);
 = /* Timeout or connection failure. */
- = if (connect_all)
+ if (connect_quorum = && state.connected <
+ =     MIN(count, replication_connect_quorum))
 = goto error;
  } else = {
  say_verbose("connected to = %d replicas", state.connected);
diff --git = a/src/box/replication.h b/src/box/replication.h
index = 95122eb45..9ce9910f8 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -356,13 +356,13 = @@ replicaset_add(uint32_t replica_id, const struct tt_uuid = *instance_uuid);
  *
  * = \param appliers the array of appliers
  * \param = count size of appliers array
- * \param timeout connection = timeout
- * \param connect_all if this flag is set, fail = unless all
- *            =         appliers have successfully connected
+ * \param connect_quorum if this flag is set, fail unless = at
+ *      =   least replication_connect_quorum
+ * =        appliers have successfully = connected.
  */
 void
 replicaset_connect(struct applier **appliers, int = count,
-    double = timeout, bool connect_all);
+    bool = connect_quorum);
 
 /**
  * Resume all appliers registered with the replica = set.
diff --git a/test/replication-py/init_storage.test.py = b/test/replication-py/init_storage.test.py
index = 0911a02c0..32b4639f1 100644
--- = a/test/replication-py/init_storage.test.py
+++ = b/test/replication-py/init_storage.test.py
@@ -57,7 +57,7 = @@ print = '-------------------------------------------------------------'
 
 server.stop()
 replica =3D TarantoolServer(server.ini)
-replica.script =3D 'replication/replica.lua'
+replica.script =3D 'replication-py/replica.lua'
 replica.vardir =3D server.vardir = #os.path.join(server.vardir, 'replica')
 replica.rpl_master =3D master
 replica.deploy(wait=3DFalse)
diff --git = a/test/replication-py/master.lua b/test/replication-py/master.lua
index 0f9f7a6f0..51283efdf 100644
--- = a/test/replication-py/master.lua
+++ = b/test/replication-py/master.lua
@@ -3,6 +3,8 @@ os =3D = require('os')
 box.cfg({
    =  listen              =3D = os.getenv("LISTEN"),
    =  memtx_memory        =3D 107374182,
+    replication_connect_timeout =3D 1.0,
+    replication_timeout =3D 0.3
 })
 
 require('console').listen(os.getenv('ADMIN'))
diff --git a/test/replication-py/replica.lua = b/test/replication-py/replica.lua
index = 278291bba..b9d193b70 100644
--- = a/test/replication-py/replica.lua
+++ = b/test/replication-py/replica.lua
@@ -7,6 +7,8 @@ = box.cfg({
     listen      =         =3D os.getenv("LISTEN"),
     replication       =   =3D os.getenv("MASTER"),
    =  memtx_memory        =3D 107374182,
+    replication_connect_timeout =3D 1.0,
+    replication_timeout =3D 0.3
 })
 
 box_cfg_done = =3D true
diff --git a/test/replication/autobootstrap.lua = b/test/replication/autobootstrap.lua
index = 4f55417ae..8fc6809de 100644
--- = a/test/replication/autobootstrap.lua
+++ = b/test/replication/autobootstrap.lua
@@ -21,7 +21,8 @@ = box.cfg({
        =  USER..':'..PASSWORD..'@'..instance_uri(2);
  =        USER..':'..PASSWORD..'@'..instance_uri(3);
     };
-  =   replication_connect_timeout =3D 0.5,
+  =   replication_connect_timeout =3D 3.0;
+  =   replication_timeout =3D 0.5;
 })
 
 box.once("bootstrap", = function()
diff --git = a/test/replication/autobootstrap_guest.lua = b/test/replication/autobootstrap_guest.lua
index = 40fef2c7a..7cd921e3c 100644
--- = a/test/replication/autobootstrap_guest.lua
+++ = b/test/replication/autobootstrap_guest.lua
@@ -20,7 +20,7 = @@ box.cfg({
        =  instance_uri(2);
        =  instance_uri(3);
     };
-    replication_connect_timeout =3D 0.5,
+    replication_connect_timeout =3D 5,
 })
 
 box.once("bootstrap", function()
diff = --git a/test/replication/ddl.lua b/test/replication/ddl.lua
index 694f40eac..85403e35b 100644
--- = a/test/replication/ddl.lua
+++ = b/test/replication/ddl.lua
@@ -22,7 +22,8 @@ box.cfg({
        =  USER..':'..PASSWORD..'@'..instance_uri(3);
  =        USER..':'..PASSWORD..'@'..instance_uri(4);
     };
-  =   replication_connect_timeout =3D 0.5,
+  =   replication_timeout =3D 0.1,
+  =   replication_connect_timeout =3D 2.0,
 }) 
 box.once("bootstrap", = function()
diff --git a/test/replication/errinj.result = b/test/replication/errinj.result
index = ca8af2988..19d7d9a05 100644
--- = a/test/replication/errinj.result
+++ = b/test/replication/errinj.result
@@ -418,7 +418,7 @@ = test_run:cmd("create server replica_timeout with rpl_master=3Ddefault, = script=3D'rep
 ---
 - true
 ...
-test_run:cmd("start server = replica_timeout with args=3D'0.01'")
+test_run:cmd("start = server replica_timeout with args=3D'0.1, 0.5'")
 --- - true
 ...
@@ -474,7 = +474,7 @@ errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0)
 ...
 -- Check replica's ACKs don't = prevent the master from sending
 -- heartbeat = messages (gh-3160).
-test_run:cmd("start server = replica_timeout with args=3D'0.009'")
+test_run:cmd("start = server replica_timeout with args=3D'0.009, 0.5'")
 ---
 - true
 ...@@ -522,7 +522,7 @@ for i =3D 0, 9999 do = box.space.test:replace({i, 4, 5, 'test'}) end
 -- = during the join stage, i.e. a replica with a minuscule
 -- timeout successfully bootstraps and breaks = connection only
 -- after subscribe.
-test_run:cmd("start server replica_timeout with = args=3D'0.00001'")
+test_run:cmd("start server = replica_timeout with args=3D'0.00001, 0.5'")
 ---
 - true
 ...
diff --git = a/test/replication/errinj.test.lua b/test/replication/errinj.test.lua
index 463d89a8f..f00b98eed 100644
--- = a/test/replication/errinj.test.lua
+++ = b/test/replication/errinj.test.lua
@@ -173,7 +173,7 @@ = errinj.set("ERRINJ_RELAY_EXIT_DELAY", 0)
 box.cfg{replication_timeout =3D 0.01}
 
 test_run:cmd("create server = replica_timeout with = rpl_master=3Ddefault, script=3D'replication/replica_timeout.lua'")-test_run:cmd("start server replica_timeout with = args=3D'0.01'")
+test_run:cmd("start server = replica_timeout with args=3D'0.1, 0.5'")
 test_run:cmd("switch replica_timeout")
 
 fiber =3D require('fiber')
@@ -199,7 +199,7 @@ = errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 0)
 -- = Check replica's ACKs don't prevent the master from sending
 -- heartbeat messages (gh-3160).
 
-test_run:cmd("start server replica_timeout with = args=3D'0.009'")
+test_run:cmd("start server = replica_timeout with args=3D'0.009, 0.5'")
 test_run:cmd("switch replica_timeout")
 
 fiber =3D require('fiber')
@@ -219,7 +219,7 @@ for i =3D 0, 9999 do = box.space.test:replace({i, 4, 5, 'test'}) end
 -- = during the join stage, i.e. a replica with a minuscule
 -- timeout successfully bootstraps and breaks = connection only
 -- after subscribe.
-test_run:cmd("start server replica_timeout with = args=3D'0.00001'")
+test_run:cmd("start server = replica_timeout with args=3D'0.00001, 0.5'")
 test_run:cmd("switch replica_timeout")
 fiber =3D require('fiber')
 while = box.info.replication[1].upstream.message ~=3D 'timed out' do = fiber.sleep(0.0001) end
diff --git = a/test/replication/master.lua b/test/replication/master.lua
index 6d431aaeb..9b96b7891 100644
--- = a/test/replication/master.lua
+++ = b/test/replication/master.lua
@@ -4,6 +4,7 @@ box.cfg({
     listen          =     =3D os.getenv("LISTEN"),
    =  memtx_memory        =3D 107374182,
     replication_connect_timeout =3D 0.5,
+    replication_timeout =3D 0.1
 })
 
 require('console').listen(os.getenv('ADMIN'))
diff --git a/test/replication/master_quorum.lua = b/test/replication/master_quorum.lua
index = fb5f7ec2b..6e0429f65 100644
--- = a/test/replication/master_quorum.lua
+++ = b/test/replication/master_quorum.lua
@@ -20,7 +20,8 @@ = box.cfg({
        =  instance_uri(2);
     };
     replication_connect_quorum =3D 0;
-    replication_connect_timeout =3D 0.1;
+    replication_timeout =3D 0.5;
+    replication_connect_timeout =3D 2.0;
 })
 
 test_run =3D = require('test_run').new()
diff --git = a/test/replication/on_replace.lua b/test/replication/on_replace.lua
index 03f15d94c..bafead48d 100644
--- = a/test/replication/on_replace.lua
+++ = b/test/replication/on_replace.lua
@@ -20,7 +20,8 @@ = box.cfg({
        =  USER..':'..PASSWORD..'@'..instance_uri(1);
  =        USER..':'..PASSWORD..'@'..instance_uri(2);
     };
-  =   replication_connect_timeout =3D 0.5,
+  =   replication_timeout =3D 0.5,
+  =   replication_connect_timeout =3D 1.0,
 }) 
 env =3D require('test_run')
diff --git a/test/replication/quorum.lua = b/test/replication/quorum.lua
index 9c7bf5c93..7f85d7b13 = 100644
--- a/test/replication/quorum.lua
+++ = b/test/replication/quorum.lua
@@ -15,8 +15,8 @@ = require('console').listen(os.getenv('ADMIN'))
 box.cfg({
     listen =3D = instance_uri(INSTANCE_ID);
    =  replication_timeout =3D 0.05;
-  =   replication_sync_lag =3D 0.01;
-  =   replication_connect_timeout =3D 0.1;
+  =   replication_sync_lag =3D 0.1;
+  =   replication_connect_timeout =3D 3.0;
  =    replication_connect_quorum =3D 3;
  =    replication =3D {
        =  instance_uri(1);
diff --git = a/test/replication/rebootstrap.lua b/test/replication/rebootstrap.lua
index e743577e4..f1e8d69e9 100644
--- = a/test/replication/rebootstrap.lua
+++ = b/test/replication/rebootstrap.lua
@@ -15,7 +15,7 @@ = box.cfg({
     listen =3D = instance_uri(INSTANCE_ID),
    =  instance_uuid =3D '12345678-abcd-1234-abcd-123456789ef' .. = INSTANCE_ID,
     replication_timeout =3D = 0.1,
-    replication_connect_timeout =3D = 0.5,
+    replication_connect_timeout =3D = 2.0,
     replication =3D {
         instance_uri(1);
         instance_uri(2);
diff --git a/test/replication/replica_no_quorum.lua = b/test/replication/replica_no_quorum.lua
index = b9edeea94..c30c043cc 100644
--- = a/test/replication/replica_no_quorum.lua
+++ = b/test/replication/replica_no_quorum.lua
@@ -5,7 +5,8 @@ = box.cfg({
     replication   =       =3D os.getenv("MASTER"),
  =    memtx_memory        =3D = 107374182,
     replication_connect_quorum = =3D 0,
-    replication_connect_timeout =3D = 0.1,
+    replication_timeout =3D 0.1,
+    replication_connect_timeout =3D 0.5,
 })
 
 require('console').listen(os.getenv('ADMIN'))
diff --git a/test/replication/replica_timeout.lua = b/test/replication/replica_timeout.lua
index = 64f119763..51c718360 100644
--- = a/test/replication/replica_timeout.lua
+++ = b/test/replication/replica_timeout.lua
@@ -1,13 +1,14 = @@
 #!/usr/bin/env tarantool
 
 local TIMEOUT =3D tonumber(arg[1])
+local = CON_TIMEOUT =3D arg[2] and tonumber(arg[2]) or TIMEOUT * 3
 
 box.cfg({
  =    listen            =   =3D os.getenv("LISTEN"),
    =  replication         =3D = os.getenv("MASTER"),
    =  memtx_memory        =3D 107374182,
     replication_timeout =3D TIMEOUT,
-    replication_connect_timeout =3D TIMEOUT * = 3,
+    replication_connect_timeout =3D = CON_TIMEOUT,
 })
 
 require('console').listen(os.getenv('ADMIN'))
diff --git a/test/replication/replica_uuid_ro.lua = b/test/replication/replica_uuid_ro.lua
index = 8e1c6cc47..ff70da144 100644
--- = a/test/replication/replica_uuid_ro.lua
+++ = b/test/replication/replica_uuid_ro.lua
@@ -22,7 +22,7 @@ = box.cfg({
        =  USER..':'..PASSWORD..'@'..instance_uri(2);
  =    };
     read_only =3D = (INSTANCE_ID ~=3D '1' and true or false);
-  =   replication_connect_timeout =3D 0.5,
+  =   replication_connect_timeout =3D 5,
 })
 
 box.once("bootstrap", = function()
-- 
2.15.2 (Apple = Git-101.1)

= --Apple-Mail=_D0D63893-4442-4794-9FE2-F00A1CCCBF72--