From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Serge Petrenko Subject: [PATCH v4 3/3] replication: do not ignore replication_connect_quorum. Date: Tue, 14 Aug 2018 13:02:59 +0300 Message-Id: In-Reply-To: References: In-Reply-To: References: To: tarantool-patches@freelists.org Cc: georgy@tarantool.org, vdavydov.dev@gmail.com, Serge Petrenko List-ID: On bootstrap and after replication reconfiguration replication_connect_quorum was ignored. The instance tried to connect to every replica listed in replication parameter, and errored if it wasn't possible. The patch alters this behaviour. An instance still tries to connect to every node listed in replication, but does not raise an error if it was able to connect to at least replication_connect_quorum instances. Closes #3428 @TarantoolBot document Title: replication_connect_quorum is not ignored. Now on replica set bootstrap and in case of replication reconfiguration (e.g. calling box.cfg{replication=...} for the second time) tarantool doesn't fail, if it couldn't connect to to every replica, but could connect to replication_connect_quorum replicas. If after replication_connect_timeout seconds the instance is not connected to at least replication_connect_quorum other instances, we throw an error. --- src/box/box.cc | 35 +++++++++++++++++--------- src/box/replication.cc | 11 ++++++--- src/box/replication.h | 7 +++--- test/replication/quorum.result | 49 +++++++++++++++++++++++++++++++++++++ test/replication/quorum.test.lua | 20 +++++++++++++++ test/replication/replica_quorum.lua | 24 ++++++++++++++++++ 6 files changed, 128 insertions(+), 18 deletions(-) create mode 100644 test/replication/replica_quorum.lua diff --git a/src/box/box.cc b/src/box/box.cc index e3eb2738f..8d7454d1f 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -595,7 +595,7 @@ cfg_get_replication(int *p_count) * don't start appliers. */ static void -box_sync_replication(double timeout, bool connect_all) +box_sync_replication(bool connect_quorum) { int count = 0; struct applier **appliers = cfg_get_replication(&count); @@ -607,7 +607,7 @@ box_sync_replication(double timeout, bool connect_all) applier_delete(appliers[i]); /* doesn't affect diag */ }); - replicaset_connect(appliers, count, timeout, connect_all); + replicaset_connect(appliers, count, connect_quorum); guard.is_active = false; } @@ -625,8 +625,13 @@ box_set_replication(void) } box_check_replication(); - /* Try to connect to all replicas within the timeout period */ - box_sync_replication(replication_connect_timeout, true); + /* + * Try to connect to all replicas within the timeout period. + * The configuration will succeed as long as we've managed + * to connect to at least replication_connect_quorum + * masters. + */ + box_sync_replication(true); /* Follow replica */ replicaset_follow(); } @@ -1865,8 +1870,13 @@ box_cfg_xc(void) title("orphan"); - /* Wait for the cluster to start up */ - box_sync_replication(replication_connect_timeout, false); + /* + * In case of recovering from a checkpoint we + * don't need to wait for 'quorum' masters, since + * the recovered _cluster space will have all the + * information about cluster. + */ + box_sync_replication(false); } else { if (!tt_uuid_is_nil(&instance_uuid)) INSTANCE_UUID = instance_uuid; @@ -1883,12 +1893,15 @@ box_cfg_xc(void) /* * Wait for the cluster to start up. * - * Note, when bootstrapping a new instance, we have to - * connect to all masters to make sure all replicas - * receive the same replica set UUID when a new cluster - * is deployed. + * Note, when bootstrapping a new instance, we try to + * connect to all masters during timeout to make sure + * all replicas recieve the same replica set UUID when + * a new cluster is deployed. + * If we fail to do so, settle with connecting to + * 'replication_connect_quorum' masters. + * If this also fails, throw an error. */ - box_sync_replication(TIMEOUT_INFINITY, true); + box_sync_replication(true); /* Bootstrap a new master */ bootstrap(&replicaset_uuid, &is_bootstrap_leader); } diff --git a/src/box/replication.cc b/src/box/replication.cc index 528fe4459..861ce34ea 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -46,7 +46,7 @@ struct tt_uuid INSTANCE_UUID; struct tt_uuid REPLICASET_UUID; double replication_timeout = 1.0; /* seconds */ -double replication_connect_timeout = 4.0; /* seconds */ +double replication_connect_timeout = 30.0; /* seconds */ int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL; double replication_sync_lag = 10.0; /* seconds */ @@ -540,7 +540,7 @@ applier_on_connect_f(struct trigger *trigger, void *event) void replicaset_connect(struct applier **appliers, int count, - double timeout, bool connect_all) + bool connect_quorum) { if (count == 0) { /* Cleanup the replica set. */ @@ -571,6 +571,9 @@ replicaset_connect(struct applier **appliers, int count, state.connected = state.failed = 0; fiber_cond_create(&state.wakeup); + double timeout = replication_connect_timeout; + int quorum = MIN(count, replication_connect_quorum); + /* Add triggers and start simulations connection to remote peers */ for (int i = 0; i < count; i++) { struct applier *applier = appliers[i]; @@ -587,7 +590,7 @@ replicaset_connect(struct applier **appliers, int count, double wait_start = ev_monotonic_now(loop()); if (fiber_cond_wait_timeout(&state.wakeup, timeout) != 0) break; - if (state.failed > 0 && connect_all) + if (count - state.failed < quorum) break; timeout -= ev_monotonic_now(loop()) - wait_start; } @@ -595,7 +598,7 @@ replicaset_connect(struct applier **appliers, int count, say_crit("failed to connect to %d out of %d replicas", count - state.connected, count); /* Timeout or connection failure. */ - if (connect_all) + if (connect_quorum && state.connected < quorum) goto error; } else { say_verbose("connected to %d replicas", state.connected); diff --git a/src/box/replication.h b/src/box/replication.h index 95122eb45..06a2867b6 100644 --- a/src/box/replication.h +++ b/src/box/replication.h @@ -357,12 +357,13 @@ replicaset_add(uint32_t replica_id, const struct tt_uuid *instance_uuid); * \param appliers the array of appliers * \param count size of appliers array * \param timeout connection timeout - * \param connect_all if this flag is set, fail unless all - * appliers have successfully connected + * \param connect_quorum if this flag is set, fail unless at + * least replication_connect_quorum + * appliers have successfully connected. */ void replicaset_connect(struct applier **appliers, int count, - double timeout, bool connect_all); + bool connect_quorum); /** * Resume all appliers registered with the replica set. diff --git a/test/replication/quorum.result b/test/replication/quorum.result index a55b0d087..265b099b7 100644 --- a/test/replication/quorum.result +++ b/test/replication/quorum.result @@ -401,3 +401,52 @@ test_run:cmd("switch default") test_run:drop_cluster(SERVERS) --- ... +-- Test that quorum is not ignored neither during bootstrap, nor +-- during reconfiguration. +box.schema.user.grant('guest', 'replication') +--- +... +test_run:cmd('create server replica_quorum with script="replication/replica_quorum.lua"') +--- +- true +... +-- Arguments are: replication_connect_quorum, replication_timeout +-- replication_connect_timeout. +-- If replication_connect_quorum was ignored here, the instance +-- would exit with an error. +test_run:cmd('start server replica_quorum with wait=True, wait_load=True, args="1 0.05 0.1"') +--- +- true +... +test_run:cmd('switch replica_quorum') +--- +- true +... +-- If replication_connect_quorum was ignored here, the instance +-- would exit with an error. +box.cfg{replication={INSTANCE_URI, nonexistent_uri(1)}} +--- +... +box.info.id +--- +- 1 +... +test_run:cmd('switch default') +--- +- true +... +test_run:cmd('stop server replica_quorum') +--- +- true +... +test_run:cmd('cleanup server replica_quorum') +--- +- true +... +test_run:cmd('delete server replica_quorum') +--- +- true +... +box.schema.user.revoke('guest', 'replication') +--- +... diff --git a/test/replication/quorum.test.lua b/test/replication/quorum.test.lua index 8290f8ea5..5a43275c2 100644 --- a/test/replication/quorum.test.lua +++ b/test/replication/quorum.test.lua @@ -150,3 +150,23 @@ box.space.test:select() test_run:cmd("switch default") -- Cleanup. test_run:drop_cluster(SERVERS) + +-- Test that quorum is not ignored neither during bootstrap, nor +-- during reconfiguration. +box.schema.user.grant('guest', 'replication') +test_run:cmd('create server replica_quorum with script="replication/replica_quorum.lua"') +-- Arguments are: replication_connect_quorum, replication_timeout +-- replication_connect_timeout. +-- If replication_connect_quorum was ignored here, the instance +-- would exit with an error. +test_run:cmd('start server replica_quorum with wait=True, wait_load=True, args="1 0.05 0.1"') +test_run:cmd('switch replica_quorum') +-- If replication_connect_quorum was ignored here, the instance +-- would exit with an error. +box.cfg{replication={INSTANCE_URI, nonexistent_uri(1)}} +box.info.id +test_run:cmd('switch default') +test_run:cmd('stop server replica_quorum') +test_run:cmd('cleanup server replica_quorum') +test_run:cmd('delete server replica_quorum') +box.schema.user.revoke('guest', 'replication') diff --git a/test/replication/replica_quorum.lua b/test/replication/replica_quorum.lua new file mode 100644 index 000000000..dd42b8214 --- /dev/null +++ b/test/replication/replica_quorum.lua @@ -0,0 +1,24 @@ +#!/usr/bin/env tarantool + +local SOCKET_DIR = require('fio').cwd() + +local QUORUM = tonumber(arg[1]) +local TIMEOUT = arg[2] and tonumber(arg[2]) or 0.1 +local CON_TIMEOUT = arg[3] and tonumber(arg[3]) or 30.0 +INSTANCE_URI = SOCKET_DIR .. '/replica_quorum.sock' + +function nonexistent_uri(id) + return SOCKET_DIR .. '/replica_quorum' .. (1000 + id) .. '.sock' +end + +require('console').listen(os.getenv('ADMIN')) + +box.cfg{ + listen = INSTANCE_URI, + replication_timeout = TIMEOUT, + replication_connect_timeout = CON_TIMEOUT, + replication_connect_quorum = QUORUM, + replication = {INSTANCE_URI, + nonexistent_uri(1), + nonexistent_uri(2)} +} -- 2.15.2 (Apple Git-101.1)