From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Serge Petrenko Subject: [PATCH v3] replication: enter orphan mode on manual replication configuration change Date: Wed, 28 Aug 2019 14:20:28 +0300 Message-Id: <20190828112028.13875-1-sergepetrenko@tarantool.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit To: vdavydov.dev@gmail.com Cc: tarantool-patches@freelists.org, Serge Petrenko List-ID: Currently we only enter orphan mode when an instance fails to connect to replication_connect_quorum remote instances during local recovery. On bootstrap and manual replication configuration change an error is thrown. It is better to enter orphan mode on manual config change, and leave it only once we have managed to sync with replication_connect_quorum instances. Closes #4424 @TarantoolBot document Title: document reaction on error in replication configuration change. Now when issuing `box.cfg{replication={uri1, uri2, ...}}` and failing to sync with replication_connect_quorum remote instances, the server will throw an error if it is bootstrapping, and just set its state to orphan in all other cases (recovering from existing xlog/snap files or manually changing box.cfg.replication on the fly). To leave orphan mode, you may wait until the server manages to sync with replication_connect_quorum instances. Otherwise, to leave orphan mode you need to make the server sync with enough instances. 
To do so, you may either: 1) set replication_connect_quorum to a lower value 2) reset box.cfg.replication to exclude instances that cannot be reached or synced with 3) just set box.cfg.replication to "" (empty string) --- https://github.com/tarantool/tarantool/issues/4424 https://github.com/tarantool/tarantool/tree/sp/gh-4424-repl-config-errors-v33 src/box/box.cc | 14 +++-- src/box/box.h | 7 +++ src/box/replication.cc | 11 ++++ test/replication/misc.result | 104 ++++++++++++++++++++++++++++++++- test/replication/misc.test.lua | 41 ++++++++++++- 5 files changed, 168 insertions(+), 9 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index 66cd6d3a4..62d9665f4 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -257,7 +257,7 @@ box_wait_ro(bool ro, double timeout) } void -box_set_orphan(bool orphan) +box_set_orphan_raw(bool orphan) { if (is_orphan == orphan) return; /* nothing to do */ @@ -266,7 +266,12 @@ box_set_orphan(bool orphan) is_orphan = orphan; fiber_cond_broadcast(&ro_cond); +} +void +box_set_orphan(bool orphan) +{ + box_set_orphan_raw(orphan); /* Update the title to reflect the new status. */ if (is_orphan) { say_crit("entering orphan mode"); @@ -699,11 +704,10 @@ box_set_replication(void) box_check_replication(); /* * Try to connect to all replicas within the timeout period. - * The configuration will succeed as long as we've managed - * to connect to at least replication_connect_quorum - * masters. + * Stay in orphan mode in case we fail to connect to at least + * 'replication_connect_quorum' remote instances. */ - box_sync_replication(true); + box_sync_replication(false); /* Follow replica */ replicaset_follow(); /* Wait until appliers are in sync */ diff --git a/src/box/box.h b/src/box/box.h index ddcfbe2e5..3bd484353 100644 --- a/src/box/box.h +++ b/src/box/box.h @@ -127,6 +127,13 @@ box_wait_ro(bool ro, double timeout); void box_set_orphan(bool orphan); +/** + * Set orphan mode but don't update instance title. 
+ * \sa box_set_orphan + */ +void +box_set_orphan_raw(bool orphan); + /** * Iterate over all spaces and save them to the * snapshot file. diff --git a/src/box/replication.cc b/src/box/replication.cc index 28f7acedc..ac415fd46 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -610,6 +610,17 @@ replicaset_connect(struct applier **appliers, int count, say_info("connecting to %d replicas", count); + if (!connect_quorum) { + /* + * Enter orphan mode on configuration change and + * only leave it when we manage to sync with + * replicaset_quorum instances. Don't change + * title though, it should be 'loading' during + * local recovery. + */ + box_set_orphan_raw(true); + } + /* * Simultaneously connect to remote peers to receive their UUIDs * and fill the resulting set: diff --git a/test/replication/misc.result b/test/replication/misc.result index 0a57edda5..ae72ce3e4 100644 --- a/test/replication/misc.result +++ b/test/replication/misc.result @@ -18,10 +18,19 @@ replication_connect_timeout = box.cfg.replication_connect_timeout box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}} --- ... +box.cfg{replication_connect_quorum=2} +--- +... box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}} --- -- error: 'Incorrect value for option ''replication'': failed to connect to one or - more replicas' +... +box.info.status +--- +- orphan +... +box.info.ro +--- +- true ... -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently fiber = require('fiber') @@ -47,8 +56,16 @@ c:get() --- - true ... -box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +--- +... +box.info.status +--- +- running +... +box.info.ro --- +- false ... 
-- gh-3111 - Allow to rebootstrap a replica from a read-only master replica_uuid = uuid.new() @@ -729,3 +746,84 @@ test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') --- ... +-- +-- gh-4424 Always enter orphan mode on error in replication +-- configuration change. +-- +replication_connect_timeout = box.cfg.replication_connect_timeout +--- +... +replication_connect_quorum = box.cfg.replication_connect_quorum +--- +... +box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1} +--- +... +box.info.status +--- +- orphan +... +box.info.ro +--- +- true +... +-- reset replication => leave orphan mode +box.cfg{replication=""} +--- +... +box.info.status +--- +- running +... +box.info.ro +--- +- false +... +-- no switch to orphan when quorum == 0 +box.cfg{replication="12345", replication_connect_quorum=0} +--- +... +box.info.status +--- +- running +... +box.info.ro +--- +- false +... +-- we could connect to one out of two replicas. Set orphan. +box.cfg{replication_connect_quorum=2} +--- +... +box.cfg{replication={box.cfg.listen, "12345"}} +--- +... +box.info.status +--- +- orphan +... +box.info.ro +--- +- true +... +-- lower quorum => leave orphan mode +box.cfg{replication_connect_quorum=1} +--- +... +box.info.status +--- +- running +... +box.info.ro +--- +- false +... +box.cfg{replication=""} +--- +... +box.cfg{replication_connect_timeout=replication_connect_timeout} +--- +... +box.cfg{replication_connect_quorum=replication_connect_quorum} +--- +... 
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua index 99e995509..16e7e9e42 100644 --- a/test/replication/misc.test.lua +++ b/test/replication/misc.test.lua @@ -8,7 +8,10 @@ box.schema.user.grant('guest', 'replication') replication_timeout = box.cfg.replication_timeout replication_connect_timeout = box.cfg.replication_connect_timeout box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}} +box.cfg{replication_connect_quorum=2} box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}} +box.info.status +box.info.ro -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently fiber = require('fiber') @@ -19,7 +22,9 @@ f() c:get() c:get() -box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.info.status +box.info.ro -- gh-3111 - Allow to rebootstrap a replica from a read-only master replica_uuid = uuid.new() @@ -293,3 +298,37 @@ test_run:cmd("cleanup server replica") test_run:cmd("delete server replica") test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') + +-- +-- gh-4424 Always enter orphan mode on error in replication +-- configuration change. +-- +replication_connect_timeout = box.cfg.replication_connect_timeout +replication_connect_quorum = box.cfg.replication_connect_quorum +box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1} +box.info.status +box.info.ro +-- reset replication => leave orphan mode +box.cfg{replication=""} +box.info.status +box.info.ro +-- no switch to orphan when quorum == 0 +box.cfg{replication="12345", replication_connect_quorum=0} +box.info.status +box.info.ro + +-- we could connect to one out of two replicas. Set orphan. 
+box.cfg{replication_connect_quorum=2} +box.cfg{replication={box.cfg.listen, "12345"}} +box.info.status +box.info.ro +-- lower quorum => leave orphan mode +box.cfg{replication_connect_quorum=1} +box.info.status +box.info.ro + +box.cfg{replication=""} + + +box.cfg{replication_connect_timeout=replication_connect_timeout} +box.cfg{replication_connect_quorum=replication_connect_quorum} -- 2.20.1 (Apple Git-117)