From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Serge Petrenko Subject: [PATCH] replication: enter orphan mode on every erroneous config change Date: Mon, 19 Aug 2019 15:11:01 +0300 Message-Id: <20190819121101.29815-1-sergepetrenko@tarantool.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit To: vdavydov.dev@gmail.com Cc: tarantool-patches@freelists.org, Serge Petrenko List-ID: We only entered orphan mode on bootrap and local recovery, but threw an error when replicaton config was changed on the fly. For consistency, in this case we should also enter orphan mode when an instance fails to connect to quorum remote instances. Closes #4424 --- https://github.com/tarantool/tarantool/issues/4424 https://github.com/tarantool/tarantool/tree/sp/gh-4424-repl-config-errors src/box/box.cc | 9 ++-- src/box/replication.cc | 30 ++++++------- src/box/replication.h | 2 +- test/replication/misc.result | 77 ++++++++++++++++++++++++++++++++-- test/replication/misc.test.lua | 29 ++++++++++++- 5 files changed, 125 insertions(+), 22 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index 66cd6d3a4..43cc32d87 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -666,7 +666,7 @@ cfg_get_replication(int *p_count) * Sync box.cfg.replication with the cluster registry, but * don't start appliers. */ -static void +static int box_sync_replication(bool connect_quorum) { int count = 0; @@ -679,9 +679,10 @@ box_sync_replication(bool connect_quorum) applier_delete(appliers[i]); /* doesn't affect diag */ }); - replicaset_connect(appliers, count, connect_quorum); + int rc = replicaset_connect(appliers, count, connect_quorum); guard.is_active = false; + return rc; } void @@ -703,7 +704,9 @@ box_set_replication(void) * to connect to at least replication_connect_quorum * masters. */ - box_sync_replication(true); + if (box_sync_replication(true) != 0) { + return; + } /* Follow replica */ replicaset_follow(); /* Wait until appliers are in sync */ diff --git a/src/box/replication.cc b/src/box/replication.cc index 28f7acedc..e9d0a9206 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -598,14 +598,14 @@ applier_on_connect_f(struct trigger *trigger, void *event) applier_pause(applier); } -void +int replicaset_connect(struct applier **appliers, int count, bool connect_quorum) { if (count == 0) { /* Cleanup the replica set. */ replicaset_update(appliers, count); - return; + return 0; } say_info("connecting to %d replicas", count); @@ -660,9 +660,13 @@ replicaset_connect(struct applier **appliers, int count, count - state.connected, count); /* Timeout or connection failure. */ if (connect_quorum && state.connected < quorum) { - diag_set(ClientError, ER_CFG, "replication", - "failed to connect to one or more replicas"); - goto error; + /* Destroy appliers */ + for (int i = 0; i < count; i++) { + trigger_clear(&triggers[i].base); + applier_stop(appliers[i]); + } + box_set_orphan(true); + return -1; } } else { say_info("connected to %d replicas", state.connected); @@ -685,16 +689,14 @@ replicaset_connect(struct applier **appliers, int count, try { replicaset_update(appliers, count); } catch (Exception *e) { - goto error; - } - return; -error: - /* Destroy appliers */ - for (int i = 0; i < count; i++) { - trigger_clear(&triggers[i].base); - applier_stop(appliers[i]); + /* Destroy appliers */ + for (int i = 0; i < count; i++) { + trigger_clear(&triggers[i].base); + applier_stop(appliers[i]); + } + diag_raise(); } - diag_raise(); + return 0; } bool diff --git a/src/box/replication.h b/src/box/replication.h index 19f283c7d..62d9de8ce 100644 --- a/src/box/replication.h +++ b/src/box/replication.h @@ -378,7 +378,7 @@ replicaset_add(uint32_t replica_id, const struct tt_uuid *instance_uuid); * least replication_connect_quorum * appliers have successfully connected. */ -void +int replicaset_connect(struct applier **appliers, int count, bool connect_quorum); diff --git a/test/replication/misc.result b/test/replication/misc.result index 0a57edda5..c6fd19db9 100644 --- a/test/replication/misc.result +++ b/test/replication/misc.result @@ -20,8 +20,14 @@ box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication= ... box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}} --- -- error: 'Incorrect value for option ''replication'': failed to connect to one or - more replicas' +... +box.info.status +--- +- orphan +... +box.info.ro +--- +- true ... -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently fiber = require('fiber') @@ -47,8 +53,16 @@ c:get() --- - true ... -box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +--- +... +box.info.status +--- +- running +... +box.info.ro --- +- false ... -- gh-3111 - Allow to rebootstrap a replica from a read-only master replica_uuid = uuid.new() @@ -729,3 +743,60 @@ test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') --- ... +-- +-- gh-4424 Always enter orphan mode on error in replication +-- configuration change. +-- +replication_connect_timeout = box.cfg.replication_connect_timeout +--- +... +replication_connect_quorum = box.cfg.replication_connect_quorum +--- +... +box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1} +--- +... +box.info.status +--- +- orphan +... +box.info.ro +--- +- true +... +box.cfg{replication = ""} +--- +... +box.info.status +--- +- running +... +box.info.ro +--- +- false +... +-- no switch to orphan when quorum == 0 +box.cfg{replication= "12345", replication_connect_quorum=0} +--- +... +box.info.status +--- +- running +... +box.info.ro +--- +- false +... +test_run:cmd("setopt delimiter ';'") +--- +- true +... +box.cfg{replication="", + replication_connect_timeout=replication_connect_timeout, + replication_connect_quorum=replication_connect_quorum}; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua index 99e995509..91a77b584 100644 --- a/test/replication/misc.test.lua +++ b/test/replication/misc.test.lua @@ -9,6 +9,8 @@ replication_timeout = box.cfg.replication_timeout replication_connect_timeout = box.cfg.replication_connect_timeout box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}} box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}} +box.info.status +box.info.ro -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently fiber = require('fiber') @@ -19,7 +21,9 @@ f() c:get() c:get() -box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} +box.info.status +box.info.ro -- gh-3111 - Allow to rebootstrap a replica from a read-only master replica_uuid = uuid.new() @@ -293,3 +297,26 @@ test_run:cmd("cleanup server replica") test_run:cmd("delete server replica") test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') + +-- +-- gh-4424 Always enter orphan mode on error in replication +-- configuration change. +-- +replication_connect_timeout = box.cfg.replication_connect_timeout +replication_connect_quorum = box.cfg.replication_connect_quorum + +box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1} +box.info.status +box.info.ro +box.cfg{replication = ""} +box.info.status +box.info.ro +-- no switch to orphan when quorum == 0 +box.cfg{replication= "12345", replication_connect_quorum=0} +box.info.status +box.info.ro +test_run:cmd("setopt delimiter ';'") +box.cfg{replication="", + replication_connect_timeout=replication_connect_timeout, + replication_connect_quorum=replication_connect_quorum}; +test_run:cmd("setopt delimiter ''"); -- 2.20.1 (Apple Git-117)