Hi! Thank you for review. I’ve addressed your comments in v2. Please, check it out > 21 авг. 2019 г., в 12:44, Vladimir Davydov написал(а): > > On Mon, Aug 19, 2019 at 03:11:01PM +0300, Serge Petrenko wrote: >> We only entered orphan mode on bootrap and local recovery, but threw an >> error when replicaton config was changed on the fly. >> For consistency, in this case we should also enter orphan mode when >> an instance fails to connect to quorum remote instances. >> >> Closes #4424 > > A doc bot request is probably required. > >> --- >> https://github.com/tarantool/tarantool/issues/4424 >> https://github.com/tarantool/tarantool/tree/sp/gh-4424-repl-config-errors >> >> src/box/box.cc | 9 ++-- >> src/box/replication.cc | 30 ++++++------- >> src/box/replication.h | 2 +- >> test/replication/misc.result | 77 ++++++++++++++++++++++++++++++++-- >> test/replication/misc.test.lua | 29 ++++++++++++- >> 5 files changed, 125 insertions(+), 22 deletions(-) >> >> diff --git a/src/box/box.cc b/src/box/box.cc >> index 66cd6d3a4..43cc32d87 100644 >> --- a/src/box/box.cc >> +++ b/src/box/box.cc >> @@ -666,7 +666,7 @@ cfg_get_replication(int *p_count) >> * Sync box.cfg.replication with the cluster registry, but >> * don't start appliers. >> */ >> -static void >> +static int >> box_sync_replication(bool connect_quorum) >> { >> int count = 0; >> @@ -679,9 +679,10 @@ box_sync_replication(bool connect_quorum) >> applier_delete(appliers[i]); /* doesn't affect diag */ >> }); >> >> - replicaset_connect(appliers, count, connect_quorum); >> + int rc = replicaset_connect(appliers, count, connect_quorum); > > A function typically returns -1 as an error condition, but in this case > -1 means something different. Besdies, this function can also throw an > exception, and there isn't a single word in the comment regarding it, > which is confusing. > > Can you somehow manage without using a return code? > >> >> guard.is_active = false; >> + return rc; >> } >> >> void >> @@ -703,7 +704,9 @@ box_set_replication(void) >> * to connect to at least replication_connect_quorum >> * masters. >> */ >> - box_sync_replication(true); >> + if (box_sync_replication(true) != 0) { >> + return; >> + } > > The comment above is clearly outdated by this patch. > >> /* Follow replica */ >> replicaset_follow(); >> /* Wait until appliers are in sync */ >> diff --git a/src/box/replication.cc b/src/box/replication.cc >> index 28f7acedc..e9d0a9206 100644 >> --- a/src/box/replication.cc >> +++ b/src/box/replication.cc >> @@ -598,14 +598,14 @@ applier_on_connect_f(struct trigger *trigger, void *event) >> applier_pause(applier); >> } >> >> -void >> +int >> replicaset_connect(struct applier **appliers, int count, >> bool connect_quorum) >> { >> if (count == 0) { >> /* Cleanup the replica set. */ >> replicaset_update(appliers, count); >> - return; >> + return 0; >> } >> >> say_info("connecting to %d replicas", count); >> @@ -660,9 +660,13 @@ replicaset_connect(struct applier **appliers, int count, >> count - state.connected, count); >> /* Timeout or connection failure. */ >> if (connect_quorum && state.connected < quorum) { >> - diag_set(ClientError, ER_CFG, "replication", >> - "failed to connect to one or more replicas"); >> - goto error; >> + /* Destroy appliers */ >> + for (int i = 0; i < count; i++) { >> + trigger_clear(&triggers[i].base); >> + applier_stop(appliers[i]); >> + } >> + box_set_orphan(true); >> + return -1; >> } >> } else { >> say_info("connected to %d replicas", state.connected); >> @@ -685,16 +689,14 @@ replicaset_connect(struct applier **appliers, int count, >> try { >> replicaset_update(appliers, count); >> } catch (Exception *e) { >> - goto error; >> - } >> - return; >> -error: >> - /* Destroy appliers */ >> - for (int i = 0; i < count; i++) { >> - trigger_clear(&triggers[i].base); >> - applier_stop(appliers[i]); >> + /* Destroy appliers */ >> + for (int i = 0; i < count; i++) { >> + trigger_clear(&triggers[i].base); >> + applier_stop(appliers[i]); >> + } > > Copy-and-paste... Let's please try to avoid that. I'd prefer if you > called replicaset_follow and _sync in any case and let them decide > if the instance should enter the orphan state. > >> + diag_raise(); >> } >> - diag_raise(); >> + return 0; >> } >> >> bool >> diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua >> index 99e995509..91a77b584 100644 >> --- a/test/replication/misc.test.lua >> +++ b/test/replication/misc.test.lua >> @@ -9,6 +9,8 @@ replication_timeout = box.cfg.replication_timeout >> replication_connect_timeout = box.cfg.replication_connect_timeout >> box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}} >> box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}} >> +box.info.status >> +box.info.ro >> >> -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently >> fiber = require('fiber') >> @@ -19,7 +21,9 @@ f() >> c:get() >> c:get() >> >> -box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} >> +box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout} >> +box.info.status >> +box.info.ro >> >> -- gh-3111 - Allow to rebootstrap a replica from a read-only master >> replica_uuid = uuid.new() >> @@ -293,3 +297,26 @@ test_run:cmd("cleanup server replica") >> test_run:cmd("delete server replica") >> test_run:cleanup_cluster() >> box.schema.user.revoke('guest', 'replication') >> + >> +-- >> +-- gh-4424 Always enter orphan mode on error in replication >> +-- configuration change. >> +-- >> +replication_connect_timeout = box.cfg.replication_connect_timeout >> +replication_connect_quorum = box.cfg.replication_connect_quorum >> + >> +box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1} > > Let's please also test a more sophisticated use case: when there are two > replicas and we manage to connect to one of them but fail to connect to > another. > >> +box.info.status >> +box.info.ro >> +box.cfg{replication = ""} >> +box.info.status >> +box.info.ro >> +-- no switch to orphan when quorum == 0 >> +box.cfg{replication= "12345", replication_connect_quorum=0} >> +box.info.status >> +box.info.ro >> +test_run:cmd("setopt delimiter ';'") >> +box.cfg{replication="", >> + replication_connect_timeout=replication_connect_timeout, >> + replication_connect_quorum=replication_connect_quorum}; >> +test_run:cmd("setopt delimiter ''"); > > Nit: I'd rewrite it without using 'setopt delimiter':: > > box.cfg{replication = {}} > box.cfg{replication_connect_timeout = replication_connect_timeout) > box.cfg{replication_connect_quorum = replication_connect_quorum) -- Serge Petrenko sergepetrenko@tarantool.org