[PATCH] replication: enter orphan mode on every erroneous config change
Serge Petrenko
sergepetrenko at tarantool.org
Mon Aug 19 15:11:01 MSK 2019
We only entered orphan mode on bootstrap and local recovery, but threw an
error when the replication config was changed on the fly.
For consistency, in this case we should also enter orphan mode when
an instance fails to connect to a quorum of remote instances.
Closes #4424
---
https://github.com/tarantool/tarantool/issues/4424
https://github.com/tarantool/tarantool/tree/sp/gh-4424-repl-config-errors
src/box/box.cc | 9 ++--
src/box/replication.cc | 30 ++++++-------
src/box/replication.h | 2 +-
test/replication/misc.result | 77 ++++++++++++++++++++++++++++++++--
test/replication/misc.test.lua | 29 ++++++++++++-
5 files changed, 125 insertions(+), 22 deletions(-)
diff --git a/src/box/box.cc b/src/box/box.cc
index 66cd6d3a4..43cc32d87 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -666,7 +666,7 @@ cfg_get_replication(int *p_count)
* Sync box.cfg.replication with the cluster registry, but
* don't start appliers.
*/
-static void
+static int
box_sync_replication(bool connect_quorum)
{
int count = 0;
@@ -679,9 +679,10 @@ box_sync_replication(bool connect_quorum)
applier_delete(appliers[i]); /* doesn't affect diag */
});
- replicaset_connect(appliers, count, connect_quorum);
+ int rc = replicaset_connect(appliers, count, connect_quorum);
guard.is_active = false;
+ return rc;
}
void
@@ -703,7 +704,9 @@ box_set_replication(void)
* to connect to at least replication_connect_quorum
* masters.
*/
- box_sync_replication(true);
+ if (box_sync_replication(true) != 0) {
+ return;
+ }
/* Follow replica */
replicaset_follow();
/* Wait until appliers are in sync */
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 28f7acedc..e9d0a9206 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -598,14 +598,14 @@ applier_on_connect_f(struct trigger *trigger, void *event)
applier_pause(applier);
}
-void
+int
replicaset_connect(struct applier **appliers, int count,
bool connect_quorum)
{
if (count == 0) {
/* Cleanup the replica set. */
replicaset_update(appliers, count);
- return;
+ return 0;
}
say_info("connecting to %d replicas", count);
@@ -660,9 +660,13 @@ replicaset_connect(struct applier **appliers, int count,
count - state.connected, count);
/* Timeout or connection failure. */
if (connect_quorum && state.connected < quorum) {
- diag_set(ClientError, ER_CFG, "replication",
- "failed to connect to one or more replicas");
- goto error;
+ /* Destroy appliers */
+ for (int i = 0; i < count; i++) {
+ trigger_clear(&triggers[i].base);
+ applier_stop(appliers[i]);
+ }
+ box_set_orphan(true);
+ return -1;
}
} else {
say_info("connected to %d replicas", state.connected);
@@ -685,16 +689,14 @@ replicaset_connect(struct applier **appliers, int count,
try {
replicaset_update(appliers, count);
} catch (Exception *e) {
- goto error;
- }
- return;
-error:
- /* Destroy appliers */
- for (int i = 0; i < count; i++) {
- trigger_clear(&triggers[i].base);
- applier_stop(appliers[i]);
+ /* Destroy appliers */
+ for (int i = 0; i < count; i++) {
+ trigger_clear(&triggers[i].base);
+ applier_stop(appliers[i]);
+ }
+ diag_raise();
}
- diag_raise();
+ return 0;
}
bool
diff --git a/src/box/replication.h b/src/box/replication.h
index 19f283c7d..62d9de8ce 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -378,7 +378,7 @@ replicaset_add(uint32_t replica_id, const struct tt_uuid *instance_uuid);
* least replication_connect_quorum
* appliers have successfully connected.
*/
-void
+int
replicaset_connect(struct applier **appliers, int count,
bool connect_quorum);
diff --git a/test/replication/misc.result b/test/replication/misc.result
index 0a57edda5..c6fd19db9 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -20,8 +20,14 @@ box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication=
...
box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}}
---
-- error: 'Incorrect value for option ''replication'': failed to connect to one or
- more replicas'
+...
+box.info.status
+---
+- orphan
+...
+box.info.ro
+---
+- true
...
-- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently
fiber = require('fiber')
@@ -47,8 +53,16 @@ c:get()
---
- true
...
-box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
---
+- false
...
-- gh-3111 - Allow to rebootstrap a replica from a read-only master
replica_uuid = uuid.new()
@@ -729,3 +743,60 @@ test_run:cleanup_cluster()
box.schema.user.revoke('guest', 'replication')
---
...
+--
+-- gh-4424 Always enter orphan mode on error in replication
+-- configuration change.
+--
+replication_connect_timeout = box.cfg.replication_connect_timeout
+---
+...
+replication_connect_quorum = box.cfg.replication_connect_quorum
+---
+...
+box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1}
+---
+...
+box.info.status
+---
+- orphan
+...
+box.info.ro
+---
+- true
+...
+box.cfg{replication = ""}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
+---
+- false
+...
+-- no switch to orphan when quorum == 0
+box.cfg{replication= "12345", replication_connect_quorum=0}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
+---
+- false
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+box.cfg{replication="",
+ replication_connect_timeout=replication_connect_timeout,
+ replication_connect_quorum=replication_connect_quorum};
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 99e995509..91a77b584 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -9,6 +9,8 @@ replication_timeout = box.cfg.replication_timeout
replication_connect_timeout = box.cfg.replication_connect_timeout
box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}}
box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}}
+box.info.status
+box.info.ro
-- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently
fiber = require('fiber')
@@ -19,7 +21,9 @@ f()
c:get()
c:get()
-box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.info.status
+box.info.ro
-- gh-3111 - Allow to rebootstrap a replica from a read-only master
replica_uuid = uuid.new()
@@ -293,3 +297,26 @@ test_run:cmd("cleanup server replica")
test_run:cmd("delete server replica")
test_run:cleanup_cluster()
box.schema.user.revoke('guest', 'replication')
+
+--
+-- gh-4424 Always enter orphan mode on error in replication
+-- configuration change.
+--
+replication_connect_timeout = box.cfg.replication_connect_timeout
+replication_connect_quorum = box.cfg.replication_connect_quorum
+
+box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1}
+box.info.status
+box.info.ro
+box.cfg{replication = ""}
+box.info.status
+box.info.ro
+-- no switch to orphan when quorum == 0
+box.cfg{replication= "12345", replication_connect_quorum=0}
+box.info.status
+box.info.ro
+test_run:cmd("setopt delimiter ';'")
+box.cfg{replication="",
+ replication_connect_timeout=replication_connect_timeout,
+ replication_connect_quorum=replication_connect_quorum};
+test_run:cmd("setopt delimiter ''");
--
2.20.1 (Apple Git-117)
More information about the Tarantool-patches
mailing list