[PATCH] replication: enter orphan mode on every erroneous config change

Serge Petrenko sergepetrenko at tarantool.org
Mon Aug 19 15:11:01 MSK 2019


We only entered orphan mode on bootstrap and local recovery, but threw an
error when the replication config was changed on the fly.
For consistency, in this case we should also enter orphan mode when
an instance fails to connect to a quorum of remote instances.

Closes #4424
---
https://github.com/tarantool/tarantool/issues/4424
https://github.com/tarantool/tarantool/tree/sp/gh-4424-repl-config-errors

 src/box/box.cc                 |  9 ++--
 src/box/replication.cc         | 30 ++++++-------
 src/box/replication.h          |  2 +-
 test/replication/misc.result   | 77 ++++++++++++++++++++++++++++++++--
 test/replication/misc.test.lua | 29 ++++++++++++-
 5 files changed, 125 insertions(+), 22 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 66cd6d3a4..43cc32d87 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -666,7 +666,7 @@ cfg_get_replication(int *p_count)
  * Sync box.cfg.replication with the cluster registry, but
  * don't start appliers.
  */
-static void
+static int
 box_sync_replication(bool connect_quorum)
 {
 	int count = 0;
@@ -679,9 +679,10 @@ box_sync_replication(bool connect_quorum)
 			applier_delete(appliers[i]); /* doesn't affect diag */
 	});
 
-	replicaset_connect(appliers, count, connect_quorum);
+	int rc = replicaset_connect(appliers, count, connect_quorum);
 
 	guard.is_active = false;
+	return rc;
 }
 
 void
@@ -703,7 +704,9 @@ box_set_replication(void)
 	 * to connect to at least replication_connect_quorum
 	 * masters.
 	 */
-	box_sync_replication(true);
+	if (box_sync_replication(true) != 0) {
+		return;
+	}
 	/* Follow replica */
 	replicaset_follow();
 	/* Wait until appliers are in sync */
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 28f7acedc..e9d0a9206 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -598,14 +598,14 @@ applier_on_connect_f(struct trigger *trigger, void *event)
 	applier_pause(applier);
 }
 
-void
+int
 replicaset_connect(struct applier **appliers, int count,
 		   bool connect_quorum)
 {
 	if (count == 0) {
 		/* Cleanup the replica set. */
 		replicaset_update(appliers, count);
-		return;
+		return 0;
 	}
 
 	say_info("connecting to %d replicas", count);
@@ -660,9 +660,13 @@ replicaset_connect(struct applier **appliers, int count,
 			 count - state.connected, count);
 		/* Timeout or connection failure. */
 		if (connect_quorum && state.connected < quorum) {
-			diag_set(ClientError, ER_CFG, "replication",
-				 "failed to connect to one or more replicas");
-			goto error;
+			/* Destroy appliers */
+			for (int i = 0; i < count; i++) {
+				trigger_clear(&triggers[i].base);
+				applier_stop(appliers[i]);
+			}
+			box_set_orphan(true);
+			return -1;
 		}
 	} else {
 		say_info("connected to %d replicas", state.connected);
@@ -685,16 +689,14 @@ replicaset_connect(struct applier **appliers, int count,
 	try {
 		replicaset_update(appliers, count);
 	} catch (Exception *e) {
-		goto error;
-	}
-	return;
-error:
-	/* Destroy appliers */
-	for (int i = 0; i < count; i++) {
-		trigger_clear(&triggers[i].base);
-		applier_stop(appliers[i]);
+		/* Destroy appliers */
+		for (int i = 0; i < count; i++) {
+			trigger_clear(&triggers[i].base);
+			applier_stop(appliers[i]);
+		}
+		diag_raise();
 	}
-	diag_raise();
+	return 0;
 }
 
 bool
diff --git a/src/box/replication.h b/src/box/replication.h
index 19f283c7d..62d9de8ce 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -378,7 +378,7 @@ replicaset_add(uint32_t replica_id, const struct tt_uuid *instance_uuid);
  *                       least replication_connect_quorum
  *                       appliers have successfully connected.
  */
-void
+int
 replicaset_connect(struct applier **appliers, int count,
 		   bool connect_quorum);
 
diff --git a/test/replication/misc.result b/test/replication/misc.result
index 0a57edda5..c6fd19db9 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -20,8 +20,14 @@ box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication=
 ...
 box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}}
 ---
-- error: 'Incorrect value for option ''replication'': failed to connect to one or
-    more replicas'
+...
+box.info.status
+---
+- orphan
+...
+box.info.ro
+---
+- true
 ...
 -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently
 fiber = require('fiber')
@@ -47,8 +53,16 @@ c:get()
 ---
 - true
 ...
-box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
 ---
+- false
 ...
 -- gh-3111 - Allow to rebootstrap a replica from a read-only master
 replica_uuid = uuid.new()
@@ -729,3 +743,60 @@ test_run:cleanup_cluster()
 box.schema.user.revoke('guest', 'replication')
 ---
 ...
+--
+-- gh-4424 Always enter orphan mode on error in replication
+-- configuration change.
+--
+replication_connect_timeout = box.cfg.replication_connect_timeout
+---
+...
+replication_connect_quorum = box.cfg.replication_connect_quorum
+---
+...
+box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1}
+---
+...
+box.info.status
+---
+- orphan
+...
+box.info.ro
+---
+- true
+...
+box.cfg{replication = ""}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
+---
+- false
+...
+-- no switch to orphan when quorum == 0
+box.cfg{replication= "12345", replication_connect_quorum=0}
+---
+...
+box.info.status
+---
+- running
+...
+box.info.ro
+---
+- false
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+box.cfg{replication="",
+        replication_connect_timeout=replication_connect_timeout,
+        replication_connect_quorum=replication_connect_quorum};
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 99e995509..91a77b584 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -9,6 +9,8 @@ replication_timeout = box.cfg.replication_timeout
 replication_connect_timeout = box.cfg.replication_connect_timeout
 box.cfg{replication_timeout=0.05, replication_connect_timeout=0.05, replication={}}
 box.cfg{replication = {'127.0.0.1:12345', box.cfg.listen}}
+box.info.status
+box.info.ro
 
 -- gh-3606 - Tarantool crashes if box.cfg.replication is updated concurrently
 fiber = require('fiber')
@@ -19,7 +21,9 @@ f()
 c:get()
 c:get()
 
-box.cfg{replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.cfg{replication = "", replication_timeout = replication_timeout, replication_connect_timeout = replication_connect_timeout}
+box.info.status
+box.info.ro
 
 -- gh-3111 - Allow to rebootstrap a replica from a read-only master
 replica_uuid = uuid.new()
@@ -293,3 +297,26 @@ test_run:cmd("cleanup server replica")
 test_run:cmd("delete server replica")
 test_run:cleanup_cluster()
 box.schema.user.revoke('guest', 'replication')
+
+--
+-- gh-4424 Always enter orphan mode on error in replication
+-- configuration change.
+--
+replication_connect_timeout = box.cfg.replication_connect_timeout
+replication_connect_quorum = box.cfg.replication_connect_quorum
+
+box.cfg{replication="12345", replication_connect_timeout=0.1, replication_connect_quorum=1}
+box.info.status
+box.info.ro
+box.cfg{replication = ""}
+box.info.status
+box.info.ro
+-- no switch to orphan when quorum == 0
+box.cfg{replication= "12345", replication_connect_quorum=0}
+box.info.status
+box.info.ro
+test_run:cmd("setopt delimiter ';'")
+box.cfg{replication="",
+        replication_connect_timeout=replication_connect_timeout,
+        replication_connect_quorum=replication_connect_quorum};
+test_run:cmd("setopt delimiter ''");
-- 
2.20.1 (Apple Git-117)




More information about the Tarantool-patches mailing list