[PATCH v4 3/3] replication: do not ignore replication_connect_quorum.

Serge Petrenko sergepetrenko at tarantool.org
Tue Aug 14 13:02:59 MSK 2018


On bootstrap and after replication reconfiguration
replication_connect_quorum was ignored. The instance tried to connect to
every replica listed in replication parameter, and errored if it wasn't
possible.
The patch alters this behaviour. An instance still tries to connect to
every node listed in replication, but does not raise an error if it was
able to connect to at least replication_connect_quorum instances.

Closes #3428

@TarantoolBot document
Title: replication_connect_quorum is not ignored.
Now on replica set bootstrap and in case of replication reconfiguration
(e.g. calling box.cfg{replication=...} for the second time) tarantool
doesn't fail, if it couldn't connect to to every replica, but could
connect to replication_connect_quorum replicas. If after
replication_connect_timeout seconds the instance is not connected to at
least replication_connect_quorum other instances, we throw an error.
---
 src/box/box.cc                      | 35 +++++++++++++++++---------
 src/box/replication.cc              | 11 ++++++---
 src/box/replication.h               |  7 +++---
 test/replication/quorum.result      | 49 +++++++++++++++++++++++++++++++++++++
 test/replication/quorum.test.lua    | 20 +++++++++++++++
 test/replication/replica_quorum.lua | 24 ++++++++++++++++++
 6 files changed, 128 insertions(+), 18 deletions(-)
 create mode 100644 test/replication/replica_quorum.lua

diff --git a/src/box/box.cc b/src/box/box.cc
index e3eb2738f..8d7454d1f 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -595,7 +595,7 @@ cfg_get_replication(int *p_count)
  * don't start appliers.
  */
 static void
-box_sync_replication(double timeout, bool connect_all)
+box_sync_replication(bool connect_quorum)
 {
 	int count = 0;
 	struct applier **appliers = cfg_get_replication(&count);
@@ -607,7 +607,7 @@ box_sync_replication(double timeout, bool connect_all)
 			applier_delete(appliers[i]); /* doesn't affect diag */
 	});
 
-	replicaset_connect(appliers, count, timeout, connect_all);
+	replicaset_connect(appliers, count, connect_quorum);
 
 	guard.is_active = false;
 }
@@ -625,8 +625,13 @@ box_set_replication(void)
 	}
 
 	box_check_replication();
-	/* Try to connect to all replicas within the timeout period */
-	box_sync_replication(replication_connect_timeout, true);
+	/*
+	 * Try to connect to all replicas within the timeout period.
+	 * The configuration will succeed as long as we've managed
+	 * to connect to at least replication_connect_quorum
+	 * masters.
+	 */
+	box_sync_replication(true);
 	/* Follow replica */
 	replicaset_follow();
 }
@@ -1865,8 +1870,13 @@ box_cfg_xc(void)
 
 		title("orphan");
 
-		/* Wait for the cluster to start up */
-		box_sync_replication(replication_connect_timeout, false);
+		/*
+		 * In case of recovering from a checkpoint we
+		 * don't need to wait for 'quorum' masters, since
+		 * the recovered _cluster space will have all the
+		 * information about cluster.
+		 */
+		box_sync_replication(false);
 	} else {
 		if (!tt_uuid_is_nil(&instance_uuid))
 			INSTANCE_UUID = instance_uuid;
@@ -1883,12 +1893,15 @@ box_cfg_xc(void)
 		/*
 		 * Wait for the cluster to start up.
 		 *
-		 * Note, when bootstrapping a new instance, we have to
-		 * connect to all masters to make sure all replicas
-		 * receive the same replica set UUID when a new cluster
-		 * is deployed.
+		 * Note, when bootstrapping a new instance, we try to
+		 * connect to all masters during timeout to make sure
+		 * all replicas recieve the same replica set UUID when
+		 * a new cluster is deployed.
+		 * If we fail to do so, settle with connecting to
+		 * 'replication_connect_quorum' masters.
+		 * If this also fails, throw an error.
 		 */
-		box_sync_replication(TIMEOUT_INFINITY, true);
+		box_sync_replication(true);
 		/* Bootstrap a new master */
 		bootstrap(&replicaset_uuid, &is_bootstrap_leader);
 	}
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 528fe4459..861ce34ea 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -46,7 +46,7 @@ struct tt_uuid INSTANCE_UUID;
 struct tt_uuid REPLICASET_UUID;
 
 double replication_timeout = 1.0; /* seconds */
-double replication_connect_timeout = 4.0; /* seconds */
+double replication_connect_timeout = 30.0; /* seconds */
 int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL;
 double replication_sync_lag = 10.0; /* seconds */
 
@@ -540,7 +540,7 @@ applier_on_connect_f(struct trigger *trigger, void *event)
 
 void
 replicaset_connect(struct applier **appliers, int count,
-		   double timeout, bool connect_all)
+		   bool connect_quorum)
 {
 	if (count == 0) {
 		/* Cleanup the replica set. */
@@ -571,6 +571,9 @@ replicaset_connect(struct applier **appliers, int count,
 	state.connected = state.failed = 0;
 	fiber_cond_create(&state.wakeup);
 
+	double timeout = replication_connect_timeout;
+	int quorum = MIN(count, replication_connect_quorum);
+
 	/* Add triggers and start simulations connection to remote peers */
 	for (int i = 0; i < count; i++) {
 		struct applier *applier = appliers[i];
@@ -587,7 +590,7 @@ replicaset_connect(struct applier **appliers, int count,
 		double wait_start = ev_monotonic_now(loop());
 		if (fiber_cond_wait_timeout(&state.wakeup, timeout) != 0)
 			break;
-		if (state.failed > 0 && connect_all)
+		if (count - state.failed < quorum)
 			break;
 		timeout -= ev_monotonic_now(loop()) - wait_start;
 	}
@@ -595,7 +598,7 @@ replicaset_connect(struct applier **appliers, int count,
 		say_crit("failed to connect to %d out of %d replicas",
 			 count - state.connected, count);
 		/* Timeout or connection failure. */
-		if (connect_all)
+		if (connect_quorum && state.connected < quorum)
 			goto error;
 	} else {
 		say_verbose("connected to %d replicas", state.connected);
diff --git a/src/box/replication.h b/src/box/replication.h
index 95122eb45..06a2867b6 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -357,12 +357,13 @@ replicaset_add(uint32_t replica_id, const struct tt_uuid *instance_uuid);
  * \param appliers the array of appliers
  * \param count size of appliers array
  * \param timeout connection timeout
- * \param connect_all if this flag is set, fail unless all
- *                    appliers have successfully connected
+ * \param connect_quorum if this flag is set, fail unless at
+ *                       least replication_connect_quorum
+ *                       appliers have successfully connected.
  */
 void
 replicaset_connect(struct applier **appliers, int count,
-		   double timeout, bool connect_all);
+		   bool connect_quorum);
 
 /**
  * Resume all appliers registered with the replica set.
diff --git a/test/replication/quorum.result b/test/replication/quorum.result
index a55b0d087..265b099b7 100644
--- a/test/replication/quorum.result
+++ b/test/replication/quorum.result
@@ -401,3 +401,52 @@ test_run:cmd("switch default")
 test_run:drop_cluster(SERVERS)
 ---
 ...
+-- Test that quorum is not ignored neither during bootstrap, nor
+-- during reconfiguration.
+box.schema.user.grant('guest', 'replication')
+---
+...
+test_run:cmd('create server replica_quorum with script="replication/replica_quorum.lua"')
+---
+- true
+...
+-- Arguments are: replication_connect_quorum, replication_timeout
+-- replication_connect_timeout.
+-- If replication_connect_quorum was ignored here, the instance
+-- would exit with an error.
+test_run:cmd('start server replica_quorum with wait=True, wait_load=True, args="1 0.05 0.1"')
+---
+- true
+...
+test_run:cmd('switch replica_quorum')
+---
+- true
+...
+-- If replication_connect_quorum was ignored here, the instance
+-- would exit with an error.
+box.cfg{replication={INSTANCE_URI, nonexistent_uri(1)}}
+---
+...
+box.info.id
+---
+- 1
+...
+test_run:cmd('switch default')
+---
+- true
+...
+test_run:cmd('stop server replica_quorum')
+---
+- true
+...
+test_run:cmd('cleanup server replica_quorum')
+---
+- true
+...
+test_run:cmd('delete server replica_quorum')
+---
+- true
+...
+box.schema.user.revoke('guest', 'replication')
+---
+...
diff --git a/test/replication/quorum.test.lua b/test/replication/quorum.test.lua
index 8290f8ea5..5a43275c2 100644
--- a/test/replication/quorum.test.lua
+++ b/test/replication/quorum.test.lua
@@ -150,3 +150,23 @@ box.space.test:select()
 test_run:cmd("switch default")
 -- Cleanup.
 test_run:drop_cluster(SERVERS)
+
+-- Test that quorum is not ignored neither during bootstrap, nor
+-- during reconfiguration.
+box.schema.user.grant('guest', 'replication')
+test_run:cmd('create server replica_quorum with script="replication/replica_quorum.lua"')
+-- Arguments are: replication_connect_quorum, replication_timeout
+-- replication_connect_timeout.
+-- If replication_connect_quorum was ignored here, the instance
+-- would exit with an error.
+test_run:cmd('start server replica_quorum with wait=True, wait_load=True, args="1 0.05 0.1"')
+test_run:cmd('switch replica_quorum')
+-- If replication_connect_quorum was ignored here, the instance
+-- would exit with an error.
+box.cfg{replication={INSTANCE_URI, nonexistent_uri(1)}}
+box.info.id
+test_run:cmd('switch default')
+test_run:cmd('stop server replica_quorum')
+test_run:cmd('cleanup server replica_quorum')
+test_run:cmd('delete server replica_quorum')
+box.schema.user.revoke('guest', 'replication')
diff --git a/test/replication/replica_quorum.lua b/test/replication/replica_quorum.lua
new file mode 100644
index 000000000..dd42b8214
--- /dev/null
+++ b/test/replication/replica_quorum.lua
@@ -0,0 +1,24 @@
+#!/usr/bin/env tarantool
+
+local SOCKET_DIR = require('fio').cwd()
+
+local QUORUM = tonumber(arg[1])
+local TIMEOUT = arg[2] and tonumber(arg[2]) or 0.1
+local CON_TIMEOUT = arg[3] and tonumber(arg[3]) or 30.0
+INSTANCE_URI = SOCKET_DIR .. '/replica_quorum.sock'
+
+function nonexistent_uri(id)
+    return SOCKET_DIR .. '/replica_quorum' .. (1000 + id) .. '.sock'
+end
+
+require('console').listen(os.getenv('ADMIN'))
+
+box.cfg{
+    listen = INSTANCE_URI,
+    replication_timeout = TIMEOUT,
+    replication_connect_timeout = CON_TIMEOUT,
+    replication_connect_quorum = QUORUM,
+    replication = {INSTANCE_URI,
+                   nonexistent_uri(1),
+                   nonexistent_uri(2)}
+}
-- 
2.15.2 (Apple Git-101.1)




More information about the Tarantool-patches mailing list