[PATCH 1/2] Introduce replication_connect_timeout configuration option

Vladimir Davydov vdavydov.dev at gmail.com
Wed Feb 14 20:29:14 MSK 2018


Currently, the maximum time box.cfg() may wait for connections to
replicas to be established is hardcoded to box.cfg.replication_timeout
times 4. As a result, users can't revert to the behavior that preceded
replication_connect_quorum, where box.cfg() blocks until it connects to
all replicas. To fix that, let's introduce a new configuration option,
replication_connect_timeout, which sets the maximum time box.cfg() waits
for connections to replicas to be established. By default, the option is
set to 4 seconds.

Closes #3151
---
 src/box/box.cc                  | 23 +++++++++++++++++++--
 src/box/lua/load_cfg.lua        |  2 ++
 src/box/replication.cc          |  1 +
 src/box/replication.h           | 18 ++++++++---------
 test/app-tap/init_script.result | 45 +++++++++++++++++++++--------------------
 test/box-tap/cfg.test.lua       |  7 ++++++-
 test/box/admin.result           |  2 ++
 test/box/cfg.result             |  4 ++++
 test/replication/quorum.lua     |  1 +
 9 files changed, 68 insertions(+), 35 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index f055788d..fa1eb051 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -386,6 +386,17 @@ box_check_replication_timeout(void)
 	return timeout;
 }
 
+static double
+box_check_replication_connect_timeout(void)
+{
+	double timeout = cfg_getd("replication_connect_timeout");
+	if (timeout <= 0) {
+		tnt_raise(ClientError, ER_CFG, "replication_connect_timeout",
+			  "the value must be greater than 0");
+	}
+	return timeout;
+}
+
 static int
 box_check_replication_connect_quorum(void)
 {
@@ -490,6 +501,7 @@ box_check_config()
 	box_check_replicaset_uuid(&uuid);
 	box_check_replication();
 	box_check_replication_timeout();
+	box_check_replication_connect_timeout();
 	box_check_replication_connect_quorum();
 	box_check_replication_sync_lag();
 	box_check_readahead(cfg_geti("readahead"));
@@ -580,7 +592,7 @@ box_set_replication(void)
 
 	box_check_replication();
 	/* Try to connect to all replicas within the timeout period */
-	box_sync_replication(replication_connect_quorum_timeout(), true);
+	box_sync_replication(replication_connect_timeout, true);
 	/* Follow replica */
 	replicaset_follow();
 }
@@ -592,6 +604,12 @@ box_set_replication_timeout(void)
 }
 
 void
+box_set_replication_connect_timeout(void)
+{
+	replication_connect_timeout = box_check_replication_connect_timeout();
+}
+
+void
 box_set_replication_connect_quorum(void)
 {
 	replication_connect_quorum = box_check_replication_connect_quorum();
@@ -1678,6 +1696,7 @@ box_cfg_xc(void)
 	box_set_checkpoint_count();
 	box_set_too_long_threshold();
 	box_set_replication_timeout();
+	box_set_replication_connect_timeout();
 	box_set_replication_connect_quorum();
 	replication_sync_lag = box_check_replication_sync_lag();
 	xstream_create(&join_stream, apply_initial_join_row);
@@ -1803,7 +1822,7 @@ box_cfg_xc(void)
 		title("orphan");
 
 		/* Wait for the cluster to start up */
-		box_sync_replication(replication_connect_quorum_timeout(), false);
+		box_sync_replication(replication_connect_timeout, false);
 	} else {
 		if (!tt_uuid_is_nil(&instance_uuid))
 			INSTANCE_UUID = instance_uuid;
diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua
index 4ac04083..891d819d 100644
--- a/src/box/lua/load_cfg.lua
+++ b/src/box/lua/load_cfg.lua
@@ -57,6 +57,7 @@ local default_cfg = {
     worker_pool_threads = 4,
     replication_timeout = 1,
     replication_sync_lag = 10,
+    replication_connect_timeout = 4,
     replication_connect_quorum = nil, -- connect all
 }
 
@@ -112,6 +113,7 @@ local template_cfg = {
     worker_pool_threads = 'number',
     replication_timeout = 'number',
     replication_sync_lag = 'number',
+    replication_connect_timeout = 'number',
     replication_connect_quorum = 'number',
 }
 
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 35efd8ad..319ea57e 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -47,6 +47,7 @@ struct tt_uuid INSTANCE_UUID;
 struct tt_uuid REPLICASET_UUID;
 
 double replication_timeout = 1.0; /* seconds */
+double replication_connect_timeout = 4.0; /* seconds */
 int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL;
 double replication_sync_lag = 10.0; /* seconds */
 
diff --git a/src/box/replication.h b/src/box/replication.h
index a7595f61..f964eed0 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -104,6 +104,14 @@ static const int REPLICATION_CONNECT_QUORUM_ALL = INT_MAX;
 extern double replication_timeout;
 
 /**
+ * Maximal time box.cfg() may wait for connections to all configured
+ * replicas to be established. If box.cfg() fails to connect to all
+ * replicas within the timeout, it will either leave the instance in
+ * the orphan mode (recovery) or fail (bootstrap, reconfiguration).
+ */
+extern double replication_connect_timeout;
+
+/**
  * Minimal number of replicas to sync for this instance to switch
  * to the write mode. If set to REPLICATION_CONNECT_QUORUM_ALL,
  * wait for all configured masters.
@@ -136,16 +144,6 @@ replication_disconnect_timeout(void)
 	return replication_timeout * 4;
 }
 
-/**
- * Fail box.cfg() if the quorum hasn't been assembled within
- * the given period.
- */
-static inline double
-replication_connect_quorum_timeout(void)
-{
-	return replication_reconnect_timeout() * 4;
-}
-
 void
 replication_init(void);
 
diff --git a/test/app-tap/init_script.result b/test/app-tap/init_script.result
index 53f87a54..80153e38 100644
--- a/test/app-tap/init_script.result
+++ b/test/app-tap/init_script.result
@@ -21,28 +21,29 @@ box.cfg
 16	pid_file:box.pid
 17	read_only:false
 18	readahead:16320
-19	replication_sync_lag:10
-20	replication_timeout:1
-21	rows_per_wal:500000
-22	slab_alloc_factor:1.05
-23	too_long_threshold:0.5
-24	vinyl_bloom_fpr:0.05
-25	vinyl_cache:134217728
-26	vinyl_dir:.
-27	vinyl_max_tuple_size:1048576
-28	vinyl_memory:134217728
-29	vinyl_page_size:8192
-30	vinyl_range_size:1073741824
-31	vinyl_read_threads:1
-32	vinyl_run_count_per_level:2
-33	vinyl_run_size_ratio:3.5
-34	vinyl_timeout:60
-35	vinyl_write_threads:2
-36	wal_dir:.
-37	wal_dir_rescan_delay:2
-38	wal_max_size:268435456
-39	wal_mode:write
-40	worker_pool_threads:4
+19	replication_connect_timeout:4
+20	replication_sync_lag:10
+21	replication_timeout:1
+22	rows_per_wal:500000
+23	slab_alloc_factor:1.05
+24	too_long_threshold:0.5
+25	vinyl_bloom_fpr:0.05
+26	vinyl_cache:134217728
+27	vinyl_dir:.
+28	vinyl_max_tuple_size:1048576
+29	vinyl_memory:134217728
+30	vinyl_page_size:8192
+31	vinyl_range_size:1073741824
+32	vinyl_read_threads:1
+33	vinyl_run_count_per_level:2
+34	vinyl_run_size_ratio:3.5
+35	vinyl_timeout:60
+36	vinyl_write_threads:2
+37	wal_dir:.
+38	wal_dir_rescan_delay:2
+39	wal_max_size:268435456
+40	wal_mode:write
+41	worker_pool_threads:4
 --
 -- Test insert from detached fiber
 --
diff --git a/test/box-tap/cfg.test.lua b/test/box-tap/cfg.test.lua
index 67991ecf..90dc04bd 100755
--- a/test/box-tap/cfg.test.lua
+++ b/test/box-tap/cfg.test.lua
@@ -6,7 +6,7 @@ local socket = require('socket')
 local fio = require('fio')
 local uuid = require('uuid')
 local msgpack = require('msgpack')
-test:plan(80)
+test:plan(85)
 
 --------------------------------------------------------------------------------
 -- Invalid values
@@ -27,6 +27,11 @@ invalid('memtx_min_tuple_size', 1000000000)
 invalid('replication', '//guest@localhost:3301')
 invalid('replication_timeout', -1)
 invalid('replication_timeout', 0)
+invalid('replication_sync_lag', -1)
+invalid('replication_sync_lag', 0)
+invalid('replication_connect_timeout', -1)
+invalid('replication_connect_timeout', 0)
+invalid('replication_connect_quorum', -1)
 invalid('wal_mode', 'invalid')
 invalid('rows_per_wal', -1)
 invalid('listen', '//!')
diff --git a/test/box/admin.result b/test/box/admin.result
index 13e599eb..7a3e937b 100644
--- a/test/box/admin.result
+++ b/test/box/admin.result
@@ -54,6 +54,8 @@ cfg_filter(box.cfg)
     - false
   - - readahead
     - 16320
+  - - replication_connect_timeout
+    - 4
   - - replication_sync_lag
     - 10
   - - replication_timeout
diff --git a/test/box/cfg.result b/test/box/cfg.result
index 9f0ad595..67539cd1 100644
--- a/test/box/cfg.result
+++ b/test/box/cfg.result
@@ -50,6 +50,8 @@ cfg_filter(box.cfg)
     - false
   - - readahead
     - 16320
+  - - replication_connect_timeout
+    - 4
   - - replication_sync_lag
     - 10
   - - replication_timeout
@@ -137,6 +139,8 @@ cfg_filter(box.cfg)
     - false
   - - readahead
     - 16320
+  - - replication_connect_timeout
+    - 4
   - - replication_sync_lag
     - 10
   - - replication_timeout
diff --git a/test/replication/quorum.lua b/test/replication/quorum.lua
index 5138425a..9c7bf5c9 100644
--- a/test/replication/quorum.lua
+++ b/test/replication/quorum.lua
@@ -16,6 +16,7 @@ box.cfg({
     listen = instance_uri(INSTANCE_ID);
     replication_timeout = 0.05;
     replication_sync_lag = 0.01;
+    replication_connect_timeout = 0.1;
     replication_connect_quorum = 3;
     replication = {
         instance_uri(1);
-- 
2.11.0




More information about the Tarantool-patches mailing list