[PATCH 1/2] Introduce replication_connect_timeout configuration option
Vladimir Davydov
vdavydov.dev at gmail.com
Wed Feb 14 20:29:14 MSK 2018
Currently, the maximum time box.cfg() may wait for connections to replicas
to be established is hardcoded to box.cfg.replication_timeout times 4. As
a result, users can't revert to the pre-replication_connect_quorum
behavior, where box.cfg() blocks until it connects to all replicas. To fix
that, let's introduce a new configuration option,
replication_connect_timeout, which sets the maximum time box.cfg() waits
for those connections. By default, the option is set to 4 seconds.
Closes #3151
---
src/box/box.cc | 23 +++++++++++++++++++--
src/box/lua/load_cfg.lua | 2 ++
src/box/replication.cc | 1 +
src/box/replication.h | 18 ++++++++---------
test/app-tap/init_script.result | 45 +++++++++++++++++++++--------------------
test/box-tap/cfg.test.lua | 7 ++++++-
test/box/admin.result | 2 ++
test/box/cfg.result | 4 ++++
test/replication/quorum.lua | 1 +
9 files changed, 68 insertions(+), 35 deletions(-)
diff --git a/src/box/box.cc b/src/box/box.cc
index f055788d..fa1eb051 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -386,6 +386,17 @@ box_check_replication_timeout(void)
return timeout;
}
+static double
+box_check_replication_connect_timeout(void)
+{
+ double timeout = cfg_getd("replication_connect_timeout");
+ if (timeout <= 0) {
+ tnt_raise(ClientError, ER_CFG, "replication_connect_timeout",
+ "the value must be greater than 0");
+ }
+ return timeout;
+}
+
static int
box_check_replication_connect_quorum(void)
{
@@ -490,6 +501,7 @@ box_check_config()
box_check_replicaset_uuid(&uuid);
box_check_replication();
box_check_replication_timeout();
+ box_check_replication_connect_timeout();
box_check_replication_connect_quorum();
box_check_replication_sync_lag();
box_check_readahead(cfg_geti("readahead"));
@@ -580,7 +592,7 @@ box_set_replication(void)
box_check_replication();
/* Try to connect to all replicas within the timeout period */
- box_sync_replication(replication_connect_quorum_timeout(), true);
+ box_sync_replication(replication_connect_timeout, true);
/* Follow replica */
replicaset_follow();
}
@@ -592,6 +604,12 @@ box_set_replication_timeout(void)
}
void
+box_set_replication_connect_timeout(void)
+{
+ replication_connect_timeout = box_check_replication_connect_timeout();
+}
+
+void
box_set_replication_connect_quorum(void)
{
replication_connect_quorum = box_check_replication_connect_quorum();
@@ -1678,6 +1696,7 @@ box_cfg_xc(void)
box_set_checkpoint_count();
box_set_too_long_threshold();
box_set_replication_timeout();
+ box_set_replication_connect_timeout();
box_set_replication_connect_quorum();
replication_sync_lag = box_check_replication_sync_lag();
xstream_create(&join_stream, apply_initial_join_row);
@@ -1803,7 +1822,7 @@ box_cfg_xc(void)
title("orphan");
/* Wait for the cluster to start up */
- box_sync_replication(replication_connect_quorum_timeout(), false);
+ box_sync_replication(replication_connect_timeout, false);
} else {
if (!tt_uuid_is_nil(&instance_uuid))
INSTANCE_UUID = instance_uuid;
diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua
index 4ac04083..891d819d 100644
--- a/src/box/lua/load_cfg.lua
+++ b/src/box/lua/load_cfg.lua
@@ -57,6 +57,7 @@ local default_cfg = {
worker_pool_threads = 4,
replication_timeout = 1,
replication_sync_lag = 10,
+ replication_connect_timeout = 4,
replication_connect_quorum = nil, -- connect all
}
@@ -112,6 +113,7 @@ local template_cfg = {
worker_pool_threads = 'number',
replication_timeout = 'number',
replication_sync_lag = 'number',
+ replication_connect_timeout = 'number',
replication_connect_quorum = 'number',
}
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 35efd8ad..319ea57e 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -47,6 +47,7 @@ struct tt_uuid INSTANCE_UUID;
struct tt_uuid REPLICASET_UUID;
double replication_timeout = 1.0; /* seconds */
+double replication_connect_timeout = 4.0; /* seconds */
int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL;
double replication_sync_lag = 10.0; /* seconds */
diff --git a/src/box/replication.h b/src/box/replication.h
index a7595f61..f964eed0 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -104,6 +104,14 @@ static const int REPLICATION_CONNECT_QUORUM_ALL = INT_MAX;
extern double replication_timeout;
/**
+ * Maximal time box.cfg() may wait for connections to all configured
+ * replicas to be established. If box.cfg() fails to connect to all
+ * replicas within the timeout, it will either leave the instance in
+ * the orphan mode (recovery) or fail (bootstrap, reconfiguration).
+ */
+extern double replication_connect_timeout;
+
+/**
* Minimal number of replicas to sync for this instance to switch
* to the write mode. If set to REPLICATION_CONNECT_QUORUM_ALL,
* wait for all configured masters.
@@ -136,16 +144,6 @@ replication_disconnect_timeout(void)
return replication_timeout * 4;
}
-/**
- * Fail box.cfg() if the quorum hasn't been assembled within
- * the given period.
- */
-static inline double
-replication_connect_quorum_timeout(void)
-{
- return replication_reconnect_timeout() * 4;
-}
-
void
replication_init(void);
diff --git a/test/app-tap/init_script.result b/test/app-tap/init_script.result
index 53f87a54..80153e38 100644
--- a/test/app-tap/init_script.result
+++ b/test/app-tap/init_script.result
@@ -21,28 +21,29 @@ box.cfg
16 pid_file:box.pid
17 read_only:false
18 readahead:16320
-19 replication_sync_lag:10
-20 replication_timeout:1
-21 rows_per_wal:500000
-22 slab_alloc_factor:1.05
-23 too_long_threshold:0.5
-24 vinyl_bloom_fpr:0.05
-25 vinyl_cache:134217728
-26 vinyl_dir:.
-27 vinyl_max_tuple_size:1048576
-28 vinyl_memory:134217728
-29 vinyl_page_size:8192
-30 vinyl_range_size:1073741824
-31 vinyl_read_threads:1
-32 vinyl_run_count_per_level:2
-33 vinyl_run_size_ratio:3.5
-34 vinyl_timeout:60
-35 vinyl_write_threads:2
-36 wal_dir:.
-37 wal_dir_rescan_delay:2
-38 wal_max_size:268435456
-39 wal_mode:write
-40 worker_pool_threads:4
+19 replication_connect_timeout:4
+20 replication_sync_lag:10
+21 replication_timeout:1
+22 rows_per_wal:500000
+23 slab_alloc_factor:1.05
+24 too_long_threshold:0.5
+25 vinyl_bloom_fpr:0.05
+26 vinyl_cache:134217728
+27 vinyl_dir:.
+28 vinyl_max_tuple_size:1048576
+29 vinyl_memory:134217728
+30 vinyl_page_size:8192
+31 vinyl_range_size:1073741824
+32 vinyl_read_threads:1
+33 vinyl_run_count_per_level:2
+34 vinyl_run_size_ratio:3.5
+35 vinyl_timeout:60
+36 vinyl_write_threads:2
+37 wal_dir:.
+38 wal_dir_rescan_delay:2
+39 wal_max_size:268435456
+40 wal_mode:write
+41 worker_pool_threads:4
--
-- Test insert from detached fiber
--
diff --git a/test/box-tap/cfg.test.lua b/test/box-tap/cfg.test.lua
index 67991ecf..90dc04bd 100755
--- a/test/box-tap/cfg.test.lua
+++ b/test/box-tap/cfg.test.lua
@@ -6,7 +6,7 @@ local socket = require('socket')
local fio = require('fio')
local uuid = require('uuid')
local msgpack = require('msgpack')
-test:plan(80)
+test:plan(85)
--------------------------------------------------------------------------------
-- Invalid values
@@ -27,6 +27,11 @@ invalid('memtx_min_tuple_size', 1000000000)
invalid('replication', '//guest at localhost:3301')
invalid('replication_timeout', -1)
invalid('replication_timeout', 0)
+invalid('replication_sync_lag', -1)
+invalid('replication_sync_lag', 0)
+invalid('replication_connect_timeout', -1)
+invalid('replication_connect_timeout', 0)
+invalid('replication_connect_quorum', -1)
invalid('wal_mode', 'invalid')
invalid('rows_per_wal', -1)
invalid('listen', '//!')
diff --git a/test/box/admin.result b/test/box/admin.result
index 13e599eb..7a3e937b 100644
--- a/test/box/admin.result
+++ b/test/box/admin.result
@@ -54,6 +54,8 @@ cfg_filter(box.cfg)
- false
- - readahead
- 16320
+ - - replication_connect_timeout
+ - 4
- - replication_sync_lag
- 10
- - replication_timeout
diff --git a/test/box/cfg.result b/test/box/cfg.result
index 9f0ad595..67539cd1 100644
--- a/test/box/cfg.result
+++ b/test/box/cfg.result
@@ -50,6 +50,8 @@ cfg_filter(box.cfg)
- false
- - readahead
- 16320
+ - - replication_connect_timeout
+ - 4
- - replication_sync_lag
- 10
- - replication_timeout
@@ -137,6 +139,8 @@ cfg_filter(box.cfg)
- false
- - readahead
- 16320
+ - - replication_connect_timeout
+ - 4
- - replication_sync_lag
- 10
- - replication_timeout
diff --git a/test/replication/quorum.lua b/test/replication/quorum.lua
index 5138425a..9c7bf5c9 100644
--- a/test/replication/quorum.lua
+++ b/test/replication/quorum.lua
@@ -16,6 +16,7 @@ box.cfg({
listen = instance_uri(INSTANCE_ID);
replication_timeout = 0.05;
replication_sync_lag = 0.01;
+ replication_connect_timeout = 0.1;
replication_connect_quorum = 3;
replication = {
instance_uri(1);
--
2.11.0
More information about the Tarantool-patches
mailing list