From: Olga Arkhangelskaia <krishtal.olja@gmail.com> To: tarantool-patches@freelists.org Cc: Olga Arkhangelskaia <krishtal.olja@gmail.com> Subject: [tarantool-patches] [PATCH 2/3] box: add replication_sync_lag_timeout Date: Wed, 29 Aug 2018 21:56:41 +0300 [thread overview] Message-ID: <20180829185642.49479-2-krishtal.olja@gmail.com> (raw) In-Reply-To: <20180829185642.49479-1-krishtal.olja@gmail.com> In scope of gh-3427 we need a timeout in case the replicaset waits for synchronization for too long, or even forever. Default value is TIMEOUT_INFINITY. @TarantoolBot document Title: Introduce new option replication_sync_lag_timeout. After initial bootstrap or after replication configuration changes we need to sync up with the replication quorum. Sometimes sync can take too long, or replication_sync_lag can be smaller than network latency, and the replica will get stuck in a sync loop that can't be cancelled. To avoid such situations, replication_sync_lag_timeout can be used. When the time set in replication_sync_lag_timeout has passed, the replica enters orphan state. Can be set dynamically. Default value is TIMEOUT_INFINITY. 
Closes #3674 --- https://github.com/tarantool/tarantool/issues/3647 https://github.com/tarantool/tarantool/tree/OKriw/gh-3427-replication-no-sync-1.9 src/box/box.cc | 19 +++++++++++++++++++ src/box/box.h | 1 + src/box/lua/cfg.cc | 12 ++++++++++++ src/box/lua/load_cfg.lua | 4 ++++ src/box/replication.cc | 14 ++++++++++---- src/box/replication.h | 6 ++++++ test/box-tap/cfg.test.lua | 9 ++++++++- test/box/admin.result | 2 ++ test/box/cfg.result | 4 ++++ 9 files changed, 66 insertions(+), 5 deletions(-) diff --git a/src/box/box.cc b/src/box/box.cc index 7155ad085..0f8364ebc 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -420,6 +420,17 @@ box_check_replication_sync_lag(void) return lag; } +static double +box_check_replication_sync_lag_timeout(void) +{ + double timeout = cfg_getd_default("replication_sync_lag_timeout", TIMEOUT_INFINITY); + if (timeout <= 0) { + tnt_raise(ClientError, ER_CFG, "replication_sync_lag_timeout", + "the value must be greater than 0"); + } + return timeout; +} + static void box_check_instance_uuid(struct tt_uuid *uuid) { @@ -546,6 +557,7 @@ box_check_config() box_check_replication_connect_timeout(); box_check_replication_connect_quorum(); box_check_replication_sync_lag(); + box_check_replication_sync_lag_timeout(); box_check_readahead(cfg_geti("readahead")); box_check_checkpoint_count(cfg_geti("checkpoint_count")); box_check_wal_max_rows(cfg_geti64("rows_per_wal")); @@ -662,6 +674,12 @@ box_set_replication_sync_lag(void) replication_sync_lag = box_check_replication_sync_lag(); } +void +box_set_replication_sync_lag_timeout(void) +{ + replication_sync_lag_timeout = box_check_replication_sync_lag_timeout(); +} + void box_bind(void) { @@ -1754,6 +1772,7 @@ box_cfg_xc(void) box_set_replication_connect_timeout(); box_set_replication_connect_quorum(); box_set_replication_sync_lag(); + box_set_replication_sync_lag_timeout(); xstream_create(&join_stream, apply_initial_join_row); xstream_create(&subscribe_stream, apply_row); diff --git 
a/src/box/box.h b/src/box/box.h index 3090fdcdb..f30d0e4cf 100644 --- a/src/box/box.h +++ b/src/box/box.h @@ -177,6 +177,7 @@ void box_set_replication_timeout(void); void box_set_replication_connect_timeout(void); void box_set_replication_connect_quorum(void); void box_set_replication_sync_lag(void); +void box_set_replication_sync_lag_timeout(void); extern "C" { #endif /* defined(__cplusplus) */ diff --git a/src/box/lua/cfg.cc b/src/box/lua/cfg.cc index 5442723b5..bda36a2b9 100644 --- a/src/box/lua/cfg.cc +++ b/src/box/lua/cfg.cc @@ -273,6 +273,17 @@ lbox_cfg_set_replication_sync_lag(struct lua_State *L) return 0; } +static int +lbox_cfg_set_replication_sync_lag_timeout(struct lua_State *L) +{ + try { + box_set_replication_sync_lag_timeout(); + } catch (Exception *) { + luaT_error(L); + } + return 0; +} + void box_lua_cfg_init(struct lua_State *L) { @@ -298,6 +309,7 @@ box_lua_cfg_init(struct lua_State *L) {"cfg_set_replication_connect_timeout", lbox_cfg_set_replication_connect_timeout}, {"cfg_set_replication_connect_quorum", lbox_cfg_set_replication_connect_quorum}, {"cfg_set_replication_sync_lag", lbox_cfg_set_replication_sync_lag}, + {"cfg_set_replication_sync_lag_timeout", lbox_cfg_set_replication_sync_lag_timeout}, {NULL, NULL} }; diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua index f803d8987..f77a86cdd 100644 --- a/src/box/lua/load_cfg.lua +++ b/src/box/lua/load_cfg.lua @@ -72,6 +72,7 @@ local default_cfg = { worker_pool_threads = 4, replication_timeout = 1, replication_sync_lag = 10, + replication_sync_lag_timeout = 500 * 365 * 86400, replication_connect_timeout = 30, replication_connect_quorum = nil, -- connect all } @@ -128,6 +129,7 @@ local template_cfg = { worker_pool_threads = 'number', replication_timeout = 'number', replication_sync_lag = 'number', + replication_sync_lag_timeout = 'number', replication_connect_timeout = 'number', replication_connect_quorum = 'number', } @@ -200,6 +202,7 @@ local dynamic_cfg = { 
replication_connect_timeout = private.cfg_set_replication_connect_timeout, replication_connect_quorum = private.cfg_set_replication_connect_quorum, replication_sync_lag = private.cfg_set_replication_sync_lag, + replication_sync_lag_timeout = private.cfg_set_replication_sync_lag_timeout, instance_uuid = function() if box.cfg.instance_uuid ~= box.info.uuid then box.error(box.error.CFG, 'instance_uuid', @@ -222,6 +225,7 @@ local dynamic_cfg_skip_at_load = { replication_connect_timeout = true, replication_connect_quorum = true, replication_sync_lag = true, + replication_sync_lag_timeout = true, wal_dir_rescan_delay = true, custom_proc_title = true, force_recovery = true, diff --git a/src/box/replication.cc b/src/box/replication.cc index 861ce34ea..731b05faf 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -49,7 +49,7 @@ double replication_timeout = 1.0; /* seconds */ double replication_connect_timeout = 30.0; /* seconds */ int replication_connect_quorum = REPLICATION_CONNECT_QUORUM_ALL; double replication_sync_lag = 10.0; /* seconds */ - +double replication_sync_lag_timeout = TIMEOUT_INFINITY; struct replicaset replicaset; static int @@ -673,12 +673,18 @@ replicaset_sync(void) /* * Wait until all connected replicas synchronize up to - * replication_sync_lag + * replication_sync_lag or return on replication_sync_lag_timeout */ while (replicaset.applier.synced < quorum && replicaset.applier.connected + - replicaset.applier.loading >= quorum) - fiber_cond_wait(&replicaset.applier.cond); + replicaset.applier.loading >= quorum) { + if (fiber_cond_wait_timeout(&replicaset.applier.cond, + replication_sync_lag_timeout) != 0) { + say_crit("replication_sync_lag_timeout fired, entering orphan mode"); + return; + } + + } if (replicaset.applier.synced < quorum) { /* diff --git a/src/box/replication.h b/src/box/replication.h index 06a2867b6..71c17dc8e 100644 --- a/src/box/replication.h +++ b/src/box/replication.h @@ -126,6 +126,12 @@ extern int 
replication_connect_quorum; */ extern double replication_sync_lag; +/** + * Time to wait before enter orphan state in case of unsuccessful + * synchronization. + */ +extern double replication_sync_lag_timeout; + /** * Wait for the given period of time before trying to reconnect * to a master. diff --git a/test/box-tap/cfg.test.lua b/test/box-tap/cfg.test.lua index d315346de..dd883a020 100755 --- a/test/box-tap/cfg.test.lua +++ b/test/box-tap/cfg.test.lua @@ -6,7 +6,7 @@ local socket = require('socket') local fio = require('fio') local uuid = require('uuid') local msgpack = require('msgpack') -test:plan(91) +test:plan(94) -------------------------------------------------------------------------------- -- Invalid values @@ -29,6 +29,8 @@ invalid('replication_timeout', -1) invalid('replication_timeout', 0) invalid('replication_sync_lag', -1) invalid('replication_sync_lag', 0) +invalid('replication_sync_lag_timeout', -1) +invalid('replication_sync_lag_timeout', 0) invalid('replication_connect_timeout', -1) invalid('replication_connect_timeout', 0) invalid('replication_connect_quorum', -1) @@ -100,6 +102,11 @@ status, result = pcall(box.cfg, {replication_sync_lag = 1}) test:ok(status, "dynamic replication_sync_lag") pcall(box.cfg, {repliction_sync_lag = lag}) +timeout = box.cfg.replication_sync_lag_timeout +status, result = pcall(box.cfg, {replication_sync_lag_timeout = 10}) +test:ok(status, "dynamic replication_sync_lag_timeout") +pcall(box.cfg, {repliction_sync_lag_timeout = timeout}) + -------------------------------------------------------------------------------- -- gh-534: Segmentation fault after two bad wal_mode settings -------------------------------------------------------------------------------- diff --git a/test/box/admin.result b/test/box/admin.result index c3e318a6a..d7205b088 100644 --- a/test/box/admin.result +++ b/test/box/admin.result @@ -58,6 +58,8 @@ cfg_filter(box.cfg) - 30 - - replication_sync_lag - 10 + - - replication_sync_lag_timeout + - 
15768000000 - - replication_timeout - 1 - - rows_per_wal diff --git a/test/box/cfg.result b/test/box/cfg.result index a2df83310..20a8e0384 100644 --- a/test/box/cfg.result +++ b/test/box/cfg.result @@ -54,6 +54,8 @@ cfg_filter(box.cfg) - 30 - - replication_sync_lag - 10 + - - replication_sync_lag_timeout + - 15768000000 - - replication_timeout - 1 - - rows_per_wal @@ -143,6 +145,8 @@ cfg_filter(box.cfg) - 30 - - replication_sync_lag - 10 + - - replication_sync_lag_timeout + - 15768000000 - - replication_timeout - 1 - - rows_per_wal -- 2.14.3 (Apple Git-98)
next prev parent reply other threads:[~2018-08-29 18:57 UTC|newest] Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top 2018-08-29 18:56 [tarantool-patches] [PATCH v2 1/3] box: make replication_sync_lag option dynamic Olga Arkhangelskaia 2018-08-29 18:56 ` Olga Arkhangelskaia [this message] 2018-08-30 10:02 ` [tarantool-patches] [PATCH 2/3] box: add replication_sync_lag_timeout Vladimir Davydov 2018-08-29 18:56 ` [tarantool-patches] [PATCH v5 3/3] box: adds replication sync after cfg. update Olga Arkhangelskaia 2018-08-30 10:11 ` Vladimir Davydov 2018-08-30 9:48 ` [tarantool-patches] [PATCH v2 1/3] box: make replication_sync_lag option dynamic Vladimir Davydov
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20180829185642.49479-2-krishtal.olja@gmail.com \ --to=krishtal.olja@gmail.com \ --cc=tarantool-patches@freelists.org \ --subject='Re: [tarantool-patches] [PATCH 2/3] box: add replication_sync_lag_timeout' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox