From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH] relay: stop relay on subscribe error Date: Fri, 2 Aug 2019 15:53:14 +0300 Message-Id: <1b90df19ae31b3fdf4be5b9c3d23f6499148f8aa.1564749904.git.vdavydov.dev@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit To: tarantool-patches@freelists.org List-ID: If an error occurs between relay_start() and cord_costart() in relay_subscribe(), the relay status is never reset to STOPPED, because relay_stop() and replica_on_relay_stop() are only called after cord_cojoin() returns. As a result, any further attempt to re-subscribe fails with ER_CFG: duplicate connection with the same replica UUID. This may happen, for example, if the WAL directory is temporarily inaccessible on the master. Fix this by stopping the relay from a scoped guard installed right after relay_start(), so that the cleanup runs on every exit path. Closes #4399 --- https://github.com/tarantool/tarantool/issues/4399 https://github.com/tarantool/tarantool/commits/dv/gh-4399-fix-relay-stop src/box/relay.cc | 9 ++-- test/replication/misc.result | 99 ++++++++++++++++++++++++++++++++++ test/replication/misc.test.lua | 43 +++++++++++++++ 3 files changed, 147 insertions(+), 4 deletions(-) diff --git a/src/box/relay.cc b/src/box/relay.cc index e9f5bdca..efa3373f 100644 --- a/src/box/relay.cc +++ b/src/box/relay.cc @@ -663,6 +663,11 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync, } relay_start(relay, fd, sync, relay_send_row); + auto relay_guard = make_scoped_guard([=] { + relay_stop(relay); + replica_on_relay_stop(replica); + }); + vclock_copy(&relay->local_vclock_at_subscribe, &replicaset.vclock); relay->r = recovery_new(cfg_gets("wal_dir"), false, replica_clock); @@ -673,10 +678,6 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync, relay_subscribe_f, relay); if (rc == 0) rc = cord_cojoin(&relay->cord); - - relay_stop(relay); - replica_on_relay_stop(replica); - if (rc != 0) diag_raise(); } diff --git a/test/replication/misc.result b/test/replication/misc.result index f896ee4a..9963ea3d 100644 --- a/test/replication/misc.result +++ b/test/replication/misc.result @@ -642,3 +642,102 @@ 
test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') --- ... +-- +-- gh-4399 Check that an error reading WAL directory on subscribe +-- doesn't lead to a permanent replication failure. +-- +box.schema.user.grant("guest", "replication") +--- +... +test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +--- +- true +... +test_run:cmd("start server replica") +--- +- true +... +-- Make the WAL directory inaccessible. +fio = require('fio') +--- +... +path = fio.abspath(box.cfg.wal_dir) +--- +... +fio.chmod(path, 0) +--- +- true +... +-- Break replication on timeout. +-- Wait for error reading the WAL directory on subscribe. +replication_timeout = box.cfg.replication_timeout +--- +... +box.cfg{replication_timeout = 9000} +--- +... +test_run:cmd("switch replica") +--- +- true +... +test_run:cmd("setopt delimiter ';'") +--- +- true +... +test_run:wait_cond(function() + if box.info.replication[1].upstream.status ~= 'disconnected' then + return false + end + return string.match(box.info.replication[1].upstream.message, + 'error reading directory') ~= nil +end); +--- +- true +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +test_run:cmd("switch default") +--- +- true +... +-- Reset the timeout and restore access to the WAL directory. +-- Wait for replication to be reestablished. +box.cfg{replication_timeout = replication_timeout} +--- +... +fio.chmod(path, tonumber('777', 8)) +--- +- true +... +test_run:cmd("switch replica") +--- +- true +... +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) +--- +- true +... +test_run:cmd("switch default") +--- +- true +... +test_run:cmd("stop server replica") +--- +- true +... +test_run:cmd("cleanup server replica") +--- +- true +... +test_run:cmd("delete server replica") +--- +- true +... +test_run:cleanup_cluster() +--- +... +box.schema.user.revoke('guest', 'replication') +--- +... 
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua index 4be12ba3..871ece22 100644 --- a/test/replication/misc.test.lua +++ b/test/replication/misc.test.lua @@ -258,3 +258,46 @@ test_run:cmd("cleanup server replica") test_run:cmd("delete server replica") test_run:cleanup_cluster() box.schema.user.revoke('guest', 'replication') + +-- +-- gh-4399 Check that an error reading WAL directory on subscribe +-- doesn't lead to a permanent replication failure. +-- +box.schema.user.grant("guest", "replication") +test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +test_run:cmd("start server replica") + +-- Make the WAL directory inaccessible. +fio = require('fio') +path = fio.abspath(box.cfg.wal_dir) +fio.chmod(path, 0) + +-- Break replication on timeout. +-- Wait for error reading the WAL directory on subscribe. +replication_timeout = box.cfg.replication_timeout +box.cfg{replication_timeout = 9000} +test_run:cmd("switch replica") +test_run:cmd("setopt delimiter ';'") +test_run:wait_cond(function() + if box.info.replication[1].upstream.status ~= 'disconnected' then + return false + end + return string.match(box.info.replication[1].upstream.message, + 'error reading directory') ~= nil +end); +test_run:cmd("setopt delimiter ''"); +test_run:cmd("switch default") + +-- Reset the timeout and restore access to the WAL directory. +-- Wait for replication to be reestablished. +box.cfg{replication_timeout = replication_timeout} +fio.chmod(path, tonumber('777', 8)) +test_run:cmd("switch replica") +test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end) +test_run:cmd("switch default") + +test_run:cmd("stop server replica") +test_run:cmd("cleanup server replica") +test_run:cmd("delete server replica") +test_run:cleanup_cluster() +box.schema.user.revoke('guest', 'replication') -- 2.20.1