Tarantool development patches archive
 help / color / mirror / Atom feed
* [PATCH] relay: stop relay on subscribe error
@ 2019-08-02 12:53 Vladimir Davydov
  2019-08-02 17:33 ` Vladimir Davydov
  0 siblings, 1 reply; 2+ messages in thread
From: Vladimir Davydov @ 2019-08-02 12:53 UTC (permalink / raw)
  To: tarantool-patches

If an error occurs between relay_start() and cord_costart() in
relay_subscribe(), the relay status is never reset to STOPPED. As a
result, every further attempt to re-subscribe fails with ER_CFG:
"duplicate connection with the same replica UUID". This can happen, for
example, when the WAL directory is temporarily inaccessible on the
master.

Closes #4399
---
https://github.com/tarantool/tarantool/issues/4399
https://github.com/tarantool/tarantool/commits/dv/gh-4399-fix-relay-stop

 src/box/relay.cc               |  9 ++--
 test/replication/misc.result   | 99 ++++++++++++++++++++++++++++++++++
 test/replication/misc.test.lua | 43 +++++++++++++++
 3 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/src/box/relay.cc b/src/box/relay.cc
index e9f5bdca..efa3373f 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -663,6 +663,11 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
 	}
 
 	relay_start(relay, fd, sync, relay_send_row);
+	auto relay_guard = make_scoped_guard([=] {
+		relay_stop(relay);
+		replica_on_relay_stop(replica);
+	});
+
 	vclock_copy(&relay->local_vclock_at_subscribe, &replicaset.vclock);
 	relay->r = recovery_new(cfg_gets("wal_dir"), false,
 			        replica_clock);
@@ -673,10 +678,6 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
 			      relay_subscribe_f, relay);
 	if (rc == 0)
 		rc = cord_cojoin(&relay->cord);
-
-	relay_stop(relay);
-	replica_on_relay_stop(replica);
-
 	if (rc != 0)
 		diag_raise();
 }
diff --git a/test/replication/misc.result b/test/replication/misc.result
index f896ee4a..9963ea3d 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -642,3 +642,102 @@ test_run:cleanup_cluster()
 box.schema.user.revoke('guest', 'replication')
 ---
 ...
+--
+-- gh-4399 Check that an error reading WAL directory on subscribe
+-- doesn't lead to a permanent replication failure.
+--
+box.schema.user.grant("guest", "replication")
+---
+...
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+---
+- true
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+-- Make the WAL directory inaccessible.
+fio = require('fio')
+---
+...
+path = fio.abspath(box.cfg.wal_dir)
+---
+...
+fio.chmod(path, 0)
+---
+- true
+...
+-- Break replication on timeout.
+-- Wait for error reading the WAL directory on subscribe.
+replication_timeout = box.cfg.replication_timeout
+---
+...
+box.cfg{replication_timeout = 9000}
+---
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+test_run:wait_cond(function()
+    if box.info.replication[1].upstream.status ~= 'disconnected' then
+        return false
+    end
+    return string.match(box.info.replication[1].upstream.message,
+                        'error reading directory') ~= nil
+end);
+---
+- true
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+test_run:cmd("switch default")
+---
+- true
+...
+-- Reset the timeout and restore access to the WAL directory.
+-- Wait for replication to be reestablished.
+box.cfg{replication_timeout = replication_timeout}
+---
+...
+fio.chmod(path, tonumber('777', 8))
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end)
+---
+- true
+...
+test_run:cmd("switch default")
+---
+- true
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd("cleanup server replica")
+---
+- true
+...
+test_run:cmd("delete server replica")
+---
+- true
+...
+test_run:cleanup_cluster()
+---
+...
+box.schema.user.revoke('guest', 'replication')
+---
+...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 4be12ba3..871ece22 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -258,3 +258,46 @@ test_run:cmd("cleanup server replica")
 test_run:cmd("delete server replica")
 test_run:cleanup_cluster()
 box.schema.user.revoke('guest', 'replication')
+
+--
+-- gh-4399 Check that an error reading WAL directory on subscribe
+-- doesn't lead to a permanent replication failure.
+--
+box.schema.user.grant("guest", "replication")
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+test_run:cmd("start server replica")
+
+-- Make the WAL directory inaccessible.
+fio = require('fio')
+path = fio.abspath(box.cfg.wal_dir)
+fio.chmod(path, 0)
+
+-- Break replication on timeout.
+-- Wait for error reading the WAL directory on subscribe.
+replication_timeout = box.cfg.replication_timeout
+box.cfg{replication_timeout = 9000}
+test_run:cmd("switch replica")
+test_run:cmd("setopt delimiter ';'")
+test_run:wait_cond(function()
+    if box.info.replication[1].upstream.status ~= 'disconnected' then
+        return false
+    end
+    return string.match(box.info.replication[1].upstream.message,
+                        'error reading directory') ~= nil
+end);
+test_run:cmd("setopt delimiter ''");
+test_run:cmd("switch default")
+
+-- Reset the timeout and restore access to the WAL directory.
+-- Wait for replication to be reestablished.
+box.cfg{replication_timeout = replication_timeout}
+fio.chmod(path, tonumber('777', 8))
+test_run:cmd("switch replica")
+test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end)
+test_run:cmd("switch default")
+
+test_run:cmd("stop server replica")
+test_run:cmd("cleanup server replica")
+test_run:cmd("delete server replica")
+test_run:cleanup_cluster()
+box.schema.user.revoke('guest', 'replication')
-- 
2.20.1

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] relay: stop relay on subscribe error
  2019-08-02 12:53 [PATCH] relay: stop relay on subscribe error Vladimir Davydov
@ 2019-08-02 17:33 ` Vladimir Davydov
  0 siblings, 0 replies; 2+ messages in thread
From: Vladimir Davydov @ 2019-08-02 17:33 UTC (permalink / raw)
  To: tarantool-patches

Pushed to master, 2.2, 2.1, 1.10 after slightly reworking the test,
which happened to hang on CI:

From 35ef3320089d927db43c5796494fc699a03b5eec Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov.dev@gmail.com>
Date: Fri, 2 Aug 2019 14:59:47 +0300
Subject: [PATCH] relay: stop relay on subscribe error

If an error occurs between relay_start() and cord_costart() in
relay_subscribe(), the relay status is never reset to STOPPED. As a
result, every further attempt to re-subscribe fails with ER_CFG:
"duplicate connection with the same replica UUID". This can happen, for
example, when the WAL directory is temporarily inaccessible on the
master.

Closes #4399

diff --git a/src/box/relay.cc b/src/box/relay.cc
index e9f5bdca..efa3373f 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -663,6 +663,11 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
 	}
 
 	relay_start(relay, fd, sync, relay_send_row);
+	auto relay_guard = make_scoped_guard([=] {
+		relay_stop(relay);
+		replica_on_relay_stop(replica);
+	});
+
 	vclock_copy(&relay->local_vclock_at_subscribe, &replicaset.vclock);
 	relay->r = recovery_new(cfg_gets("wal_dir"), false,
 			        replica_clock);
@@ -673,10 +678,6 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
 			      relay_subscribe_f, relay);
 	if (rc == 0)
 		rc = cord_cojoin(&relay->cord);
-
-	relay_stop(relay);
-	replica_on_relay_stop(replica);
-
 	if (rc != 0)
 		diag_raise();
 }
diff --git a/test/replication/misc.result b/test/replication/misc.result
index f896ee4a..0a57edda 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -642,3 +642,90 @@ test_run:cleanup_cluster()
 box.schema.user.revoke('guest', 'replication')
 ---
 ...
+--
+-- gh-4399 Check that an error reading WAL directory on subscribe
+-- doesn't lead to a permanent replication failure.
+--
+box.schema.user.grant("guest", "replication")
+---
+...
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+---
+- true
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+-- Make the WAL directory inaccessible.
+fio = require('fio')
+---
+...
+path = fio.abspath(box.cfg.wal_dir)
+---
+...
+fio.chmod(path, 0)
+---
+- true
+...
+-- Break replication on timeout.
+replication_timeout = box.cfg.replication_timeout
+---
+...
+box.cfg{replication_timeout = 9000}
+---
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+test_run:wait_cond(function() return box.info.replication[1].upstream.status ~= 'follow' end)
+---
+- true
+...
+require('fiber').sleep(box.cfg.replication_timeout)
+---
+...
+test_run:cmd("switch default")
+---
+- true
+...
+box.cfg{replication_timeout = replication_timeout}
+---
+...
+-- Restore access to the WAL directory.
+-- Wait for replication to be reestablished.
+fio.chmod(path, tonumber('777', 8))
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end)
+---
+- true
+...
+test_run:cmd("switch default")
+---
+- true
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd("cleanup server replica")
+---
+- true
+...
+test_run:cmd("delete server replica")
+---
+- true
+...
+test_run:cleanup_cluster()
+---
+...
+box.schema.user.revoke('guest', 'replication')
+---
+...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 4be12ba3..99e99550 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -258,3 +258,38 @@ test_run:cmd("cleanup server replica")
 test_run:cmd("delete server replica")
 test_run:cleanup_cluster()
 box.schema.user.revoke('guest', 'replication')
+
+--
+-- gh-4399 Check that an error reading WAL directory on subscribe
+-- doesn't lead to a permanent replication failure.
+--
+box.schema.user.grant("guest", "replication")
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+test_run:cmd("start server replica")
+
+-- Make the WAL directory inaccessible.
+fio = require('fio')
+path = fio.abspath(box.cfg.wal_dir)
+fio.chmod(path, 0)
+
+-- Break replication on timeout.
+replication_timeout = box.cfg.replication_timeout
+box.cfg{replication_timeout = 9000}
+test_run:cmd("switch replica")
+test_run:wait_cond(function() return box.info.replication[1].upstream.status ~= 'follow' end)
+require('fiber').sleep(box.cfg.replication_timeout)
+test_run:cmd("switch default")
+box.cfg{replication_timeout = replication_timeout}
+
+-- Restore access to the WAL directory.
+-- Wait for replication to be reestablished.
+fio.chmod(path, tonumber('777', 8))
+test_run:cmd("switch replica")
+test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end)
+test_run:cmd("switch default")
+
+test_run:cmd("stop server replica")
+test_run:cmd("cleanup server replica")
+test_run:cmd("delete server replica")
+test_run:cleanup_cluster()
+box.schema.user.revoke('guest', 'replication')

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-08-02 17:33 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-02 12:53 [PATCH] relay: stop relay on subscribe error Vladimir Davydov
2019-08-02 17:33 ` Vladimir Davydov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox