[PATCH] replication: Break connection on timeout

Konstantin Belyavskiy k.belyavskiy at tarantool.org
Tue Feb 13 15:13:01 MSK 2018


In a replication setup, if one of the instances is powered off, this is not
detected by the others and the connection hangs. Live instances keep showing
the 'follow' state.
Add a read timeout to solve this issue. This is safe because the applier and
the relay both send messages every replication_timeout, so if we read nothing
within the timeout we can assume there is a problem with the connection.
Use replication_disconnect_timeout, which is replication_timeout * 4 for now.

Closes #3025
---
 branch: gh-3025-break-connection-timeout
 src/box/applier.cc               |  7 ++++-
 test/replication/errinj.result   | 63 +++++++++++++++++++++++++++++++++++++---
 test/replication/errinj.test.lua | 27 ++++++++++++++++-
 3 files changed, 91 insertions(+), 6 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index f0073bada..106a728cd 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -418,7 +418,12 @@ applier_subscribe(struct applier *applier)
 			applier_set_state(applier, APPLIER_FOLLOW);
 		}
 
-		coio_read_xrow(coio, ibuf, &row);
+		if (applier->version_id < version_id(1, 7, 7))
+			coio_read_xrow(coio, ibuf, &row);
+		else {
+			double timeout = replication_disconnect_timeout();
+			coio_read_xrow_timeout_xc(coio, ibuf, &row, timeout);
+		}
 
 		if (iproto_type_is_error(row.type))
 			xrow_decode_error_xc(&row);  /* error */
diff --git a/test/replication/errinj.result b/test/replication/errinj.result
index d1f1dbe91..572071a03 100644
--- a/test/replication/errinj.result
+++ b/test/replication/errinj.result
@@ -407,6 +407,65 @@ test_run:cmd("cleanup server replica")
 ---
 - true
 ...
+box.cfg{replication_timeout = 0.01}
+---
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+box.cfg{replication_timeout = 0.01}
+---
+...
+fiber = require('fiber')
+---
+...
+fiber.sleep(0.05)
+---
+...
+while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end
+---
+...
+box.info.replication[1].upstream.status
+---
+- follow
+...
+test_run:cmd("switch default")
+---
+- true
+...
+errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 5)
+---
+- ok
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+-- Check replica's disconnection on timeout (gh-3025).
+-- If master stops send heartbeat messages to replica,
+-- due to infinite read timeout connection never breaks,
+-- replica shows state 'follow' so old behaviour hangs
+-- here in infinite loop.
+while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end
+---
+...
+test_run:cmd("switch default")
+---
+- true
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd("cleanup server replica")
+---
+- true
+...
 box.snapshot()
 ---
 - ok
@@ -426,10 +485,6 @@ test_run:cmd("switch replica_ack")
 ---
 - true
 ...
-box.info.replication[1].upstream.status
----
-- follow
-...
 test_run:cmd("stop server default")
 ---
 - true
diff --git a/test/replication/errinj.test.lua b/test/replication/errinj.test.lua
index ba83481fe..22cb2076f 100644
--- a/test/replication/errinj.test.lua
+++ b/test/replication/errinj.test.lua
@@ -169,13 +169,38 @@ test_run:cmd("switch default")
 test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
 
+box.cfg{replication_timeout = 0.01}
+
+test_run:cmd("start server replica")
+test_run:cmd("switch replica")
+
+box.cfg{replication_timeout = 0.01}
+fiber = require('fiber')
+fiber.sleep(0.05)
+while box.info.replication[1].upstream.status ~= 'follow' do fiber.sleep(0.0001) end
+box.info.replication[1].upstream.status
+
+test_run:cmd("switch default")
+errinj.set("ERRINJ_RELAY_REPORT_INTERVAL", 5)
+
+test_run:cmd("switch replica")
+-- Check replica's disconnection on timeout (gh-3025).
+-- If master stops send heartbeat messages to replica,
+-- due to infinite read timeout connection never breaks,
+-- replica shows state 'follow' so old behaviour hangs
+-- here in infinite loop.
+while box.info.replication[1].upstream.message ~= 'timed out' do fiber.sleep(0.0001) end
+
+test_run:cmd("switch default")
+test_run:cmd("stop server replica")
+test_run:cmd("cleanup server replica")
+
 box.snapshot()
 for i = 0, 9999 do box.space.test:replace({i, 4, 5, 'test'}) end
 
 test_run:cmd("create server replica_ack with rpl_master=default, script='replication/replica_ack.lua'")
 test_run:cmd("start server replica_ack")
 test_run:cmd("switch replica_ack")
-box.info.replication[1].upstream.status
 
 test_run:cmd("stop server default")
 test_run:cmd("deploy server default")
-- 
2.14.3 (Apple Git-98)




More information about the Tarantool-patches mailing list