[PATCH v2 1/2] replication: stay in orphan mode until replica is synced by vclock

Konstantin Belyavskiy k.belyavskiy at tarantool.org
Fri Mar 30 17:03:14 MSK 2018


Stay in orphan (read-only) mode while the local vclock is lower than
the master's, to make sure that datasets are the same across the replicaset.
Also revert and slightly update catch test.

Needed for 3210
---
 src/box/applier.cc              | 16 +++++++++++-----
 test/replication/catch.result   | 15 ++++++++++-----
 test/replication/catch.test.lua |  7 ++++---
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index 6bfe5a99a..12bf1f0d2 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -305,7 +305,7 @@ applier_join(struct applier *applier)
 				 * server is 1.6. Since we have
 				 * not initialized replication
 				 * vclock yet, do it now. In 1.7+
-				 * this vlcock is not used.
+				 * this vclock is not used.
 				 */
 				xrow_decode_vclock_xc(&row, &replicaset.vclock);
 			}
@@ -370,6 +370,7 @@ applier_subscribe(struct applier *applier)
 	struct ev_io *coio = &applier->io;
 	struct ibuf *ibuf = &applier->ibuf;
 	struct xrow_header row;
+	struct vclock remote_vclock_at_subscribe;
 
 	xrow_encode_subscribe_xc(&row, &REPLICASET_UUID, &INSTANCE_UUID,
 				 &replicaset.vclock);
@@ -411,9 +412,8 @@ applier_subscribe(struct applier *applier)
 		 * In case of successful subscribe, the server
 		 * responds with its current vclock.
 		 */
-		struct vclock vclock;
-		vclock_create(&vclock);
-		xrow_decode_vclock_xc(&row, &vclock);
+		vclock_create(&remote_vclock_at_subscribe);
+		xrow_decode_vclock_xc(&row, &remote_vclock_at_subscribe);
 	}
 	/**
 	 * Tarantool < 1.6.7:
@@ -452,8 +452,14 @@ applier_subscribe(struct applier *applier)
 			applier_set_state(applier, APPLIER_FOLLOW);
 		}
 
+		/*
+		 * Must stay in read-only mode until the replica is synced.
+		 * Check lag and compare local vclock with remote one.
+		 */
 		if (applier->state == APPLIER_SYNC &&
-		    applier->lag <= replication_sync_lag) {
+		    applier->lag <= replication_sync_lag &&
+		    vclock_compare(&remote_vclock_at_subscribe,
+				   &replicaset.vclock) <= 0) {
 			/* Applier is synced, switch to "follow". */
 			applier_set_state(applier, APPLIER_FOLLOW);
 		}
diff --git a/test/replication/catch.result b/test/replication/catch.result
index 7d61ad26f..681cd77ac 100644
--- a/test/replication/catch.result
+++ b/test/replication/catch.result
@@ -19,11 +19,11 @@ errinj = box.error.injection
 box.schema.user.grant('guest', 'replication')
 ---
 ...
-test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica_timeout.lua'")
 ---
 - true
 ...
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='0.1'")
 ---
 - true
 ...
@@ -69,7 +69,7 @@ errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0)
 ---
 - ok
 ...
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='0.1'")
 ---
 - true
 ...
@@ -99,10 +99,11 @@ box.space.test ~= nil
 ...
 d = box.space.test:delete{1}
 ---
+- error: Can't modify data because this instance is in read-only mode.
 ...
 box.space.test:get(1) == nil
 ---
-- true
+- false
 ...
 -- case #2: delete tuple by net.box
 test_run:cmd("switch default")
@@ -116,9 +117,13 @@ test_run:cmd("set variable r_uri to 'replica.listen'")
 c = net_box.connect(r_uri)
 ---
 ...
+d = c.space.test:delete{1}
+---
+- error: Can't modify data because this instance is in read-only mode.
+...
 c.space.test:get(1) == nil
 ---
-- true
+- false
 ...
 -- check sync
 errinj.set("ERRINJ_RELAY_TIMEOUT", 0)
diff --git a/test/replication/catch.test.lua b/test/replication/catch.test.lua
index cb865aa3c..cbfa1c19a 100644
--- a/test/replication/catch.test.lua
+++ b/test/replication/catch.test.lua
@@ -8,8 +8,8 @@ net_box = require('net.box')
 errinj = box.error.injection
 
 box.schema.user.grant('guest', 'replication')
-test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
-test_run:cmd("start server replica")
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica_timeout.lua'")
+test_run:cmd("start server replica with args='0.1'")
 test_run:cmd("switch replica")
 
 test_run:cmd("switch default")
@@ -29,7 +29,7 @@ for i=1,100 do s:insert{i, 'this is test message12345'} end
 -- sleep after every tuple
 errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0)
 
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='0.1'")
 test_run:cmd("switch replica")
 
 fiber = require('fiber')
@@ -53,6 +53,7 @@ box.space.test:get(1) == nil
 test_run:cmd("switch default")
 test_run:cmd("set variable r_uri to 'replica.listen'")
 c = net_box.connect(r_uri)
+d = c.space.test:delete{1}
 c.space.test:get(1) == nil
 
 -- check sync
-- 
2.14.3 (Apple Git-98)




More information about the Tarantool-patches mailing list