From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Konstantin Belyavskiy Subject: [PATCH v2 1/2] replication: stay in orphan mode until replica is synced by vclock Date: Fri, 30 Mar 2018 17:03:14 +0300 Message-Id: In-Reply-To: References: In-Reply-To: References: To: tarantool-patches@freelists.org, vdavydov@tarantool.org List-ID: Stay in orphan (read-only) mode while the local vclock is lower than the master's, to make sure that datasets are the same across the replicaset. Also revert and slightly update the catch test. Needed for 3210 --- src/box/applier.cc | 16 +++++++++++----- test/replication/catch.result | 15 ++++++++++----- test/replication/catch.test.lua | 7 ++++--- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/box/applier.cc b/src/box/applier.cc index 6bfe5a99a..12bf1f0d2 100644 --- a/src/box/applier.cc +++ b/src/box/applier.cc @@ -305,7 +305,7 @@ applier_join(struct applier *applier) * server is 1.6. Since we have * not initialized replication * vclock yet, do it now. In 1.7+ - * this vlcock is not used. + * this vclock is not used. */ xrow_decode_vclock_xc(&row, &replicaset.vclock); } @@ -370,6 +370,7 @@ applier_subscribe(struct applier *applier) struct ev_io *coio = &applier->io; struct ibuf *ibuf = &applier->ibuf; struct xrow_header row; + struct vclock remote_vclock_at_subscribe; xrow_encode_subscribe_xc(&row, &REPLICASET_UUID, &INSTANCE_UUID, &replicaset.vclock); @@ -411,9 +412,8 @@ applier_subscribe(struct applier *applier) * In case of successful subscribe, the server * responds with its current vclock. */ - struct vclock vclock; - vclock_create(&vclock); - xrow_decode_vclock_xc(&row, &vclock); + vclock_create(&remote_vclock_at_subscribe); + xrow_decode_vclock_xc(&row, &remote_vclock_at_subscribe); } /** * Tarantool < 1.6.7: @@ -452,8 +452,14 @@ applier_subscribe(struct applier *applier) applier_set_state(applier, APPLIER_FOLLOW); } + /* + * Must stay in read-only mode, until it synchronized. 
+ * Check lag and compare local vclock with remote one. + */ if (applier->state == APPLIER_SYNC && - applier->lag <= replication_sync_lag) { + applier->lag <= replication_sync_lag && + vclock_compare(&remote_vclock_at_subscribe, + &replicaset.vclock) <= 0) { /* Applier is synced, switch to "follow". */ applier_set_state(applier, APPLIER_FOLLOW); } diff --git a/test/replication/catch.result b/test/replication/catch.result index 7d61ad26f..681cd77ac 100644 --- a/test/replication/catch.result +++ b/test/replication/catch.result @@ -19,11 +19,11 @@ errinj = box.error.injection box.schema.user.grant('guest', 'replication') --- ... -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +test_run:cmd("create server replica with rpl_master=default, script='replication/replica_timeout.lua'") --- - true ... -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='0.1'") --- - true ... @@ -69,7 +69,7 @@ errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0) --- - ok ... -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='0.1'") --- - true ... @@ -99,10 +99,11 @@ box.space.test ~= nil ... d = box.space.test:delete{1} --- +- error: Can't modify data because this instance is in read-only mode. ... box.space.test:get(1) == nil --- -- true +- false ... -- case #2: delete tuple by net.box test_run:cmd("switch default") @@ -116,9 +117,13 @@ test_run:cmd("set variable r_uri to 'replica.listen'") c = net_box.connect(r_uri) --- ... +d = c.space.test:delete{1} +--- +- error: Can't modify data because this instance is in read-only mode. +... c.space.test:get(1) == nil --- -- true +- false ... 
-- check sync errinj.set("ERRINJ_RELAY_TIMEOUT", 0) diff --git a/test/replication/catch.test.lua b/test/replication/catch.test.lua index cb865aa3c..cbfa1c19a 100644 --- a/test/replication/catch.test.lua +++ b/test/replication/catch.test.lua @@ -8,8 +8,8 @@ net_box = require('net.box') errinj = box.error.injection box.schema.user.grant('guest', 'replication') -test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") -test_run:cmd("start server replica") +test_run:cmd("create server replica with rpl_master=default, script='replication/replica_timeout.lua'") +test_run:cmd("start server replica with args='0.1'") test_run:cmd("switch replica") test_run:cmd("switch default") @@ -29,7 +29,7 @@ for i=1,100 do s:insert{i, 'this is test message12345'} end -- sleep after every tuple errinj.set("ERRINJ_RELAY_TIMEOUT", 1000.0) -test_run:cmd("start server replica") +test_run:cmd("start server replica with args='0.1'") test_run:cmd("switch replica") fiber = require('fiber') @@ -53,6 +53,7 @@ box.space.test:get(1) == nil test_run:cmd("switch default") test_run:cmd("set variable r_uri to 'replica.listen'") c = net_box.connect(r_uri) +d = c.space.test:delete{1} c.space.test:get(1) == nil -- check sync -- 2.14.3 (Apple Git-98)