From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: from smtp48.i.mail.ru (smtp48.i.mail.ru [94.100.177.108])
 (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
 (No client certificate requested)
 by dev.tarantool.org (Postfix) with ESMTPS id 03944431780
 for ; Thu, 27 Aug 2020 14:27:16 +0300 (MSK)
References: <4c64406bcbe4f52629eba5a5c4dc1c9ea0115dea.1598454255.git.sergeyb@tarantool.org>
From: Sergey Petrenko
Message-ID: <347d6e10-deb8-d349-e1df-033b5075903f@tarantool.org>
Date: Thu, 27 Aug 2020 14:27:14 +0300
MIME-Version: 1.0
In-Reply-To: <4c64406bcbe4f52629eba5a5c4dc1c9ea0115dea.1598454255.git.sergeyb@tarantool.org>
Content-Type: text/plain; charset="utf-8"; format="flowed"
Content-Transfer-Encoding: 8bit
Content-Language: en-GB
Subject: Re: [Tarantool-patches] [PATCH v1] replication: change space sync mode in a loop
List-Id: Tarantool development patches
List-Unsubscribe: ,
List-Archive: 
List-Post: 
List-Help: 
List-Subscribe: ,
To: sergeyb@tarantool.org
Cc: tarantool-patches@dev.tarantool.org, Vladislav Shpilevoy

Hi! Thanks for the patch! Please see my comments below.

26.08.2020 18:10, sergeyb@tarantool.org wrote:
> From: Sergey Bronnikov
>
> New regression tests covers cases when one can change synchronous mode
> of space to asynchronous and vice versa.
>
> Closes #5055
> Part of #5144
> ---
>
> Branch: ligurio/gh-4842-qsync-change-mode
> CI: https://gitlab.com/tarantool/tarantool/-/pipelines/182271234
>
>  test/replication/qsync_sync_mode.result   | 164 ++++++++++++++++++++++
>  test/replication/qsync_sync_mode.test.lua |  90 ++++++++++++
>  2 files changed, 254 insertions(+)
>  create mode 100644 test/replication/qsync_sync_mode.result
>  create mode 100644 test/replication/qsync_sync_mode.test.lua
>
> diff --git a/test/replication/qsync_sync_mode.result b/test/replication/qsync_sync_mode.result
> new file mode 100644
> index 000000000..f2f95ec0f
> --- /dev/null
> +++ b/test/replication/qsync_sync_mode.result
> @@ -0,0 +1,164 @@
> +-- test-run result file version 2
> +env = require('test_run')
> + | ---
> + | ...
> +test_run = env.new()
> + | ---
> + | ...
> +engine = test_run:get_cfg('engine')
> + | ---
> + | ...
> +fiber = require('fiber')
> + | ---
> + | ...
> +math = require('math')
> + | ---
> + | ...
> +math.randomseed(os.time())
> + | ---
> + | ...
> +
> +orig_synchro_quorum = box.cfg.replication_synchro_quorum
> + | ---
> + | ...
> +orig_synchro_timeout = box.cfg.replication_synchro_timeout
> + | ---
> + | ...
> +
> +disable_sync_mode = function() \
> +    local s = box.space._space:get(box.space.sync.id) \
> +    local new_s = s:update({{'=', 6, {is_sync=false}}}) \
> +    box.space._space:replace(new_s) \
> +end
> + | ---
> + | ...
> +
> +enable_sync_mode = function() \
> +    local s = box.space._space:get(box.space.sync.id) \
> +    local new_s = s:update({{'=', 6, {is_sync=true}}}) \
> +    box.space._space:replace(new_s) \
> +end
> + | ---
> + | ...
> +
> +set_random_sync_mode = function() \
> +    if (math.random(1, 10) > 5) then \
> +        enable_sync_mode() \
> +    else \
> +        disable_sync_mode() \
> +    end \
> +end
> + | ---
> + | ...
> +
> +set_random_quorum = function(n) \
> +    box.cfg{replication_synchro_quorum=math.random(1, n)} \
> +end
> + | ---
> + | ...
> +
> +box.schema.user.grant('guest', 'replication')
> + | ---
> + | ...
> +
> +-- Setup an async cluster with two instances.
> +test_run:cmd('create server replica with rpl_master=default,\
> +              script="replication/replica.lua"')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('start server replica with wait=True, wait_load=True')
> + | ---
> + | - true
> + | ...
> +
> +-- Write data to a leader, enable and disable sync mode in background in a
> +-- loop. Expected no data loss.
> +-- Testcase setup.
> +_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
> + | ---
> + | ...
> +_ = box.space.sync:create_index('pk')
> + | ---
> + | ...
> +box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.001}
> + | ---
> + | ...
> +-- Testcase body.
> +for i = 1,10 do \
> +    set_random_sync_mode() \
> +    if pcall(box.space.sync.insert, box.space.sync, {i}) then \
> +        test_run:switch('replica') \
> +        test_run:wait_cond(function() return box.space.sync:get{i} ~= nil end) \
> +    end \
> +    test_run:switch('default') \
> +end
> + | ---
> + | ...
> +-- Testcase cleanup.
> +test_run:switch('default')
> + | ---
> + | - true
> + | ...
> +box.space.sync:drop()
> + | ---
> + | ...
> +
> +-- Write data to a leader, enable and disable sync mode and change quorum value
> +-- in background in a loop.
> +-- Testcase setup.
> +_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
> + | ---
> + | ...
> +_ = box.space.sync:create_index('pk')
> + | ---
> + | ...
> +box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.001}
> + | ---
> + | ...
> +-- Testcase body.
> +for i = 1,10 do \
> +    set_random_sync_mode() \
> +    set_random_quorum(5) \
> +    if pcall(box.space.sync.insert, box.space.sync, {i}) then \
> +        test_run:switch('replica') \
> +        test_run:wait_cond(function() return box.space.sync:get{i} ~= nil end) \
> +    end \
> +    test_run:switch('default') \
> +end
> + | ---
> + | ...
> +-- Testcase cleanup.
> +test_run:switch('default')
> + | ---
> + | - true
> + | ...
> +box.space.sync:drop()
> + | ---
> + | ...
> +
> +-- Teardown.
> +test_run:cmd('switch default')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('stop server replica')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('delete server replica')
> + | ---
> + | - true
> + | ...
> +test_run:cleanup_cluster()
> + | ---
> + | ...
> +box.schema.user.revoke('guest', 'replication')
> + | ---
> + | ...
> +box.cfg{ \
> +    replication_synchro_quorum = orig_synchro_quorum, \
> +    replication_synchro_timeout = orig_synchro_timeout, \
> +}
> + | ---
> + | ...
> diff --git a/test/replication/qsync_sync_mode.test.lua b/test/replication/qsync_sync_mode.test.lua
> new file mode 100644
> index 000000000..706261c10
> --- /dev/null
> +++ b/test/replication/qsync_sync_mode.test.lua
> @@ -0,0 +1,90 @@
> +env = require('test_run')
> +test_run = env.new()
> +engine = test_run:get_cfg('engine')
> +fiber = require('fiber')
> +math = require('math')
> +math.randomseed(os.time())
> +
> +orig_synchro_quorum = box.cfg.replication_synchro_quorum
> +orig_synchro_timeout = box.cfg.replication_synchro_timeout
> +
> +disable_sync_mode = function() \
> +    local s = box.space._space:get(box.space.sync.id) \
> +    local new_s = s:update({{'=', 6, {is_sync=false}}}) \
> +    box.space._space:replace(new_s) \
> +end
> +
> +enable_sync_mode = function() \
> +    local s = box.space._space:get(box.space.sync.id) \
> +    local new_s = s:update({{'=', 6, {is_sync=true}}}) \
> +    box.space._space:replace(new_s) \
> +end
> +

Vlad pushed a patch adding `space:alter` a couple of days ago, so now you
may simply say `space:alter{is_sync=true}` or `space:alter{is_sync=false}`.
It does the same work your helpers do, but looks much simpler.
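With it, both helpers collapse into the coin toss itself. Something like
this (just a sketch, assuming the space already exists when the helper is
called, as it does in your test):

set_random_sync_mode = function()
    -- Same 50/50 choice as before, but via the new alter().
    box.space.sync:alter{is_sync = math.random(1, 10) > 5}
end

(Plus the trailing backslashes test-run needs for multi-line statements in
the .test.lua file.)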
> +set_random_sync_mode = function() \
> +    if (math.random(1, 10) > 5) then \
> +        enable_sync_mode() \
> +    else \
> +        disable_sync_mode() \
> +    end \
> +end
> +
> +set_random_quorum = function(n) \
> +    box.cfg{replication_synchro_quorum=math.random(1, n)} \
> +end
> +
> +box.schema.user.grant('guest', 'replication')
> +
> +-- Setup an async cluster with two instances.
> +test_run:cmd('create server replica with rpl_master=default,\
> +              script="replication/replica.lua"')
> +test_run:cmd('start server replica with wait=True, wait_load=True')
> +
> +-- Write data to a leader, enable and disable sync mode in background in a
> +-- loop. Expected no data loss.

But sync mode is not set in the background; it's set in the same loop where
the insertions happen.

> +-- Testcase setup.
> +_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
> +_ = box.space.sync:create_index('pk')
> +box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.001}

Is such a tiny timeout intended? I see you wrap the insertions in a pcall,
but it may still happen that you hit the timeout 10 times out of 10, and
then the test checks nothing. We usually use 30-second replication
timeouts.

> +-- Testcase body.
> +for i = 1,10 do \
> +    set_random_sync_mode() \
> +    if pcall(box.space.sync.insert, box.space.sync, {i}) then \
> +        test_run:switch('replica') \
> +        test_run:wait_cond(function() return box.space.sync:get{i} ~= nil end) \
> +    end \
> +    test_run:switch('default') \
> +end
> +-- Testcase cleanup.
> +test_run:switch('default')
> +box.space.sync:drop()
> +
> +-- Write data to a leader, enable and disable sync mode and change quorum value
> +-- in background in a loop.
> +-- Testcase setup.
> +_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
> +_ = box.space.sync:create_index('pk')
> +box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.001}

Ok, I see why you need a small timeout in this test case. Maybe set a small
timeout only when the insertion is expected to fail (quorum = 3 or 4), and
use a 30-second timeout otherwise? There is a sketch of what I mean at the
end of this letter.

> +-- Testcase body.
> +for i = 1,10 do \
> +    set_random_sync_mode() \
> +    set_random_quorum(5) \

math.random() range is inclusive, so it'd be fairer to choose the quorum
from 1 to 4. Then there'd be 2 cases of a successful write (quorum 1 and 2)
and 2 cases of a failed write (quorum 3 and 4).

> +    if pcall(box.space.sync.insert, box.space.sync, {i}) then \
> +        test_run:switch('replica') \
> +        test_run:wait_cond(function() return box.space.sync:get{i} ~= nil end) \
> +    end \
> +    test_run:switch('default') \
> +end
> +-- Testcase cleanup.
> +test_run:switch('default')
> +box.space.sync:drop()
> +
> +-- Teardown.
> +test_run:cmd('switch default')
> +test_run:cmd('stop server replica')
> +test_run:cmd('delete server replica')
> +test_run:cleanup_cluster()
> +box.schema.user.revoke('guest', 'replication')
> +box.cfg{ \
> +    replication_synchro_quorum = orig_synchro_quorum, \
> +    replication_synchro_timeout = orig_synchro_timeout, \
> +}
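To make the two suggestions above concrete, here's a rough, untested sketch
of how set_random_quorum() might look (30 and 0.001 seconds are just the
values mentioned above, and the `n` argument goes away since the range
becomes fixed):

set_random_quorum = function()
    -- math.random() bounds are inclusive: 1..4 gives two quorums reachable
    -- with two instances (1 and 2) and two unreachable ones (3 and 4).
    local quorum = math.random(1, 4)
    -- Wait long when the write is expected to succeed, and time out quickly
    -- when it is expected to hang on the unreachable quorum.
    local timeout = quorum <= 2 and 30 or 0.001
    box.cfg{replication_synchro_quorum = quorum,
            replication_synchro_timeout = timeout}
end

This way a successful write is verified against a realistic timeout, and
the expected failures don't stall the test. In the .test.lua file you'd
also need the trailing backslashes test-run wants for multi-line statements.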