From: Sergey Bronnikov <sergeyb@tarantool.org> To: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>, tarantool-patches@dev.tarantool.org Subject: Re: [Tarantool-patches] [PATCH 3/4] replication: add test with random leaders promotion and demotion Date: Tue, 24 Nov 2020 11:39:58 +0300 [thread overview] Message-ID: <7164d167-4386-4ea7-ae33-e19ca1f8b326@tarantool.org> (raw) In-Reply-To: <3496e5a3-fc28-880e-d71f-d9746cc56412@tarantool.org> Thanks for review! On 21.11.2020 17:41, Vladislav Shpilevoy wrote: > Thanks for the patch! > > See 8 comments below. > > On 17.11.2020 17:13, sergeyb@tarantool.org wrote: >> From: Sergey Bronnikov <sergeyb@tarantool.org> >> >> Part of #5055 >> Part of #5144 >> --- >> test/replication/qsync.lua | 31 ++++ >> test/replication/qsync1.lua | 1 + >> test/replication/qsync2.lua | 1 + >> test/replication/qsync3.lua | 1 + >> test/replication/qsync4.lua | 1 + >> test/replication/qsync5.lua | 1 + >> test/replication/qsync_random_leader.result | 148 ++++++++++++++++++ >> test/replication/qsync_random_leader.test.lua | 76 +++++++++ >> 8 files changed, 260 insertions(+) >> create mode 100644 test/replication/qsync.lua >> create mode 120000 test/replication/qsync1.lua >> create mode 120000 test/replication/qsync2.lua >> create mode 120000 test/replication/qsync3.lua >> create mode 120000 test/replication/qsync4.lua >> create mode 120000 test/replication/qsync5.lua >> create mode 100644 test/replication/qsync_random_leader.result >> create mode 100644 test/replication/qsync_random_leader.test.lua >> >> diff --git a/test/replication/qsync.lua b/test/replication/qsync.lua >> new file mode 100644 >> index 000000000..9bbc87239 >> --- /dev/null >> +++ b/test/replication/qsync.lua >> @@ -0,0 +1,31 @@ >> +#!/usr/bin/env tarantool >> + >> +-- get instance name from filename (qsync1.lua => qsync1) >> +local INSTANCE_ID = string.match(arg[0], "%d") >> + >> +local SOCKET_DIR = require('fio').cwd() >> + >> +local function instance_uri(instance_id) >> + return SOCKET_DIR..'/qsync'..instance_id..'.sock'; > 1. ';' is not needed here. local function instance_uri(instance_id) - return SOCKET_DIR..'/qsync'..instance_id..'.sock'; + return SOCKET_DIR..'/qsync'..instance_id..'.sock' end >> +end >> + >> +-- start console first > 2. Why? The comment narrates the code, but does not say why > it is done so. Updated a comment. Console is required to keep instance running. >> +require('console').listen(os.getenv('ADMIN')) >> + >> +box.cfg({ >> + listen = instance_uri(INSTANCE_ID); > 3. Please, don't use ';' as a separator. We use ',' in the > new code. box.cfg({ - listen = instance_uri(INSTANCE_ID); + listen = instance_uri(INSTANCE_ID), replication = { - instance_uri(1); - instance_uri(2); - instance_uri(3); - instance_uri(4); - instance_uri(5); - }; - replication_synchro_timeout = 1000; - replication_synchro_quorum = 5; - read_only = false; + instance_uri(1), + instance_uri(2), + instance_uri(3), + instance_uri(4), + instance_uri(5), + }, + replication_synchro_timeout = 1000, + replication_synchro_quorum = 5, + read_only = false, }) >> + replication = { >> + instance_uri(1); >> + instance_uri(2); >> + instance_uri(3); >> + instance_uri(4); >> + instance_uri(5); >> + }; >> + replication_synchro_timeout = 1000; >> + replication_synchro_quorum = 5; >> + read_only = false; >> +}) >> + >> +box.once("bootstrap", function() >> + box.schema.user.grant("guest", 'replication') >> +end) >> diff --git a/test/replication/qsync_random_leader.result b/test/replication/qsync_random_leader.result >> new file mode 100644 >> index 000000000..2b2df99db >> --- /dev/null >> +++ b/test/replication/qsync_random_leader.result >> @@ -0,0 +1,148 @@ >> +-- test-run result file version 2 >> +os = require('os') >> + | --- >> + | ... >> +env = require('test_run') >> + | --- >> + | ... >> +math = require('math') >> + | --- >> + | ... >> +fiber = require('fiber') >> + | --- >> + | ... >> +test_run = env.new() >> + | --- >> + | ... >> +netbox = require('net.box') >> + | --- >> + | ... >> + >> +orig_synchro_quorum = box.cfg.replication_synchro_quorum >> + | --- >> + | ... >> +orig_synchro_timeout = box.cfg.replication_synchro_timeout >> + | --- >> + | ... >> + >> +NUM_INSTANCES = 5 >> + | --- >> + | ... >> +SERVERS = {} >> + | --- >> + | ... >> +for i=1,NUM_INSTANCES do \ > 4. Please, use whitespaces. Here are some: ' ', > take them. Vim doesn't show any tabs before '\'. Anyway I have updated indentation before '\' using whitespaces. >> + SERVERS[i] = 'qsync' .. i \ >> +end; > 5. ';' is not needed. Removed. >> + | --- >> + | ... >> +SERVERS -- print instance names >> + | --- >> + | - - qsync1 >> + | - qsync2 >> + | - qsync3 >> + | - qsync4 >> + | - qsync5 >> + | ... >> + >> +math.randomseed(os.time()) >> + | --- >> + | ... >> +random = function(excluded_num, total) \ > 6. Why not 'function random(...)'? Why do you need to assign > it to a variable via '='? Because for my taste this form is more convenient for reading. Converted to "function random(...)". > >> + local r = math.random(1, total) \ >> + if (r == excluded_num) then \ >> + return random(excluded_num, total) \ >> + end \ >> + return r \ >> +end >> + | --- >> + | ... >> + >> +-- Set 'broken' quorum on current leader. >> +-- Write value on current leader. >> +-- Pick a random replica in a cluster. >> +-- Set 'good' quorum on it and promote to a leader. >> +-- Make sure value is there and on an old leader. >> + >> +-- Testcase setup. >> +test_run:create_cluster(SERVERS) >> + | --- >> + | ... >> +test_run:wait_fullmesh(SERVERS) >> + | --- >> + | ... >> +test_run:switch('qsync1') >> + | --- >> + | - true >> + | ... >> +_ = box.schema.space.create('sync', {is_sync=true, engine = test_run:get_cfg('engine')}) >> + | --- >> + | ... >> +_ = box.space.sync:create_index('primary') >> + | --- >> + | ... >> +box.schema.user.grant('guest', 'write', 'space', 'sync') >> + | --- >> + | ... >> +test_run:switch('default') >> + | --- >> + | - true >> + | ... >> +current_leader_id = 1 >> + | --- >> + | ... >> +test_run:eval(SERVERS[current_leader_id], "box.ctl.clear_synchro_queue()") >> + | --- >> + | - [] >> + | ... >> + >> +SOCKET_DIR = require('fio').cwd() >> + | --- >> + | ... >> + >> +-- Testcase body. >> +for i=1,30 do \ >> + test_run:eval(SERVERS[current_leader_id], \ >> + "box.cfg{replication_synchro_quorum=6, replication_synchro_timeout=1000}") \ >> + c = netbox.connect(SOCKET_DIR..'/'..SERVERS[current_leader_id]..'.sock') \ >> + fiber.create(function() c.space.sync:insert{i} end) \ >> + new_leader_id = random(current_leader_id, #SERVERS) \ >> + test_run:eval(SERVERS[new_leader_id], \ >> + "box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.01}") \ >> + test_run:eval(SERVERS[new_leader_id], "box.ctl.clear_synchro_queue()") \ >> + c:close() \ >> + replica = random(new_leader_id, #SERVERS) \ >> + test_run:wait_cond(function() return test_run:eval(SERVERS[replica], \ >> + string.format("box.space.sync:get{%d}", i))[1] ~= nil end) \ >> + test_run:wait_cond(function() return test_run:eval(SERVERS[current_leader_id], \ >> + string.format("box.space.sync:get{%d}", i))[1] ~= nil end) \ >> + new_leader_id = random(current_leader_id, #SERVERS) \ >> + current_leader_id = new_leader_id \ >> +end >> + | --- >> + | ... >> + >> +test_run:wait_cond(function() return test_run:eval('qsync1', \ >> + ("box.space.sync:count()")) == 30 end) >> + | --- >> + | - false > 7. I am not sure it is supposed to be false. If it should be - > I don't understand why. Could you elaborate please? No, it is obviously should be True here. I missed reference value for that statement in .result file. wait_cond() here was broken because it returns a list, not a number. Fixed it. test_run:wait_cond(function() return test_run:eval('qsync1', \ - ("box.space.sync:count()")) == 30 end) + ("box.space.sync:count()"))[1] == 30 end) | --- - | - false + | - true | ... > Also in the previous commit in the end of a test doing exactly > the same, but for leader-replica config, you wrote: > > Note: cluster may be in a broken state here due to nature of previous test. You got it wrong. The sense of my note is following: testcases should be isolated from each one and if testcase is failed we consider environment where test has been started as 'dirty' and should not continue to run other tests in that environment. Testcases with 'broken' quorum uses negative scenarios and I placed this testcase at the end of file to minimize impact to other tests in case of failure. Note was placed for others just to warn about negative testcase. > > But here you do it 30 times, and it is not broken (I hope). > So that statement wasn't true? > > Besides, I get > > No output during 10 seconds. Will abort after 120 seconds without output. List of workers not reporting the status: > - 001_replication [replication/qsync_random_leader.test.lua, vinyl] at var/001_replication/qsync_random_leader.result:126 > - 002_replication [replication/qsync_random_leader.test.lua, memtx] at var/002_replication/qsync_random_leader.result:126 > No output during 20 seconds. Will abort after 120 seconds without output. List of workers not reporting the status: > - 001_replication [replication/qsync_random_leader.test.lua, vinyl] at var/001_replication/qsync_random_leader.result:126 > - 002_replication [replication/qsync_random_leader.test.lua, memtx] at var/002_replication/qsync_random_leader.result:126 > > on this wait_cond(). Does not look like it works. The problem was in a value that wait_cond() returns. > > Also you don't check the space is full on other instances. Only on the > current leader, which is not so interesting. -test_run:wait_cond(function() return test_run:eval('qsync1', \ - ("box.space.sync:count()")) == 30 end) +for i=1,NUM_INSTANCES do \ + instance = string.format('qsync%d', i) \ + test_run:wait_cond(function() return test_run:eval(instance, \ + "box.space.sync:count()")[1] == 30 end) \ +end >> + | ... >> + >> +-- Teardown. >> +test_run:switch('default') >> + | --- >> + | - true >> + | ... >> +test_run:eval(SERVERS[current_leader_id], 'box.space.sync:drop()') >> + | --- >> + | - [] >> + | ... >> +test_run:drop_cluster(SERVERS) >> + | --- >> + | ... >> +box.cfg{ \ >> + replication_synchro_quorum = orig_synchro_quorum, \ >> + replication_synchro_timeout = orig_synchro_timeout, \ > 8. You didn't change these values in the 'default' instance. So after > what do you restore them? Removed these lines as well as lines before test to save settings. --- a/test/replication/qsync_random_leader.test.lua +++ b/test/replication/qsync_random_leader.test.lua @@ -5,18 +5,15 @@ fiber = require('fiber') test_run = env.new() netbox = require('net.box') -orig_synchro_quorum = box.cfg.replication_synchro_quorum -orig_synchro_timeout = box.cfg.replication_synchro_timeout - NUM_INSTANCES = 5 SERVERS = {} for i=1,NUM_INSTANCES do \ SERVERS[i] = 'qsync' .. i -- Teardown. test_run:switch('default') test_run:eval(SERVERS[current_leader_id], 'box.space.sync:drop()') test_run:drop_cluster(SERVERS) -box.cfg{ \ - replication_synchro_quorum = orig_synchro_quorum, \ - replication_synchro_timeout = orig_synchro_timeout, \ -}
next prev parent reply other threads:[~2020-11-24 8:40 UTC|newest] Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top 2020-11-17 16:13 [Tarantool-patches] [PATCH 0/4 v3] Additional qsync tests sergeyb 2020-11-17 16:13 ` [Tarantool-patches] [PATCH 1/4] replication: run clear_synchro_queue() with unconfigured box sergeyb 2020-11-21 14:41 ` Vladislav Shpilevoy 2020-11-23 14:44 ` Sergey Bronnikov 2020-11-17 16:13 ` [Tarantool-patches] [PATCH 2/4] replication: test clear_synchro_queue() function sergeyb 2020-11-21 14:41 ` Vladislav Shpilevoy 2020-11-23 15:13 ` Sergey Bronnikov 2020-11-17 16:13 ` [Tarantool-patches] [PATCH 3/4] replication: add test with random leaders promotion and demotion sergeyb 2020-11-21 14:41 ` Vladislav Shpilevoy 2020-11-24 8:39 ` Sergey Bronnikov [this message] 2020-11-25 22:02 ` Vladislav Shpilevoy 2020-11-17 16:13 ` [Tarantool-patches] [PATCH 4/4] replication: add test with change space sync mode in a loop sergeyb 2020-11-21 14:41 ` Vladislav Shpilevoy 2020-11-23 15:42 ` Sergey Bronnikov 2020-11-25 22:02 ` Vladislav Shpilevoy
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=7164d167-4386-4ea7-ae33-e19ca1f8b326@tarantool.org \ --to=sergeyb@tarantool.org \ --cc=tarantool-patches@dev.tarantool.org \ --cc=v.shpilevoy@tarantool.org \ --subject='Re: [Tarantool-patches] [PATCH 3/4] replication: add test with random leaders promotion and demotion' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox