From: Sergey Bronnikov <sergeyb@tarantool.org>
To: Vladislav Shpilevoy, tarantool-patches@dev.tarantool.org
Date: Tue, 24 Nov 2020 11:39:58 +0300
Subject: Re: [Tarantool-patches] [PATCH 3/4] replication: add test with random leaders promotion and demotion

Thanks for the review!

On 21.11.2020 17:41, Vladislav Shpilevoy wrote:
> Thanks for the patch!
>
> See 8 comments below.
>
> On 17.11.2020 17:13, sergeyb@tarantool.org wrote:
>> From: Sergey Bronnikov <sergeyb@tarantool.org>
>>
>> Part of #5055
>> Part of #5144
>> ---
>>  test/replication/qsync.lua                    |  31 ++++
>>  test/replication/qsync1.lua                   |   1 +
>>  test/replication/qsync2.lua                   |   1 +
>>  test/replication/qsync3.lua                   |   1 +
>>  test/replication/qsync4.lua                   |   1 +
>>  test/replication/qsync5.lua                   |   1 +
>>  test/replication/qsync_random_leader.result   | 148 ++++++++++++++++++
>>  test/replication/qsync_random_leader.test.lua |  76 +++++++++
>>  8 files changed, 260 insertions(+)
>>  create mode 100644 test/replication/qsync.lua
>>  create mode 120000 test/replication/qsync1.lua
>>  create mode 120000 test/replication/qsync2.lua
>>  create mode 120000 test/replication/qsync3.lua
>>  create mode 120000 test/replication/qsync4.lua
>>  create mode 120000 test/replication/qsync5.lua
>>  create mode 100644 test/replication/qsync_random_leader.result
>>  create mode 100644 test/replication/qsync_random_leader.test.lua
>>
>> diff --git a/test/replication/qsync.lua b/test/replication/qsync.lua
>> new file mode 100644
>> index 000000000..9bbc87239
>> --- /dev/null
>> +++ b/test/replication/qsync.lua
>> @@ -0,0 +1,31 @@
>> +#!/usr/bin/env tarantool
>> +
>> +-- get instance name from filename (qsync1.lua => qsync1)
>> +local INSTANCE_ID = string.match(arg[0], "%d")
>> +
>> +local SOCKET_DIR = require('fio').cwd()
>> +
>> +local function instance_uri(instance_id)
>> +    return SOCKET_DIR..'/qsync'..instance_id..'.sock';
> 1. ';' is not needed here.

Removed:

 local function instance_uri(instance_id)
-    return SOCKET_DIR..'/qsync'..instance_id..'.sock';
+    return SOCKET_DIR..'/qsync'..instance_id..'.sock'
 end

>> +end
>> +
>> +-- start console first
> 2. Why? The comment narrates the code, but does not say why
> it is done so.

Updated the comment: the console is required to keep the instance running.

>> +require('console').listen(os.getenv('ADMIN'))
>> +
>> +box.cfg({
>> +    listen = instance_uri(INSTANCE_ID);
> 3. Please, don't use ';' as a separator. We use ',' in the
> new code.
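Side note: Lua itself accepts both ',' and ';' as field separators in a table
constructor, so this is purely a style convention. A tiny illustration (the
variable names are mine, not from the patch):

    t1 = {1; 2; 3}  -- valid Lua, but the older style used in this file
    t2 = {1, 2, 3}  -- the separator preferred in new code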
Fixed:

 box.cfg({
-    listen = instance_uri(INSTANCE_ID);
+    listen = instance_uri(INSTANCE_ID),
     replication = {
-        instance_uri(1);
-        instance_uri(2);
-        instance_uri(3);
-        instance_uri(4);
-        instance_uri(5);
-    };
-    replication_synchro_timeout = 1000;
-    replication_synchro_quorum = 5;
-    read_only = false;
+        instance_uri(1),
+        instance_uri(2),
+        instance_uri(3),
+        instance_uri(4),
+        instance_uri(5),
+    },
+    replication_synchro_timeout = 1000,
+    replication_synchro_quorum = 5,
+    read_only = false,
 })

>> +    replication = {
>> +        instance_uri(1);
>> +        instance_uri(2);
>> +        instance_uri(3);
>> +        instance_uri(4);
>> +        instance_uri(5);
>> +    };
>> +    replication_synchro_timeout = 1000;
>> +    replication_synchro_quorum = 5;
>> +    read_only = false;
>> +})
>> +
>> +box.once("bootstrap", function()
>> +    box.schema.user.grant("guest", 'replication')
>> +end)
>> diff --git a/test/replication/qsync_random_leader.result b/test/replication/qsync_random_leader.result
>> new file mode 100644
>> index 000000000..2b2df99db
>> --- /dev/null
>> +++ b/test/replication/qsync_random_leader.result
>> @@ -0,0 +1,148 @@
>> +-- test-run result file version 2
>> +os = require('os')
>> + | ---
>> + | ...
>> +env = require('test_run')
>> + | ---
>> + | ...
>> +math = require('math')
>> + | ---
>> + | ...
>> +fiber = require('fiber')
>> + | ---
>> + | ...
>> +test_run = env.new()
>> + | ---
>> + | ...
>> +netbox = require('net.box')
>> + | ---
>> + | ...
>> +
>> +orig_synchro_quorum = box.cfg.replication_synchro_quorum
>> + | ---
>> + | ...
>> +orig_synchro_timeout = box.cfg.replication_synchro_timeout
>> + | ---
>> + | ...
>> +
>> +NUM_INSTANCES = 5
>> + | ---
>> + | ...
>> +SERVERS = {}
>> + | ---
>> + | ...
>> +for i=1,NUM_INSTANCES do                                                       \
> 4. Please, use whitespaces. Here are some: '    ',
> take them.

Vim doesn't show any tabs before '\'. Anyway, I have updated the indentation
before '\' to use whitespaces.

>> +    SERVERS[i] = 'qsync' .. i                                                  \
>> +end;
> 5. ';' is not needed.

Removed.

>> + | ---
>> + | ...
>> +SERVERS -- print instance names
>> + | ---
>> + | - - qsync1
>> + |   - qsync2
>> + |   - qsync3
>> + |   - qsync4
>> + |   - qsync5
>> + | ...
>> +
>> +math.randomseed(os.time())
>> + | ---
>> + | ...
>> +random = function(excluded_num, total)                                         \
> 6. Why not 'function random(...)'? Why do you need to assign
> it to a variable via '='?

Because, to my taste, this form is more convenient to read. Converted it to
"function random(...)" anyway.

>> +    local r = math.random(1, total)                                            \
>> +    if (r == excluded_num) then                                                \
>> +        return random(excluded_num, total)                                     \
>> +    end                                                                        \
>> +    return r                                                                   \
>> +end
>> + | ---
>> + | ...
>> +
>> +-- Set 'broken' quorum on current leader.
>> +-- Write value on current leader.
>> +-- Pick a random replica in a cluster.
>> +-- Set 'good' quorum on it and promote to a leader.
>> +-- Make sure value is there and on an old leader.
>> +
>> +-- Testcase setup.
>> +test_run:create_cluster(SERVERS)
>> + | ---
>> + | ...
>> +test_run:wait_fullmesh(SERVERS)
>> + | ---
>> + | ...
>> +test_run:switch('qsync1')
>> + | ---
>> + | - true
>> + | ...
>> +_ = box.schema.space.create('sync', {is_sync=true, engine = test_run:get_cfg('engine')})
>> + | ---
>> + | ...
>> +_ = box.space.sync:create_index('primary')
>> + | ---
>> + | ...
>> +box.schema.user.grant('guest', 'write', 'space', 'sync')
>> + | ---
>> + | ...
>> +test_run:switch('default')
>> + | ---
>> + | - true
>> + | ...
>> +current_leader_id = 1
>> + | ---
>> + | ...
>> +test_run:eval(SERVERS[current_leader_id], "box.ctl.clear_synchro_queue()")
>> + | ---
>> + | - []
>> + | ...
>> +
>> +SOCKET_DIR = require('fio').cwd()
>> + | ---
>> + | ...
>> +
>> +-- Testcase body.
>> +for i=1,30 do                                                                  \
>> +    test_run:eval(SERVERS[current_leader_id],                                  \
>> +        "box.cfg{replication_synchro_quorum=6, replication_synchro_timeout=1000}") \
>> +    c = netbox.connect(SOCKET_DIR..'/'..SERVERS[current_leader_id]..'.sock')   \
>> +    fiber.create(function() c.space.sync:insert{i} end)                        \
>> +    new_leader_id = random(current_leader_id, #SERVERS)                        \
>> +    test_run:eval(SERVERS[new_leader_id],                                      \
>> +        "box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.01}") \
>> +    test_run:eval(SERVERS[new_leader_id], "box.ctl.clear_synchro_queue()")     \
>> +    c:close()                                                                  \
>> +    replica = random(new_leader_id, #SERVERS)                                  \
>> +    test_run:wait_cond(function() return test_run:eval(SERVERS[replica],       \
>> +        string.format("box.space.sync:get{%d}", i))[1] ~= nil end)             \
>> +    test_run:wait_cond(function() return test_run:eval(SERVERS[current_leader_id], \
>> +        string.format("box.space.sync:get{%d}", i))[1] ~= nil end)             \
>> +    new_leader_id = random(current_leader_id, #SERVERS)                        \
>> +    current_leader_id = new_leader_id                                          \
>> +end
>> + | ---
>> + | ...
>> +
>> +test_run:wait_cond(function() return test_run:eval('qsync1',                   \
>> +                   ("box.space.sync:count()")) == 30 end)
>> + | ---
>> + | - false
> 7. I am not sure it is supposed to be false. If it should be -
> I don't understand why. Could you elaborate please?

No, it obviously should be true here; I missed the reference value for that
statement in the .result file. wait_cond() was broken here because
test_run:eval() returns a list, not a number. Fixed it:

 test_run:wait_cond(function() return test_run:eval('qsync1',                   \
-                   ("box.space.sync:count()")) == 30 end)
+                   ("box.space.sync:count()"))[1] == 30 end)
  | ---
- | - false
+ | - true
  | ...

> Also in the previous commit in the end of a test doing exactly
> the same, but for leader-replica config, you wrote:
>
> Note: cluster may be in a broken state here due to nature of previous test.

You got it wrong. The sense of my note is the following: testcases should be
isolated from each other, and if a testcase fails, we consider the environment
where it ran as 'dirty' and do not continue running other tests in it.
Testcases with a 'broken' quorum use negative scenarios, so I placed this
testcase at the end of the file to minimize its impact on other tests in case
of failure. The note was there just to warn others about the negative testcase.

> But here you do it 30 times, and it is not broken (I hope).
> So that statement wasn't true?
>
> Besides, I get
>
> No output during 10 seconds. Will abort after 120 seconds without output. List of workers not reporting the status:
> - 001_replication [replication/qsync_random_leader.test.lua, vinyl] at var/001_replication/qsync_random_leader.result:126
> - 002_replication [replication/qsync_random_leader.test.lua, memtx] at var/002_replication/qsync_random_leader.result:126
> No output during 20 seconds. Will abort after 120 seconds without output. List of workers not reporting the status:
> - 001_replication [replication/qsync_random_leader.test.lua, vinyl] at var/001_replication/qsync_random_leader.result:126
> - 002_replication [replication/qsync_random_leader.test.lua, memtx] at var/002_replication/qsync_random_leader.result:126
>
> on this wait_cond(). Does not look like it works.

The problem was in the value the condition compared: test_run:eval() wraps the
remote result into a list, so the condition matched a table against a number
and could never become true.
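In other words, a minimal sketch of the failure mode (the variable name is
mine, just for illustration):

    -- test_run:eval() returns the remote result wrapped in a list:
    res = test_run:eval('qsync1', "box.space.sync:count()")
    -- res is {30}, so `res == 30` is always false.
    -- The condition has to look at the first element instead:
    assert(res[1] == 30)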
> Also you don't check the space is full on other instances. Only on the
> current leader, which is not so interesting.

Replaced the single check with a loop over all instances:

-test_run:wait_cond(function() return test_run:eval('qsync1',                   \
-                   ("box.space.sync:count()")) == 30 end)
+for i=1,NUM_INSTANCES do                                                       \
+    instance = string.format('qsync%d', i)                                     \
+    test_run:wait_cond(function() return test_run:eval(instance,               \
+                       "box.space.sync:count()")[1] == 30 end)                 \
+end

>> + | ...
>> +
>> +-- Teardown.
>> +test_run:switch('default')
>> + | ---
>> + | - true
>> + | ...
>> +test_run:eval(SERVERS[current_leader_id], 'box.space.sync:drop()')
>> + | ---
>> + | - []
>> + | ...
>> +test_run:drop_cluster(SERVERS)
>> + | ---
>> + | ...
>> +box.cfg{                                                                       \
>> +    replication_synchro_quorum = orig_synchro_quorum,                          \
>> +    replication_synchro_timeout = orig_synchro_timeout,                        \
> 8. You didn't change these values in the 'default' instance. So after
> what do you restore them?

Removed these lines, as well as the lines before the test that saved the
original settings:

--- a/test/replication/qsync_random_leader.test.lua
+++ b/test/replication/qsync_random_leader.test.lua
@@ -5,18 +5,15 @@
 fiber = require('fiber')
 test_run = env.new()
 netbox = require('net.box')

-orig_synchro_quorum = box.cfg.replication_synchro_quorum
-orig_synchro_timeout = box.cfg.replication_synchro_timeout
-
 NUM_INSTANCES = 5
 SERVERS = {}
 for i=1,NUM_INSTANCES do                                                       \
     SERVERS[i] = 'qsync' .. i

 -- Teardown.
 test_run:switch('default')
 test_run:eval(SERVERS[current_leader_id], 'box.space.sync:drop()')
 test_run:drop_cluster(SERVERS)
-box.cfg{                                                                       \
-    replication_synchro_quorum = orig_synchro_quorum,                          \
-    replication_synchro_timeout = orig_synchro_timeout,                        \
-}
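P.S. A side thought on the random() helper discussed above: the recursive
retry is fine for a five-instance cluster, but it could also be written
without recursion. A sketch of an equivalent constant-time form, assuming
total > 1 and 1 <= excluded_num <= total (not part of the patch):

    function random(excluded_num, total)
        -- Draw from a range that is one element smaller, then shift the
        -- result past the excluded id; the distribution stays uniform.
        local r = math.random(1, total - 1)
        if r >= excluded_num then
            r = r + 1
        end
        return r
    end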