From: Sergey Bronnikov <sergeyb@tarantool.org>
To: Vladislav Shpilevoy, tarantool-patches@dev.tarantool.org
Date: Tue, 24 Nov 2020 11:39:58 +0300
Subject: Re: [Tarantool-patches] [PATCH 3/4] replication: add test with random leaders promotion and demotion

Thanks for the review!

On 21.11.2020 17:41, Vladislav Shpilevoy wrote:
> Thanks for the patch!
>
> See 8 comments below.
>
> On 17.11.2020 17:13, sergeyb@tarantool.org wrote:
>> From: Sergey Bronnikov <sergeyb@tarantool.org>
>>
>> Part of #5055
>> Part of #5144
>> ---
>>  test/replication/qsync.lua                    |  31 ++++
>>  test/replication/qsync1.lua                   |   1 +
>>  test/replication/qsync2.lua                   |   1 +
>>  test/replication/qsync3.lua                   |   1 +
>>  test/replication/qsync4.lua                   |   1 +
>>  test/replication/qsync5.lua                   |   1 +
>>  test/replication/qsync_random_leader.result   | 148 ++++++++++++++++++
>>  test/replication/qsync_random_leader.test.lua |  76 +++++++++
>>  8 files changed, 260 insertions(+)
>>  create mode 100644 test/replication/qsync.lua
>>  create mode 120000 test/replication/qsync1.lua
>>  create mode 120000 test/replication/qsync2.lua
>>  create mode 120000 test/replication/qsync3.lua
>>  create mode 120000 test/replication/qsync4.lua
>>  create mode 120000 test/replication/qsync5.lua
>>  create mode 100644 test/replication/qsync_random_leader.result
>>  create mode 100644 test/replication/qsync_random_leader.test.lua
>>
>> diff --git a/test/replication/qsync.lua b/test/replication/qsync.lua
>> new file mode 100644
>> index 000000000..9bbc87239
>> --- /dev/null
>> +++ b/test/replication/qsync.lua
>> @@ -0,0 +1,31 @@
>> +#!/usr/bin/env tarantool
>> +
>> +-- get instance name from filename (qsync1.lua => qsync1)
>> +local INSTANCE_ID = string.match(arg[0], "%d")
>> +
>> +local SOCKET_DIR = require('fio').cwd()
>> +
>> +local function instance_uri(instance_id)
>> +    return SOCKET_DIR..'/qsync'..instance_id..'.sock';
> 1. ';' is not needed here.

Removed:

 local function instance_uri(instance_id)
-    return SOCKET_DIR..'/qsync'..instance_id..'.sock';
+    return SOCKET_DIR..'/qsync'..instance_id..'.sock'
 end

>> +end
>> +
>> +-- start console first
> 2. Why? The comment narrates the code, but does not say why
> it is done so.

Updated the comment: the console is required to keep the instance running.

>> +require('console').listen(os.getenv('ADMIN'))
>> +
>> +box.cfg({
>> +    listen = instance_uri(INSTANCE_ID);
> 3. Please, don't use ';' as a separator. We use ',' in the
> new code.
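Side note: Lua itself accepts both ',' and ';' as field separators in a table
constructor, so this is purely a style convention. A tiny illustration (the
variable names are mine, not from the patch):

    t1 = {1; 2; 3}  -- valid Lua, but the older style used in this file
    t2 = {1, 2, 3}  -- the separator preferred in new code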
Fixed:

 box.cfg({
-    listen = instance_uri(INSTANCE_ID);
+    listen = instance_uri(INSTANCE_ID),
     replication = {
-        instance_uri(1);
-        instance_uri(2);
-        instance_uri(3);
-        instance_uri(4);
-        instance_uri(5);
-    };
-    replication_synchro_timeout = 1000;
-    replication_synchro_quorum = 5;
-    read_only = false;
+        instance_uri(1),
+        instance_uri(2),
+        instance_uri(3),
+        instance_uri(4),
+        instance_uri(5),
+    },
+    replication_synchro_timeout = 1000,
+    replication_synchro_quorum = 5,
+    read_only = false,
 })

>> +    replication = {
>> +        instance_uri(1);
>> +        instance_uri(2);
>> +        instance_uri(3);
>> +        instance_uri(4);
>> +        instance_uri(5);
>> +    };
>> +    replication_synchro_timeout = 1000;
>> +    replication_synchro_quorum = 5;
>> +    read_only = false;
>> +})
>> +
>> +box.once("bootstrap", function()
>> +    box.schema.user.grant("guest", 'replication')
>> +end)
>> diff --git a/test/replication/qsync_random_leader.result b/test/replication/qsync_random_leader.result
>> new file mode 100644
>> index 000000000..2b2df99db
>> --- /dev/null
>> +++ b/test/replication/qsync_random_leader.result
>> @@ -0,0 +1,148 @@
>> +-- test-run result file version 2
>> +os = require('os')
>> + | ---
>> + | ...
>> +env = require('test_run')
>> + | ---
>> + | ...
>> +math = require('math')
>> + | ---
>> + | ...
>> +fiber = require('fiber')
>> + | ---
>> + | ...
>> +test_run = env.new()
>> + | ---
>> + | ...
>> +netbox = require('net.box')
>> + | ---
>> + | ...
>> +
>> +orig_synchro_quorum = box.cfg.replication_synchro_quorum
>> + | ---
>> + | ...
>> +orig_synchro_timeout = box.cfg.replication_synchro_timeout
>> + | ---
>> + | ...
>> +
>> +NUM_INSTANCES = 5
>> + | ---
>> + | ...
>> +SERVERS = {}
>> + | ---
>> + | ...
>> +for i=1,NUM_INSTANCES do                                                       \
> 4. Please, use whitespaces. Here are some: '    ',
> take them.

Vim doesn't show any tabs before '\'. Anyway, I have updated the indentation
before '\' to use whitespaces.

>> +    SERVERS[i] = 'qsync' .. i                                                  \
>> +end;
> 5. ';' is not needed.

Removed.

>> + | ---
>> + | ...
>> +SERVERS -- print instance names
>> + | ---
>> + | - - qsync1
>> + |   - qsync2
>> + |   - qsync3
>> + |   - qsync4
>> + |   - qsync5
>> + | ...
>> +
>> +math.randomseed(os.time())
>> + | ---
>> + | ...
>> +random = function(excluded_num, total)                                         \
> 6. Why not 'function random(...)'? Why do you need to assign
> it to a variable via '='?

Because, to my taste, this form is more convenient to read. Converted it to
"function random(...)" anyway.

>> +    local r = math.random(1, total)                                            \
>> +    if (r == excluded_num) then                                                \
>> +        return random(excluded_num, total)                                     \
>> +    end                                                                        \
>> +    return r                                                                   \
>> +end
>> + | ---
>> + | ...
>> +
>> +-- Set 'broken' quorum on current leader.
>> +-- Write value on current leader.
>> +-- Pick a random replica in a cluster.
>> +-- Set 'good' quorum on it and promote to a leader.
>> +-- Make sure value is there and on an old leader.
>> +
>> +-- Testcase setup.
>> +test_run:create_cluster(SERVERS)
>> + | ---
>> + | ...
>> +test_run:wait_fullmesh(SERVERS)
>> + | ---
>> + | ...
>> +test_run:switch('qsync1')
>> + | ---
>> + | - true
>> + | ...
>> +_ = box.schema.space.create('sync', {is_sync=true, engine = test_run:get_cfg('engine')})
>> + | ---
>> + | ...
>> +_ = box.space.sync:create_index('primary')
>> + | ---
>> + | ...
>> +box.schema.user.grant('guest', 'write', 'space', 'sync')
>> + | ---
>> + | ...
>> +test_run:switch('default')
>> + | ---
>> + | - true
>> + | ...
>> +current_leader_id = 1
>> + | ---
>> + | ...
>> +test_run:eval(SERVERS[current_leader_id], "box.ctl.clear_synchro_queue()")
>> + | ---
>> + | - []
>> + | ...
>> +
>> +SOCKET_DIR = require('fio').cwd()
>> + | ---
>> + | ...
>> +
>> +-- Testcase body.
>> +for i=1,30 do                                                                  \
>> +    test_run:eval(SERVERS[current_leader_id],                                  \
>> +        "box.cfg{replication_synchro_quorum=6, replication_synchro_timeout=1000}") \
>> +    c = netbox.connect(SOCKET_DIR..'/'..SERVERS[current_leader_id]..'.sock')   \
>> +    fiber.create(function() c.space.sync:insert{i} end)                        \
>> +    new_leader_id = random(current_leader_id, #SERVERS)                        \
>> +    test_run:eval(SERVERS[new_leader_id],                                      \
>> +        "box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.01}") \
>> +    test_run:eval(SERVERS[new_leader_id], "box.ctl.clear_synchro_queue()")     \
>> +    c:close()                                                                  \
>> +    replica = random(new_leader_id, #SERVERS)                                  \
>> +    test_run:wait_cond(function() return test_run:eval(SERVERS[replica],       \
>> +        string.format("box.space.sync:get{%d}", i))[1] ~= nil end)             \
>> +    test_run:wait_cond(function() return test_run:eval(SERVERS[current_leader_id], \
>> +        string.format("box.space.sync:get{%d}", i))[1] ~= nil end)             \
>> +    new_leader_id = random(current_leader_id, #SERVERS)                        \
>> +    current_leader_id = new_leader_id                                          \
>> +end
>> + | ---
>> + | ...
>> +
>> +test_run:wait_cond(function() return test_run:eval('qsync1',                   \
>> +                   ("box.space.sync:count()")) == 30 end)
>> + | ---
>> + | - false
> 7. I am not sure it is supposed to be false. If it should be -
> I don't understand why. Could you elaborate please?

No, it obviously should be true here; I missed the reference value for that
statement in the .result file. wait_cond() was broken here because
test_run:eval() returns a list, not a number. Fixed it:

 test_run:wait_cond(function() return test_run:eval('qsync1',                   \
-                   ("box.space.sync:count()")) == 30 end)
+                   ("box.space.sync:count()"))[1] == 30 end)
  | ---
- | - false
+ | - true
  | ...

> Also in the previous commit in the end of a test doing exactly
> the same, but for leader-replica config, you wrote:
>
> Note: cluster may be in a broken state here due to nature of previous test.

You got it wrong. The sense of my note is the following: testcases should be
isolated from each other, and if a testcase fails, we consider the environment
where it ran as 'dirty' and do not continue running other tests in it.
Testcases with a 'broken' quorum use negative scenarios, so I placed this
testcase at the end of the file to minimize its impact on other tests in case
of failure. The note was there just to warn others about the negative testcase.

> But here you do it 30 times, and it is not broken (I hope).
> So that statement wasn't true?
>
> Besides, I get
>
> No output during 10 seconds. Will abort after 120 seconds without output. List of workers not reporting the status:
> - 001_replication [replication/qsync_random_leader.test.lua, vinyl] at var/001_replication/qsync_random_leader.result:126
> - 002_replication [replication/qsync_random_leader.test.lua, memtx] at var/002_replication/qsync_random_leader.result:126
> No output during 20 seconds. Will abort after 120 seconds without output. List of workers not reporting the status:
> - 001_replication [replication/qsync_random_leader.test.lua, vinyl] at var/001_replication/qsync_random_leader.result:126
> - 002_replication [replication/qsync_random_leader.test.lua, memtx] at var/002_replication/qsync_random_leader.result:126
>
> on this wait_cond(). Does not look like it works.

The problem was in the value the condition compared: test_run:eval() wraps the
remote result into a list, so the condition matched a table against a number
and could never become true.
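In other words, a minimal sketch of the failure mode (the variable name is
mine, just for illustration):

    -- test_run:eval() returns the remote result wrapped in a list:
    res = test_run:eval('qsync1', "box.space.sync:count()")
    -- res is {30}, so `res == 30` is always false.
    -- The condition has to look at the first element instead:
    assert(res[1] == 30)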
> Also you don't check the space is full on other instances. Only on the
> current leader, which is not so interesting.

Replaced the single check with a loop over all instances:

-test_run:wait_cond(function() return test_run:eval('qsync1',                   \
-                   ("box.space.sync:count()")) == 30 end)
+for i=1,NUM_INSTANCES do                                                       \
+    instance = string.format('qsync%d', i)                                     \
+    test_run:wait_cond(function() return test_run:eval(instance,               \
+                       "box.space.sync:count()")[1] == 30 end)                 \
+end

>> + | ...
>> +
>> +-- Teardown.
>> +test_run:switch('default')
>> + | ---
>> + | - true
>> + | ...
>> +test_run:eval(SERVERS[current_leader_id], 'box.space.sync:drop()')
>> + | ---
>> + | - []
>> + | ...
>> +test_run:drop_cluster(SERVERS)
>> + | ---
>> + | ...
>> +box.cfg{                                                                       \
>> +    replication_synchro_quorum = orig_synchro_quorum,                          \
>> +    replication_synchro_timeout = orig_synchro_timeout,                        \
> 8. You didn't change these values in the 'default' instance. So after
> what do you restore them?

Removed these lines, as well as the lines before the test that saved the
original settings:

--- a/test/replication/qsync_random_leader.test.lua
+++ b/test/replication/qsync_random_leader.test.lua
@@ -5,18 +5,15 @@
 fiber = require('fiber')
 test_run = env.new()
 netbox = require('net.box')

-orig_synchro_quorum = box.cfg.replication_synchro_quorum
-orig_synchro_timeout = box.cfg.replication_synchro_timeout
-
 NUM_INSTANCES = 5
 SERVERS = {}
 for i=1,NUM_INSTANCES do                                                       \
     SERVERS[i] = 'qsync' .. i

 -- Teardown.
 test_run:switch('default')
 test_run:eval(SERVERS[current_leader_id], 'box.space.sync:drop()')
 test_run:drop_cluster(SERVERS)
-box.cfg{                                                                       \
-    replication_synchro_quorum = orig_synchro_quorum,                          \
-    replication_synchro_timeout = orig_synchro_timeout,                        \
-}
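P.S. A side thought on the random() helper discussed above: the recursive
retry is fine for a five-instance cluster, but it could also be written
without recursion. A sketch of an equivalent constant-time form, assuming
total > 1 and 1 <= excluded_num <= total (not part of the patch):

    function random(excluded_num, total)
        -- Draw from a range that is one element smaller, then shift the
        -- result past the excluded id; the distribution stays uniform.
        local r = math.random(1, total - 1)
        if r >= excluded_num then
            r = r + 1
        end
        return r
    end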