Tarantool development patches archive
 help / color / mirror / Atom feed
* [Tarantool-patches] [PATCH] test: fix flaky socket test
@ 2019-12-04 13:43 Ilya Kosarev
  2019-12-05 21:31 ` Vladislav Shpilevoy
  0 siblings, 1 reply; 9+ messages in thread
From: Ilya Kosarev @ 2019-12-04 13:43 UTC (permalink / raw)
  To: tarantool-patches; +Cc: v.shpilevoy

socket.test had a number of flaky problems:
- socket readableness expectation
- race conditions on socket shutdown in emulation test cases
- tcp_server stability in socket receive inconsistent behavior case
Now they are solved. Port randomization is improved.
Socket test is not fragile anymore.

Closes #4451, #4426, #4469
---
Branch: https://github.com/tarantool/tarantool/tree/i.kosarev/gh-4426-4451-fix-socket-test
Issues: https://github.com/tarantool/tarantool/issues/4426
        https://github.com/tarantool/tarantool/issues/4451
        https://github.com/tarantool/tarantool/issues/4469

 test/app/socket.result   | 84 +++++++++++++++++++++++++---------------
 test/app/socket.test.lua | 60 +++++++++++++++++++---------
 test/app/suite.ini       |  1 -
 3 files changed, 94 insertions(+), 51 deletions(-)

diff --git a/test/app/socket.result b/test/app/socket.result
index fd299424c9..42dde8f375 100644
--- a/test/app/socket.result
+++ b/test/app/socket.result
@@ -45,6 +45,9 @@ test_run:cmd("push filter '(error: .builtin/.*[.]lua):[0-9]+' to '\\1'")
 WAIT_COND_TIME = 10
 ---
 ...
+WAIT_TCP_CONNECT_TIME = 240
+---
+...
 socket('PF_INET', 'SOCK_STREAM', 'tcp121222');
 ---
 - null
@@ -107,7 +110,7 @@ s:nonblock(true)
 ---
 - true
 ...
-s:readable(.1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -227,7 +230,7 @@ s:syswrite(ffi.cast('const char *', ping), #ping)
 ---
 - 6
 ...
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -830,7 +833,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world')
 ---
 - 12
 ...
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -842,7 +845,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world, 2')
 ---
 - 15
 ...
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -898,7 +901,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, World!')
 ---
 - 13
 ...
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -1074,6 +1077,15 @@ master:setsockopt('SOL_SOCKET', 'SO_REUSEADDR', true)
 ---
 - true
 ...
+seed = ''
+---
+...
+for d in string.gmatch(box.info.uuid, '%d') do  seed = seed .. d end
+---
+...
+math.randomseed(tonumber(seed))
+---
+...
 port = 32768 + math.random(0, 32767)
 ---
 ...
@@ -1092,7 +1104,7 @@ test_run:wait_cond(function()
         return false, master:error()
     end
     return true
-end, WAIT_COND_TIME);
+end, 100);
 ---
 - true
 ...
@@ -1822,8 +1834,14 @@ test_run:cmd("setopt delimiter ';'")
 ---
 - true
 ...
+socket_opened = true
 cfiber = fiber.create(function(sc, rch, wch)
-    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
+    while socket_opened do
+        sc:send(wch:get())
+        local data = sc:receive("*l")
+        if not socket_opened then sc:close() end
+        rch:put(data)
+    end
 end, sc, rch, wch);
 ---
 ...
@@ -1936,6 +1954,9 @@ c:receive("*l")
 ---
 - 
 ...
+socket_opened = false
+---
+...
 wch:put("Fu")
 ---
 - true
@@ -1944,10 +1965,6 @@ c:send("354 Please type your message\n")
 ---
 - 29
 ...
-sc:close()
----
-- 1
-...
 c:receive("*l", "Line: ")
 ---
 - null
@@ -2816,7 +2833,7 @@ test_run:cmd("clear filter")
 ---
 - true
 ...
--- case: sicket receive inconsistent behavior
+-- case: socket receive inconsistent behavior
 chan = fiber.channel()
 ---
 ...
@@ -2826,41 +2843,46 @@ counter = 0
 fn = function(s) counter = 0; while true do s:write((tostring(counter)):rep(chan:get())); counter = counter + 1 end end
 ---
 ...
-srv = socket.tcp_server('0.0.0.0', 8888, fn)
+srv = nil
 ---
 ...
-s = socket.connect('localhost', 8888)
+test_run:cmd("setopt delimiter ';'")
 ---
+- true
 ...
-chan:put(5)
+test_run:wait_cond(function()
+    port = 32768 + math.random(0, 32767)
+    srv = socket.tcp_server('0.0.0.0', port, fn)
+    return srv ~= nil
+end, 100);
 ---
 - true
 ...
-chan:put(5)
+receive1 = nil; receive2 = nil;
 ---
-- true
 ...
-s:receive(5)
+if srv ~= nil then
+    s = socket.connect('localhost', port)
+    chan:put(5)
+    chan:put(5)
+    receive1 = s:receive(5)
+    chan:put(5)
+    s:settimeout(1)
+    receive2 = s:receive('*a')
+    s:close()
+    srv:close()
+end;
 ---
-- '00000'
 ...
-chan:put(5)
+test_run:cmd("setopt delimiter ''");
 ---
 - true
 ...
-s:settimeout(1)
+receive1
 ---
-- 1
+- '00000'
 ...
-s:receive('*a')
+receive2
 ---
 - '1111122222'
 ...
-s:close()
----
-- 1
-...
-srv:close()
----
-- true
-...
diff --git a/test/app/socket.test.lua b/test/app/socket.test.lua
index c72d41763f..64f1e07471 100644
--- a/test/app/socket.test.lua
+++ b/test/app/socket.test.lua
@@ -14,6 +14,7 @@ test_run = env.new()
 test_run:cmd("push filter '(error: .builtin/.*[.]lua):[0-9]+' to '\\1'")
 
 WAIT_COND_TIME = 10
+WAIT_TCP_CONNECT_TIME = 240
 
 socket('PF_INET', 'SOCK_STREAM', 'tcp121222');
 
@@ -39,7 +40,7 @@ s:nonblock(false)
 s:nonblock()
 s:nonblock(true)
 
-s:readable(.1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:wait(.1)
 socket.iowait(s:fd(), 'RW')
 socket.iowait(s:fd(), 3)
@@ -75,7 +76,7 @@ ping = msgpack.encode(string.len(ping)) .. ping
 
 -- test syswrite with char *
 s:syswrite(ffi.cast('const char *', ping), #ping)
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:wait(.01)
 
 pong = s:sysread()
@@ -266,11 +267,11 @@ s = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 s:bind('127.0.0.1', 0)
 sc = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 sc:sendto('127.0.0.1', s:name().port, 'Hello, world')
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:recv()
 
 sc:sendto('127.0.0.1', s:name().port, 'Hello, world, 2')
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 d, from = s:recvfrom()
 from.port > 0
 from.port = 'Random port'
@@ -286,7 +287,7 @@ sc = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 sc:nonblock(true)
 sc:sendto('127.0.0.1', s:name().port)
 sc:sendto('127.0.0.1', s:name().port, 'Hello, World!')
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 data, from = s:recvfrom(10)
 data
 s:sendto(from.host, from.port, 'Hello, hello!')
@@ -343,6 +344,9 @@ s = nil
 -- random port
 master = socket('PF_INET', 'SOCK_STREAM', 'tcp')
 master:setsockopt('SOL_SOCKET', 'SO_REUSEADDR', true)
+seed = ''
+for d in string.gmatch(box.info.uuid, '%d') do  seed = seed .. d end
+math.randomseed(tonumber(seed))
 port = 32768 + math.random(0, 32767)
 -- SO_REUSEADDR allows to bind to the same source addr:port twice,
 -- so listen() can return EADDRINUSE and so we check it within
@@ -356,7 +360,7 @@ test_run:wait_cond(function()
         return false, master:error()
     end
     return true
-end, WAIT_COND_TIME);
+end, 100);
 function gh361()
     local s = socket('PF_INET', 'SOCK_STREAM', 'tcp')
     s:sysconnect('127.0.0.1', port)
@@ -619,8 +623,14 @@ s:settimeout(100500)
 rch, wch = fiber.channel(1), fiber.channel(1)
 sc = socket.connect(host, port)
 test_run:cmd("setopt delimiter ';'")
+socket_opened = true
 cfiber = fiber.create(function(sc, rch, wch)
-    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
+    while socket_opened do
+        sc:send(wch:get())
+        local data = sc:receive("*l")
+        if not socket_opened then sc:close() end
+        rch:put(data)
+    end
 end, sc, rch, wch);
 test_run:cmd("setopt delimiter ''");
 
@@ -651,9 +661,9 @@ rch:get()
 wch:put("DATA\n")
 c:receive(4)
 c:receive("*l")
+socket_opened = false
 wch:put("Fu")
 c:send("354 Please type your message\n")
-sc:close()
 c:receive("*l", "Line: ")
 c:receive()
 c:receive(10)
@@ -960,18 +970,30 @@ server:close()
 
 test_run:cmd("clear filter")
 
--- case: sicket receive inconsistent behavior
+-- case: socket receive inconsistent behavior
 chan = fiber.channel()
 counter = 0
 fn = function(s) counter = 0; while true do s:write((tostring(counter)):rep(chan:get())); counter = counter + 1 end end
-srv = socket.tcp_server('0.0.0.0', 8888, fn)
-s = socket.connect('localhost', 8888)
-chan:put(5)
-chan:put(5)
-s:receive(5)
-chan:put(5)
-s:settimeout(1)
-s:receive('*a')
-s:close()
-srv:close()
+srv = nil
+test_run:cmd("setopt delimiter ';'")
+test_run:wait_cond(function()
+    port = 32768 + math.random(0, 32767)
+    srv = socket.tcp_server('0.0.0.0', port, fn)
+    return srv ~= nil
+end, 100);
+receive1 = nil; receive2 = nil;
+if srv ~= nil then
+    s = socket.connect('localhost', port)
+    chan:put(5)
+    chan:put(5)
+    receive1 = s:receive(5)
+    chan:put(5)
+    s:settimeout(1)
+    receive2 = s:receive('*a')
+    s:close()
+    srv:close()
+end;
+test_run:cmd("setopt delimiter ''");
+receive1
+receive2
 
diff --git a/test/app/suite.ini b/test/app/suite.ini
index 79432e29a7..dd802d98cf 100644
--- a/test/app/suite.ini
+++ b/test/app/suite.ini
@@ -7,4 +7,3 @@ use_unix_sockets = True
 use_unix_sockets_iproto = True
 is_parallel = True
 pretest_clean = True
-fragile = socket.test.lua ; gh-4426 gh-4451
-- 
2.17.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Tarantool-patches] [PATCH] test: fix flaky socket test
  2019-12-04 13:43 [Tarantool-patches] [PATCH] test: fix flaky socket test Ilya Kosarev
@ 2019-12-05 21:31 ` Vladislav Shpilevoy
  2019-12-06 16:01   ` Ilya Kosarev
  0 siblings, 1 reply; 9+ messages in thread
From: Vladislav Shpilevoy @ 2019-12-05 21:31 UTC (permalink / raw)
  To: Ilya Kosarev, tarantool-patches

Hi! Thanks for the patch!

See 7 comments below.

On 04/12/2019 14:43, Ilya Kosarev wrote:
> socket.test had a number of flaky problems:
> - socket readableness expectation
> - race conditions on socket shutdown in emulation test cases
> - tcp_server stability in socket receive inconsistent behavior case
> Now they are solved. Port randomization is improved.
> Socket test is not fragile anymore.
> 
> Closes #4451, #4426, #4469
> ---
> Branch: https://github.com/tarantool/tarantool/tree/i.kosarev/gh-4426-4451-fix-socket-test
> Issues: https://github.com/tarantool/tarantool/issues/4426
>         https://github.com/tarantool/tarantool/issues/4451
>         https://github.com/tarantool/tarantool/issues/4469
> 
>  test/app/socket.result   | 84 +++++++++++++++++++++++++---------------
>  test/app/socket.test.lua | 60 +++++++++++++++++++---------
>  test/app/suite.ini       |  1 -
>  3 files changed, 94 insertions(+), 51 deletions(-)
> 
> diff --git a/test/app/socket.result b/test/app/socket.result
> index fd299424c9..42dde8f375 100644
> --- a/test/app/socket.result
> +++ b/test/app/socket.result
> @@ -45,6 +45,9 @@ test_run:cmd("push filter '(error: .builtin/.*[.]lua):[0-9]+' to '\\1'")
>  WAIT_COND_TIME = 10

1. This is now unused.

>  ---
>  ...
> +WAIT_TCP_CONNECT_TIME = 240
> +---
> +...
>  socket('PF_INET', 'SOCK_STREAM', 'tcp121222');
>  ---
>  - null
> @@ -107,7 +110,7 @@ s:nonblock(true)
>  ---
>  - true
>  ...
> -s:readable(.1)
> +s:readable(WAIT_TCP_CONNECT_TIME)
>  ---
>  - true
>  ...
> @@ -227,7 +230,7 @@ s:syswrite(ffi.cast('const char *', ping), #ping)
>  ---
>  - 6
>  ...
> -s:readable(1)
> +s:readable(WAIT_TCP_CONNECT_TIME)

2. I understand, why it is 'connect time' in the previous
hunk. But here the socket is already connected. So it is
not connect time.

On the other hand, there are places right after connect,
which don't use this timeout. For example, line 457
in socket.result file:

    sc:sysconnect('127.0.0.1', s:name().port) or errno() == errno.EINPROGRESS
    ---
    - true
    ...
    sc:writable(10)
    ---
    - true
    ...

Here it is 10 seconds, while in other places you started
using 240.

>  ---
>  - true
>  ...
> @@ -830,7 +833,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world')
>  ---
>  - 12
>  ...
> -s:readable(10)
> +s:readable(WAIT_TCP_CONNECT_TIME)
>  ---
>  - true
>  ...

3. There is no connect time. This is DGRAM socket with UDP.
No connections.

The same below. Moreover, there are now 2 waits for
'connect time' on the same socket.

> @@ -842,7 +845,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world, 2')
>  ---
>  - 15
>  ...
> -s:readable(10)
> +s:readable(WAIT_TCP_CONNECT_TIME)
>  ---
>  - true
>  ...
> @@ -898,7 +901,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, World!')
>  ---
>  - 13
>  ...
> -s:readable(1)
> +s:readable(WAIT_TCP_CONNECT_TIME)
>  ---
>  - true
>  ...
> @@ -1074,6 +1077,15 @@ master:setsockopt('SOL_SOCKET', 'SO_REUSEADDR', true)
>  ---
>  - true
>  ...
> +seed = ''
> +---
> +...
> +for d in string.gmatch(box.info.uuid, '%d') do  seed = seed .. d end

4. Well, this is probably the most intricate way to
get a seed. Have you considered fiber.time()? And why do
you need it? Isn't it initialized already?

> +---
> +...
> +math.randomseed(tonumber(seed))
> +---
> +...
>  port = 32768 + math.random(0, 32767)
>  ---
>  ...
> @@ -1092,7 +1104,7 @@ test_run:wait_cond(function()
>          return false, master:error()
>      end
>      return true
> -end, WAIT_COND_TIME);
> +end, 100);

5. Why? 10 seconds was not enough to try a few random
ports?

>  ---
>  - true
>  ...
> @@ -1822,8 +1834,14 @@ test_run:cmd("setopt delimiter ';'")
>  ---
>  - true
>  ...
> +socket_opened = true
>  cfiber = fiber.create(function(sc, rch, wch)
> -    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
> +    while socket_opened do
> +        sc:send(wch:get())
> +        local data = sc:receive("*l")
> +        if not socket_opened then sc:close() end
> +        rch:put(data)
> +    end
>  end, sc, rch, wch);

6. What was a problem here? In the commit message you said
'race conditions'. Where? You just moved socket close to
the background fiber from the main fiber.

>  ---
>  ...
> @@ -1936,6 +1954,9 @@ c:receive("*l")
>  ---
>  - 
>  ...
> +socket_opened = false
> +---
> +...
>  wch:put("Fu")
>  ---
>  - true
> @@ -1944,10 +1965,6 @@ c:send("354 Please type your message\n")
>  ---
>  - 29
>  ...
> -sc:close()
> ----
> -- 1
> -...
>  c:receive("*l", "Line: ")
>  ---
>  - null
> @@ -2816,7 +2833,7 @@ test_run:cmd("clear filter")
>  ---
>  - true
>  ...
> --- case: sicket receive inconsistent behavior
> +-- case: socket receive inconsistent behavior
>  chan = fiber.channel()
>  ---
>  ...
> @@ -2826,41 +2843,46 @@ counter = 0
>  fn = function(s) counter = 0; while true do s:write((tostring(counter)):rep(chan:get())); counter = counter + 1 end end
>  ---
>  ...
> -srv = socket.tcp_server('0.0.0.0', 8888, fn)
> +srv = nil
>  ---
>  ...
> -s = socket.connect('localhost', 8888)
> +test_run:cmd("setopt delimiter ';'")
>  ---
> +- true
>  ...
> -chan:put(5)
> +test_run:wait_cond(function()
> +    port = 32768 + math.random(0, 32767)
> +    srv = socket.tcp_server('0.0.0.0', port, fn)
> +    return srv ~= nil
> +end, 100);
>  ---
>  - true
>  ...
> -chan:put(5)
> +receive1 = nil; receive2 = nil;
>  ---
> -- true
>  ...
> -s:receive(5)
> +if srv ~= nil then
> +    s = socket.connect('localhost', port)
> +    chan:put(5)
> +    chan:put(5)
> +    receive1 = s:receive(5)
> +    chan:put(5)
> +    s:settimeout(1)
> +    receive2 = s:receive('*a')
> +    s:close()
> +    srv:close()
> +end;

7. How is it possible, that the server still is nil here?
At least one of 32k ports should have been free.

>  ---
> -- '00000'
>  ...
> -chan:put(5)
> +test_run:cmd("setopt delimiter ''");
>  ---
>  - true
>  ...
> -s:settimeout(1)
> +receive1
>  ---
> -- 1
> +- '00000'
>  ...
> -s:receive('*a')
> +receive2
>  ---
>  - '1111122222'
>  ...
> -s:close()
> ----
> -- 1
> -...
> -srv:close()
> ----
> -- true
> -...

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Tarantool-patches] [PATCH] test: fix flaky socket test
  2019-12-05 21:31 ` Vladislav Shpilevoy
@ 2019-12-06 16:01   ` Ilya Kosarev
  2019-12-08 15:52     ` Vladislav Shpilevoy
  0 siblings, 1 reply; 9+ messages in thread
From: Ilya Kosarev @ 2019-12-06 16:01 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

[-- Attachment #1: Type: text/plain, Size: 7266 bytes --]


Hi!

Thanks for your review.

4. I run socket.test to simulate "high load" in parallel with itself
(while ./test-run.py -j20 `for r in {1..64} ; do echo socket ; done` 2>/dev/null ; do l=$(($l+1)) ; echo ======== $l ============= ; done).
Using fiber.time() as random seed leads to the same results produced
on parallel tasks. Using instance uuid as a seed seems to be a good
decision.
5. Well, considering the new seed generation, 10s seem to be enough.
6. If we close socket in the main fiber, we might attempt to use closed
socket in background fiber.
7. Of course it shouldn't be nil and test will fail in this case. We
just want to be sure that test won't hang even if server is still nil.
Otherwise we will hang on chan:put(), which is not nice. Sent v2 considering mentioned drawbacks.

>Пятница,  6 декабря 2019, 0:31 +03:00 от Vladislav Shpilevoy < v.shpilevoy@tarantool.org >:
>
>Hi! Thanks for the patch!
>
>See 7 comments below.
>
>On 04/12/2019 14:43, Ilya Kosarev wrote:
>> socket.test had a number of flaky problems:
>> - socket readableness expectation
>> - race conditions on socket shutdown in emulation test cases
>> - tcp_server stability in socket receive inconsistent behavior case
>> Now they are solved. Port randomization is improved.
>> Socket test is not fragile anymore.
>> 
>> Closes #4451, #4426, #4469
>> ---
>> Branch:  https://github.com/tarantool/tarantool/tree/i.kosarev/gh-4426-4451-fix-socket-test
>> Issues:  https://github.com/tarantool/tarantool/issues/4426
>>  https://github.com/tarantool/tarantool/issues/4451
>>  https://github.com/tarantool/tarantool/issues/4469
>> 
>>  test/app/socket.result   | 84 +++++++++++++++++++++++++---------------
>>  test/app/socket.test.lua | 60 +++++++++++++++++++---------
>>  test/app/suite.ini       |  1 -
>>  3 files changed, 94 insertions(+), 51 deletions(-)
>> 
>> diff --git a/test/app/socket.result b/test/app/socket.result
>> index fd299424c9..42dde8f375 100644
>> --- a/test/app/socket.result
>> +++ b/test/app/socket.result
>> @@ -45,6 +45,9 @@ test_run:cmd("push filter '(error: .builtin/.*[.]lua):[0-9]+' to '\\1'")
>>  WAIT_COND_TIME = 10
>1. This is now unused.
>
>>  ---
>>  ...
>> +WAIT_TCP_CONNECT_TIME = 240
>> +---
>> +...
>>  socket('PF_INET', 'SOCK_STREAM', 'tcp121222');
>>  ---
>>  - null
>> @@ -107,7 +110,7 @@ s:nonblock(true)
>>  ---
>>  - true
>>  ...
>> -s:readable(.1)
>> +s:readable(WAIT_TCP_CONNECT_TIME)
>>  ---
>>  - true
>>  ...
>> @@ -227,7 +230,7 @@ s:syswrite(ffi.cast('const char *', ping), #ping)
>>  ---
>>  - 6
>>  ...
>> -s:readable(1)
>> +s:readable(WAIT_TCP_CONNECT_TIME)
>
>2. I understand, why it is 'connect time' in the previous
>hunk. But here the socket is already connected. So it is
>not connect time.
>
>On the other hand, there are places right after connect,
>which don't use this timeout. For example, line 457
>in socket.result file:
>
>    sc:sysconnect('127.0.0.1', s:name().port) or errno() == errno.EINPROGRESS
>    ---
>    - true
>    ...
>    sc:writable(10)
>    ---
>    - true
>    ...
>
>Here it is 10 seconds, while in other places you started
>using 240.
>
>>  ---
>>  - true
>>  ...
>> @@ -830,7 +833,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world')
>>  ---
>>  - 12
>>  ...
>> -s:readable(10)
>> +s:readable(WAIT_TCP_CONNECT_TIME)
>>  ---
>>  - true
>>  ...
>
>3. There is no connect time. This is DGRAM socket with UDP.
>No connections.
>
>The same below. Moreover, there are now 2 waits for
>'connect time' on the same socket.
>
>> @@ -842,7 +845,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world, 2')
>>  ---
>>  - 15
>>  ...
>> -s:readable(10)
>> +s:readable(WAIT_TCP_CONNECT_TIME)
>>  ---
>>  - true
>>  ...
>> @@ -898,7 +901,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, World!')
>>  ---
>>  - 13
>>  ...
>> -s:readable(1)
>> +s:readable(WAIT_TCP_CONNECT_TIME)
>>  ---
>>  - true
>>  ...
>> @@ -1074,6 +1077,15 @@ master:setsockopt('SOL_SOCKET', 'SO_REUSEADDR', true)
>>  ---
>>  - true
>>  ...
>> +seed = ''
>> +---
>> +...
>> +for d in string.gmatch(box.info.uuid, '%d') do  seed = seed .. d end
>
>4. Well, this is probably the most intricate way to
>get a seed. Have you considered fiber.time()? And why do
>you need it? Isn't it initialized already?
>
>> +---
>> +...
>> +math.randomseed(tonumber(seed))
>> +---
>> +...
>>  port = 32768 + math.random(0, 32767)
>>  ---
>>  ...
>> @@ -1092,7 +1104,7 @@ test_run:wait_cond(function()
>>          return false, master:error()
>>      end
>>      return true
>> -end, WAIT_COND_TIME);
>> +end, 100);
>
>5. Why? 10 seconds was not enough to try a few random
>ports?
>
>>  ---
>>  - true
>>  ...
>> @@ -1822,8 +1834,14 @@ test_run:cmd("setopt delimiter ';'")
>>  ---
>>  - true
>>  ...
>> +socket_opened = true
>>  cfiber = fiber.create(function(sc, rch, wch)
>> -    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
>> +    while socket_opened do
>> +        sc:send(wch:get())
>> +        local data = sc:receive("*l")
>> +        if not socket_opened then sc:close() end
>> +        rch:put(data)
>> +    end
>>  end, sc, rch, wch);
>
>6. What was a problem here? In the commit message you said
>'race conditions'. Where? You just moved socket close to
>the background fiber from the main fiber.
>
>>  ---
>>  ...
>> @@ -1936,6 +1954,9 @@ c:receive("*l")
>>  ---
>>  - 
>>  ...
>> +socket_opened = false
>> +---
>> +...
>>  wch:put("Fu")
>>  ---
>>  - true
>> @@ -1944,10 +1965,6 @@ c:send("354 Please type your message\n")
>>  ---
>>  - 29
>>  ...
>> -sc:close()
>> ----
>> -- 1
>> -...
>>  c:receive("*l", "Line: ")
>>  ---
>>  - null
>> @@ -2816,7 +2833,7 @@ test_run:cmd("clear filter")
>>  ---
>>  - true
>>  ...
>> --- case: sicket receive inconsistent behavior
>> +-- case: socket receive inconsistent behavior
>>  chan = fiber.channel()
>>  ---
>>  ...
>> @@ -2826,41 +2843,46 @@ counter = 0
>>  fn = function(s) counter = 0; while true do s:write((tostring(counter)):rep(chan:get())); counter = counter + 1 end end
>>  ---
>>  ...
>> -srv = socket.tcp_server('0.0.0.0', 8888, fn)
>> +srv = nil
>>  ---
>>  ...
>> -s = socket.connect('localhost', 8888)
>> +test_run:cmd("setopt delimiter ';'")
>>  ---
>> +- true
>>  ...
>> -chan:put(5)
>> +test_run:wait_cond(function()
>> +    port = 32768 + math.random(0, 32767)
>> +    srv = socket.tcp_server('0.0.0.0', port, fn)
>> +    return srv ~= nil
>> +end, 100);
>>  ---
>>  - true
>>  ...
>> -chan:put(5)
>> +receive1 = nil; receive2 = nil;
>>  ---
>> -- true
>>  ...
>> -s:receive(5)
>> +if srv ~= nil then
>> +    s = socket.connect('localhost', port)
>> +    chan:put(5)
>> +    chan:put(5)
>> +    receive1 = s:receive(5)
>> +    chan:put(5)
>> +    s:settimeout(1)
>> +    receive2 = s:receive('*a')
>> +    s:close()
>> +    srv:close()
>> +end;
>
>7. How is it possible, that the server still is nil here?
>At least one of 32k ports should have been free.
>
>>  ---
>> -- '00000'
>>  ...
>> -chan:put(5)
>> +test_run:cmd("setopt delimiter ''");
>>  ---
>>  - true
>>  ...
>> -s:settimeout(1)
>> +receive1
>>  ---
>> -- 1
>> +- '00000'
>>  ...
>> -s:receive('*a')
>> +receive2
>>  ---
>>  - '1111122222'
>>  ...
>> -s:close()
>> ----
>> -- 1
>> -...
>> -srv:close()
>> ----
>> -- true
>> -...


-- 
Ilya Kosarev

[-- Attachment #2: Type: text/html, Size: 10714 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Tarantool-patches] [PATCH] test: fix flaky socket test
  2019-12-06 16:01   ` Ilya Kosarev
@ 2019-12-08 15:52     ` Vladislav Shpilevoy
  2019-12-10 14:36       ` Ilya Kosarev
  0 siblings, 1 reply; 9+ messages in thread
From: Vladislav Shpilevoy @ 2019-12-08 15:52 UTC (permalink / raw)
  To: Ilya Kosarev; +Cc: tarantool-patches

Hi! Thanks for the patch!

Please, keep my comments near your answers. Because
I don't remember all my comments, and for each your
answer I need to lookup my original email to remember,
what was the problem.

This is usually simpler both to you and a reviewer,
when your answers are inlined into the original email.

Also don't drop the comments. Here is my comment 2:

>> 2. I understand, why it is 'connect time' in the previous
>> hunk. But here the socket is already connected. So it is
>> not connect time.
>> 
>> On the other hand, there are places right after connect,
>> which don't use this timeout. For example, line 457
>> in socket.result file:
>> 
>>     sc:sysconnect('127.0.0.1', s:name().port) or errno() == errno.EINPROGRESS
>>     ---
>>     - true
>>     ...
>>     sc:writable(10)
>>     ---
>>     - true
>>     ...
>> 
>> Here it is 10 seconds, while in other places you started
>> using 240.

And it is still actual. Why is it called 'CONNECT_TIME' if
it has nothing to do with connect in most of the places?
The problem is in the name. I would introduce one timeout
for conds, connects, readable, etc. You even already have
it - COND_TIME.

On 06/12/2019 17:01, Ilya Kosarev wrote:
> Hi!
> 
> Thanks for your review.
> 
> 4. I run socket.test to simulate "high load" in parallel with itself
> (while ./test-run.py -j20 `for r in {1..64} ; do echo socket ; done` 2>/dev/null ; do l=$(($l+1)) ; echo ======== $l ============= ; done).

I tried this example on your branch and got that:

=========================================================================
[009] --- app/socket.result	Sun Dec  8 15:40:58 2019
[009] +++ app/socket.reject	Sun Dec  8 16:09:23 2019
[009] @@ -2670,13 +2670,17 @@
[009]  -- case: recvfrom reads first 512 bytes from the message with tcp
[009]  received_message, from = receiving_socket:recvfrom()
[009]  ---
[009] +- error: '[string "received_message, from = receiving_socket:rec..."]:1: attempt to
[009] +    index global ''receiving_socket'' (a nil value)'
[009]  ...
[009]  e = receiving_socket:errno()
[009]  ---
[009] +- error: '[string "e = receiving_socket:errno() "]:1: attempt to index global ''receiving_socket''
[009] +    (a nil value)'
[009]  ...
[009]  received_message == message:sub(1, 512) -- expected true
[009]  ---
[009] -- true
[009] +- false
[009]  ...
[009]  received_message:len() == 512 -- expected true
[009]  ---
[009] @@ -2684,7 +2688,7 @@
[009]  ...
[009]  received_message
[009]  ---
[009] -- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
[009] +- yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
[009]  ...
[009]  received_message:len()
[009]  ---
[009] @@ -2692,11 +2696,13 @@
[009]  ...
[009]  from == nil -- expected true
[009]  ---
[009] -- true
[009] +- false
[009]  ...
[009]  from
[009]  ---
[009] -- null
[009] +- host: 127.0.0.1
[009] +  family: AF_INET
[009] +  port: 57721
[009]  ...
[009]  e == 0 -- expected true
[009]  ---
[009] @@ -2709,33 +2715,39 @@
[009]  -- case: recvfrom does not discard the message tail with tcp
[009]  received_message, from = receiving_socket:recvfrom()
[009]  ---
[009] +- error: '[string "received_message, from = receiving_socket:rec..."]:1: attempt to
[009] +    index global ''receiving_socket'' (a nil value)'
[009]  ...
[009]  e = receiving_socket:errno()
[009]  ---
[009] +- error: '[string "e = receiving_socket:errno() "]:1: attempt to index global ''receiving_socket''
[009] +    (a nil value)'
[009]  ...
[009]  received_message == message:sub(513, 513) -- expected true
[009]  ---
[009] -- true
[009] +- false
[009]  ...
[009]  received_message:len() == 1 -- expected true
[009]  ---
[009] -- true
[009] +- false
[009]  ...
[009]  received_message
[009]  ---
[009] -- x
[009] +- yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
[009]  ...
[009]  received_message:len()
[009]  ---
[009] -- 1
[009] +- 512
[009]  ...
[009]  from == nil -- expected true
[009]  ---
[009] -- true
[009] +- false
[009]  ...
[009]  from
[009]  ---
[009] -- null
[009] +- host: 127.0.0.1
[009] +  family: AF_INET
[009] +  port: 57721
[009]  ...
[009]  e == 0 -- expected true
[009]  ---
[009] @@ -2747,7 +2759,8 @@
[009]  ...
[009]  receiving_socket:close()
[009]  ---
[009] -- true
[009] +- error: '[string "return receiving_socket:close() "]:1: attempt to index global ''receiving_socket''
[009] +    (a nil value)'
[009]  ...
[009]  sending_socket:close()
[009]  ---
[009] 
=========================================================================

Sometimes I get this:

=========================================================================
[008] app/socket.test.lua                                             [ fail ]
[008] 
[008] Test failed! Result content mismatch:
[008] --- app/socket.result	Sun Dec  8 15:40:58 2019
[008] +++ app/socket.reject	Sun Dec  8 16:11:27 2019
[008] @@ -2340,23 +2340,24 @@
[008]  ...
[008]  received_message == '' -- expected true
[008]  ---
[008] -- true
[008] +- false
[008]  ...
[008]  received_message
[008]  ---
[008] -- 
[008] +- null
[008]  ...
[008]  from ~= nil -- expected true
[008]  ---
[008] -- true
[008] +- false
[008]  ...
[008]  from.host == '127.0.0.1' -- expected true
[008]  ---
[008] -- true
[008] +- error: '[string "return from.host == ''127.0.0.1'' -- expected t..."]:1: attempt
[008] +    to index global ''from'' (a nil value)'
[008]  ...
[008]  e == 0 -- expected true
[008]  ---
[008] -- true
[008] +- false
[008]  ...
[008]  -- case: recv, no datagram, explicit size
[008]  received_message = receiving_socket:recv(512)
[008] @@ -2386,23 +2387,25 @@
[008]  ...
[008]  received_message == nil -- expected true
[008]  ---
[008] -- true
[008] +- false
[008]  ...
[008]  received_message
[008]  ---
[008] -- null
[008] +- 
[008]  ...
[008]  from == nil -- expected true
[008]  ---
[008] -- true
[008] +- false
[008]  ...
[008]  from
[008]  ---
[008] -- null
[008] +- host: 127.0.0.1
[008] +  family: AF_INET
[008] +  port: 59867
[008]  ...
[008]  e == errno.EAGAIN -- expected true
[008]  ---
[008] -- true
[008] +- false
[008]  ...
[008]  -- case: recv, zero datagram, explicit size
[008]  sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
=========================================================================

So socket.test.lua still is unstable. The command with 64
socket.test.lua gives me an error in 100% cases.

> Using fiber.time() as random seed leads to the same results produced
> on parallel tasks. Using instance uuid as a seed seems to be a good
> decision.

I thought that it is strange, because UUIDs are also generated
randomly, and by the same random generator as is used in Lua.
But now I see why math.random() gives you the same numbers
even though UUIDs are different: Lua math.random() has own
random number generator. Not from the standard library.

> 5. Well, considering the new seed generation, 10s seem to be enough.

So you could skip this strange extraction of a number from
UUID, but increase the timeout? I vote for it. Because the
failing example with 64 instances of socket.test.lua is very
artificial, and even then it will work in case the timeout
is big enough, right? UUID parsing adds unnecessary complexity
and raises questions like I asked.

> 6. If we close socket in the main fiber, we might attempt to use closed
> socket in background fiber.

And what? We don't care anymore for the background
fiber after the socket is not needed anymore. Ok, it will
stumble into errors in that fiber, but how does it affect
further execution of the test?

> 
> 7. Of course it shouldn't be nil and test will fail in this case. We
> just want to be sure that test won't hang even if server is still nil.
> Otherwise we will hang on chan:put(), which is not nice.

If it will always be nil in case of an error, we won't notice the
problem. Please, drop it. Increase a timeout somewhere if necessary,
but consideration of a test passed, if in fact it didn't, is worse.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Tarantool-patches] [PATCH] test: fix flaky socket test
  2019-12-08 15:52     ` Vladislav Shpilevoy
@ 2019-12-10 14:36       ` Ilya Kosarev
  2019-12-17  0:03         ` Vladislav Shpilevoy
  0 siblings, 1 reply; 9+ messages in thread
From: Ilya Kosarev @ 2019-12-10 14:36 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

[-- Attachment #1: Type: text/plain, Size: 10736 bytes --]

Hi!

Thanks for the comments!

5 answers are below.

Sent v3 considering discussed questions.

>Воскресенье,  8 декабря 2019, 18:52 +03:00 от Vladislav Shpilevoy <v.shpilevoy@tarantool.org>:
>
>Hi! Thanks for the patch!
>
>Please, keep my comments near your answers. Because
>I don't remember all my comments, and for each your
>answer I need to lookup my original email to remember,
>what was the problem.
>
>This is usually simpler both to you and a reviewer,
>when your answers are inlined into the original email.
>
>Also don't drop the comments. Here is my comment 2:
>
>>> 2. I understand, why it is 'connect time' in the previous
>>> hunk. But here the socket is already connected. So it is
>>> not connect time.
>>> 
>>> On the other hand, there are places right after connect,
>>> which don't use this timeout. For example, line 457
>>> in socket.result file:
>>> 
>>>     sc:sysconnect('127.0.0.1', s:name().port) or errno() == errno.EINPROGRESS
>>>     ---
>>>     - true
>>>     ...
>>>     sc:writable(10)
>>>     ---
>>>     - true
>>>     ...
>>> 
>>> Here it is 10 seconds, while in other places you started
>>> using 240.
>And it is still actual. Why is it called 'CONNECT_TIME' if
>it has nothing to do with connect in most of the places?
>The problem is in the name. I would introduce one timeout
>for conds, connects, readable, etc. You even already have
>it - COND_TIME. 
2. Ok, I see the problem here. I will carefully reconsider such
places to make them consistent.
>
>
>On 06/12/2019 17:01, Ilya Kosarev wrote:
>> Hi!
>> 
>> Thanks for your review.
>> 
>> 4. I run socket.test to simulate "high load" in parallel with itself
>> (while ./test-run.py -j20 `for r in {1..64} ; do echo socket ; done` 2>/dev/null ; do l=$(($l+1)) ; echo ======== $l ============= ; done).
>
>I tried this example on your branch and got that:
>
>=========================================================================
>[009] --- app/socket.result	Sun Dec  8 15:40:58 2019
>[009] +++ app/socket.reject	Sun Dec  8 16:09:23 2019
>[009] @@ -2670,13 +2670,17 @@
>[009]  -- case: recvfrom reads first 512 bytes from the message with tcp
>[009]  received_message, from = receiving_socket:recvfrom()
>[009]  ---
>[009] +- error: '[string "received_message, from = receiving_socket:rec..."]:1: attempt to
>[009] +    index global ''receiving_socket'' (a nil value)'
>[009]  ...
>[009]  e = receiving_socket:errno()
>[009]  ---
>[009] +- error: '[string "e = receiving_socket:errno() "]:1: attempt to index global ''receiving_socket''
>[009] +    (a nil value)'
>[009]  ...
>[009]  received_message == message:sub(1, 512) -- expected true
>[009]  ---
>[009] -- true
>[009] +- false
>[009]  ...
>[009]  received_message:len() == 512 -- expected true
>[009]  ---
>[009] @@ -2684,7 +2688,7 @@
>[009]  ...
>[009]  received_message
>[009]  ---
>[009] -- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
>[009] +- yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
>[009]  ...
>[009]  received_message:len()
>[009]  ---
>[009] @@ -2692,11 +2696,13 @@
>[009]  ...
>[009]  from == nil -- expected true
>[009]  ---
>[009] -- true
>[009] +- false
>[009]  ...
>[009]  from
>[009]  ---
>[009] -- null
>[009] +- host: 127.0.0.1
>[009] +  family: AF_INET
>[009] +  port: 57721
>[009]  ...
>[009]  e == 0 -- expected true
>[009]  ---
>[009] @@ -2709,33 +2715,39 @@
>[009]  -- case: recvfrom does not discard the message tail with tcp
>[009]  received_message, from = receiving_socket:recvfrom()
>[009]  ---
>[009] +- error: '[string "received_message, from = receiving_socket:rec..."]:1: attempt to
>[009] +    index global ''receiving_socket'' (a nil value)'
>[009]  ...
>[009]  e = receiving_socket:errno()
>[009]  ---
>[009] +- error: '[string "e = receiving_socket:errno() "]:1: attempt to index global ''receiving_socket''
>[009] +    (a nil value)'
>[009]  ...
>[009]  received_message == message:sub(513, 513) -- expected true
>[009]  ---
>[009] -- true
>[009] +- false
>[009]  ...
>[009]  received_message:len() == 1 -- expected true
>[009]  ---
>[009] -- true
>[009] +- false
>[009]  ...
>[009]  received_message
>[009]  ---
>[009] -- x
>[009] +- yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
>[009]  ...
>[009]  received_message:len()
>[009]  ---
>[009] -- 1
>[009] +- 512
>[009]  ...
>[009]  from == nil -- expected true
>[009]  ---
>[009] -- true
>[009] +- false
>[009]  ...
>[009]  from
>[009]  ---
>[009] -- null
>[009] +- host: 127.0.0.1
>[009] +  family: AF_INET
>[009] +  port: 57721
>[009]  ...
>[009]  e == 0 -- expected true
>[009]  ---
>[009] @@ -2747,7 +2759,8 @@
>[009]  ...
>[009]  receiving_socket:close()
>[009]  ---
>[009] -- true
>[009] +- error: '[string "return receiving_socket:close() "]:1: attempt to index global ''receiving_socket''
>[009] +    (a nil value)'
>[009]  ...
>[009]  sending_socket:close()
>[009]  ---
>[009] 
>=========================================================================
>
>Sometimes I get this:
>
>=========================================================================
>[008] app/socket.test.lua                                             [ fail ]
>[008] 
>[008] Test failed! Result content mismatch:
>[008] --- app/socket.result	Sun Dec  8 15:40:58 2019
>[008] +++ app/socket.reject	Sun Dec  8 16:11:27 2019
>[008] @@ -2340,23 +2340,24 @@
>[008]  ...
>[008]  received_message == '' -- expected true
>[008]  ---
>[008] -- true
>[008] +- false
>[008]  ...
>[008]  received_message
>[008]  ---
>[008] -- 
>[008] +- null
>[008]  ...
>[008]  from ~= nil -- expected true
>[008]  ---
>[008] -- true
>[008] +- false
>[008]  ...
>[008]  from.host == '127.0.0.1' -- expected true
>[008]  ---
>[008] -- true
>[008] +- error: '[string "return from.host == ''127.0.0.1'' -- expected t..."]:1: attempt
>[008] +    to index global ''from'' (a nil value)'
>[008]  ...
>[008]  e == 0 -- expected true
>[008]  ---
>[008] -- true
>[008] +- false
>[008]  ...
>[008]  -- case: recv, no datagram, explicit size
>[008]  received_message = receiving_socket:recv(512)
>[008] @@ -2386,23 +2387,25 @@
>[008]  ...
>[008]  received_message == nil -- expected true
>[008]  ---
>[008] -- true
>[008] +- false
>[008]  ...
>[008]  received_message
>[008]  ---
>[008] -- null
>[008] +- 
>[008]  ...
>[008]  from == nil -- expected true
>[008]  ---
>[008] -- true
>[008] +- false
>[008]  ...
>[008]  from
>[008]  ---
>[008] -- null
>[008] +- host: 127.0.0.1
>[008] +  family: AF_INET
>[008] +  port: 59867
>[008]  ...
>[008]  e == errno.EAGAIN -- expected true
>[008]  ---
>[008] -- true
>[008] +- false
>[008]  ...
>[008]  -- case: recv, zero datagram, explicit size
>[008]  sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
>=========================================================================
>
>So socket.test.lua still is unstable. The command with 64
>socket.test.lua gives me an error in 100% cases. 
4. Ok, I guess these are mac os specific issues. I haven't got mac os
machine access yet, however, I hope to get it soon and fix these problems.
I still tried to stabilize these cases; if it don't work, I will reconsider them later.
>
>
>> Using fiber.time() as random seed leads to the same results produced
>> on parallel tasks. Using instance uuid as a seed seems to be a good
>> decision.
>
>I thought that it is strange, because UUIDs are also generated
>randomly, and by the same random generator as is used in Lua.
>But now I see why math.random() gives you the same numbers
>even though UUIDs are different: Lua math.random() has own
>random number generator. Not from the standard library.
>
>> 5. Well, considering the new seed generation, 10s seem to be enough.
>
>So you could skip this strange extraction of a number from
>UUID, but increase the timeout? I vote for it. Because the
>failing example with 64 instances of socket.test.lua is very
>artificial, and even then it will work in case the timeout
>is big enough, right? UUID parsing adds unnecessary complexity
>and raises questions like I asked. 
5. Ok, I agree.
>
>
>> 6. If we close socket in the main fiber, we might attempt to use closed
>> socket in background fiber.
>
>And what? We don't care anymore for the background
>fiber after the socket is not needed anymore. Ok, it will
>stumble into errors in that fiber, but how does it affect
>further execution of the test? 
6. It affects further test execution, as far as it leads
to the following error:
[015]  c:receive("*l", "Line: ")
[015]  ---
[015]  - null
[015] -- closed
[015] -- 'Line: Fu'
[015] +- Connection reset by peer
>
>
>> 
>> 7. Of course it shouldn't be nil and test will fail in this case. We
>> just want to be sure that test won't hang even if server is still nil.
>> Otherwise we will hang on chan:put(), which is not nice.
>
>If it will always be nil in case of an error, we won't notice the
>problem. Please, drop it. Increase a timeout somewhere if necessary,
>but consideration of a test passed, if in fact it didn't, is worse. 
7. We don't consider the test passed because of this construction.
We will notice the problem, as far as  test_run:wait_cond()
will return false instead of true, and also  receive1 &  receive2
values will be nil instead of  '00000' &  '1111122222'.
We just don't want to hang on chan:put and wait for extra 120 sec.
for test to fail, that is why it is hidden under if clause.
>


-- 
Ilya Kosarev

[-- Attachment #2: Type: text/html, Size: 15239 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Tarantool-patches] [PATCH] test: fix flaky socket test
  2019-12-10 14:36       ` Ilya Kosarev
@ 2019-12-17  0:03         ` Vladislav Shpilevoy
  0 siblings, 0 replies; 9+ messages in thread
From: Vladislav Shpilevoy @ 2019-12-17  0:03 UTC (permalink / raw)
  To: Ilya Kosarev; +Cc: tarantool-patches

Hi! Thanks for the fixes!

>     [008] -- true
>     [008] +- false
>     [008] ...
>     [008] received_message
>     [008] ---
>     [008] -- null
>     [008] +-
>     [008] ...
>     [008] from == nil -- expected true
>     [008] ---
>     [008] -- true
>     [008] +- false
>     [008] ...
>     [008] from
>     [008] ---
>     [008] -- null
>     [008] +- host: 127.0.0.1
>     [008] + family: AF_INET
>     [008] + port: 59867
>     [008] ...
>     [008] e == errno.EAGAIN -- expected true
>     [008] ---
>     [008] -- true
>     [008] +- false
>     [008] ...
>     [008] -- case: recv, zero datagram, explicit size
>     [008] sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
>     =========================================================================
> 
>     So socket.test.lua still is unstable. The command with 64
>     socket.test.lua gives me an error in 100% cases.
> 
> 4. Ok, I guess these are mac os specific issues. I haven't got mac os
> machine access yet, however, I hope to get it soon and fix these problems.
> I still tried to stabilize these cases; if it don't work, I will reconsider them later.
> 

These are UDP issues, not Mac. See my commit with review fixes for details.

>     > 6. If we close socket in the main fiber, we might attempt to use closed
>     > socket in background fiber.
> 
>     And what? We don't care anymore for the background
>     fiber after the socket is not needed anymore. Ok, it will
>     stumble into errors in that fiber, but how does it affect
>     further execution of the test?
> 
> 6. It affects further test execution, as far as it leads
> to the following error:
> 
> |[015] c:receive("*l", "Line: ") [015] --- [015] - null [015] -- closed [015] -- 'Line: Fu' [015] +- Connection reset by peer|
> 
> 

Yeah, I've seen that error. But what I wanted to see here is why
that error happens. Not just what happens. But why.

I did investigation, and found that here:

    wch:put("Fu")
    c:send("354 Please type your message\n")
    sc:close()
    c:receive("*l", "Line: ")

it was possible, that put(), send() and close() finish
before the worker fiber manages to send anything to the
client. I've tried to find a better fix, but all of them
after all are about having a flag in the worker fiber,
so your version is ok.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Tarantool-patches] [PATCH] test: fix flaky socket test
@ 2019-11-26 21:45 Ilya Kosarev
  0 siblings, 0 replies; 9+ messages in thread
From: Ilya Kosarev @ 2019-11-26 21:45 UTC (permalink / raw)
  To: tarantool-patches

socket.test had a number of flaky problems:
- socket readableness expectation
- race conditions on socket shutdown in emulation test cases
- tcp_server stability in socket receive inconsistent behavior case
Now they are solved. Port randomization is improved.
Socket test is not fragile anymore.

Closes #4451, #4426, #4469
---
Branch: https://github.com/tarantool/tarantool/tree/i.kosarev/gh-4426-4451-fix-socket-test
Issues: https://github.com/tarantool/tarantool/issues/4426
        https://github.com/tarantool/tarantool/issues/4451
        https://github.com/tarantool/tarantool/issues/4469

 test/app/socket.result   | 84 +++++++++++++++++++++++++---------------
 test/app/socket.test.lua | 60 +++++++++++++++++++---------
 test/app/suite.ini       |  1 -
 3 files changed, 94 insertions(+), 51 deletions(-)

diff --git a/test/app/socket.result b/test/app/socket.result
index fd299424c9..42dde8f375 100644
--- a/test/app/socket.result
+++ b/test/app/socket.result
@@ -45,6 +45,9 @@ test_run:cmd("push filter '(error: .builtin/.*[.]lua):[0-9]+' to '\\1'")
 WAIT_COND_TIME = 10
 ---
 ...
+WAIT_TCP_CONNECT_TIME = 240
+---
+...
 socket('PF_INET', 'SOCK_STREAM', 'tcp121222');
 ---
 - null
@@ -107,7 +110,7 @@ s:nonblock(true)
 ---
 - true
 ...
-s:readable(.1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -227,7 +230,7 @@ s:syswrite(ffi.cast('const char *', ping), #ping)
 ---
 - 6
 ...
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -830,7 +833,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world')
 ---
 - 12
 ...
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -842,7 +845,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world, 2')
 ---
 - 15
 ...
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -898,7 +901,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, World!')
 ---
 - 13
 ...
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -1074,6 +1077,15 @@ master:setsockopt('SOL_SOCKET', 'SO_REUSEADDR', true)
 ---
 - true
 ...
+seed = ''
+---
+...
+for d in string.gmatch(box.info.uuid, '%d') do  seed = seed .. d end
+---
+...
+math.randomseed(tonumber(seed))
+---
+...
 port = 32768 + math.random(0, 32767)
 ---
 ...
@@ -1092,7 +1104,7 @@ test_run:wait_cond(function()
         return false, master:error()
     end
     return true
-end, WAIT_COND_TIME);
+end, 100);
 ---
 - true
 ...
@@ -1822,8 +1834,14 @@ test_run:cmd("setopt delimiter ';'")
 ---
 - true
 ...
+socket_opened = true
 cfiber = fiber.create(function(sc, rch, wch)
-    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
+    while socket_opened do
+        sc:send(wch:get())
+        local data = sc:receive("*l")
+        if not socket_opened then sc:close() end
+        rch:put(data)
+    end
 end, sc, rch, wch);
 ---
 ...
@@ -1936,6 +1954,9 @@ c:receive("*l")
 ---
 - 
 ...
+socket_opened = false
+---
+...
 wch:put("Fu")
 ---
 - true
@@ -1944,10 +1965,6 @@ c:send("354 Please type your message\n")
 ---
 - 29
 ...
-sc:close()
----
-- 1
-...
 c:receive("*l", "Line: ")
 ---
 - null
@@ -2816,7 +2833,7 @@ test_run:cmd("clear filter")
 ---
 - true
 ...
--- case: sicket receive inconsistent behavior
+-- case: socket receive inconsistent behavior
 chan = fiber.channel()
 ---
 ...
@@ -2826,41 +2843,46 @@ counter = 0
 fn = function(s) counter = 0; while true do s:write((tostring(counter)):rep(chan:get())); counter = counter + 1 end end
 ---
 ...
-srv = socket.tcp_server('0.0.0.0', 8888, fn)
+srv = nil
 ---
 ...
-s = socket.connect('localhost', 8888)
+test_run:cmd("setopt delimiter ';'")
 ---
+- true
 ...
-chan:put(5)
+test_run:wait_cond(function()
+    port = 32768 + math.random(0, 32767)
+    srv = socket.tcp_server('0.0.0.0', port, fn)
+    return srv ~= nil
+end, 100);
 ---
 - true
 ...
-chan:put(5)
+receive1 = nil; receive2 = nil;
 ---
-- true
 ...
-s:receive(5)
+if srv ~= nil then
+    s = socket.connect('localhost', port)
+    chan:put(5)
+    chan:put(5)
+    receive1 = s:receive(5)
+    chan:put(5)
+    s:settimeout(1)
+    receive2 = s:receive('*a')
+    s:close()
+    srv:close()
+end;
 ---
-- '00000'
 ...
-chan:put(5)
+test_run:cmd("setopt delimiter ''");
 ---
 - true
 ...
-s:settimeout(1)
+receive1
 ---
-- 1
+- '00000'
 ...
-s:receive('*a')
+receive2
 ---
 - '1111122222'
 ...
-s:close()
----
-- 1
-...
-srv:close()
----
-- true
-...
diff --git a/test/app/socket.test.lua b/test/app/socket.test.lua
index c72d41763f..64f1e07471 100644
--- a/test/app/socket.test.lua
+++ b/test/app/socket.test.lua
@@ -14,6 +14,7 @@ test_run = env.new()
 test_run:cmd("push filter '(error: .builtin/.*[.]lua):[0-9]+' to '\\1'")
 
 WAIT_COND_TIME = 10
+WAIT_TCP_CONNECT_TIME = 240
 
 socket('PF_INET', 'SOCK_STREAM', 'tcp121222');
 
@@ -39,7 +40,7 @@ s:nonblock(false)
 s:nonblock()
 s:nonblock(true)
 
-s:readable(.1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:wait(.1)
 socket.iowait(s:fd(), 'RW')
 socket.iowait(s:fd(), 3)
@@ -75,7 +76,7 @@ ping = msgpack.encode(string.len(ping)) .. ping
 
 -- test syswrite with char *
 s:syswrite(ffi.cast('const char *', ping), #ping)
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:wait(.01)
 
 pong = s:sysread()
@@ -266,11 +267,11 @@ s = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 s:bind('127.0.0.1', 0)
 sc = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 sc:sendto('127.0.0.1', s:name().port, 'Hello, world')
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:recv()
 
 sc:sendto('127.0.0.1', s:name().port, 'Hello, world, 2')
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 d, from = s:recvfrom()
 from.port > 0
 from.port = 'Random port'
@@ -286,7 +287,7 @@ sc = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 sc:nonblock(true)
 sc:sendto('127.0.0.1', s:name().port)
 sc:sendto('127.0.0.1', s:name().port, 'Hello, World!')
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 data, from = s:recvfrom(10)
 data
 s:sendto(from.host, from.port, 'Hello, hello!')
@@ -343,6 +344,9 @@ s = nil
 -- random port
 master = socket('PF_INET', 'SOCK_STREAM', 'tcp')
 master:setsockopt('SOL_SOCKET', 'SO_REUSEADDR', true)
+seed = ''
+for d in string.gmatch(box.info.uuid, '%d') do  seed = seed .. d end
+math.randomseed(tonumber(seed))
 port = 32768 + math.random(0, 32767)
 -- SO_REUSEADDR allows to bind to the same source addr:port twice,
 -- so listen() can return EADDRINUSE and so we check it within
@@ -356,7 +360,7 @@ test_run:wait_cond(function()
         return false, master:error()
     end
     return true
-end, WAIT_COND_TIME);
+end, 100);
 function gh361()
     local s = socket('PF_INET', 'SOCK_STREAM', 'tcp')
     s:sysconnect('127.0.0.1', port)
@@ -619,8 +623,14 @@ s:settimeout(100500)
 rch, wch = fiber.channel(1), fiber.channel(1)
 sc = socket.connect(host, port)
 test_run:cmd("setopt delimiter ';'")
+socket_opened = true
 cfiber = fiber.create(function(sc, rch, wch)
-    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
+    while socket_opened do
+        sc:send(wch:get())
+        local data = sc:receive("*l")
+        if not socket_opened then sc:close() end
+        rch:put(data)
+    end
 end, sc, rch, wch);
 test_run:cmd("setopt delimiter ''");
 
@@ -651,9 +661,9 @@ rch:get()
 wch:put("DATA\n")
 c:receive(4)
 c:receive("*l")
+socket_opened = false
 wch:put("Fu")
 c:send("354 Please type your message\n")
-sc:close()
 c:receive("*l", "Line: ")
 c:receive()
 c:receive(10)
@@ -960,18 +970,30 @@ server:close()
 
 test_run:cmd("clear filter")
 
--- case: sicket receive inconsistent behavior
+-- case: socket receive inconsistent behavior
 chan = fiber.channel()
 counter = 0
 fn = function(s) counter = 0; while true do s:write((tostring(counter)):rep(chan:get())); counter = counter + 1 end end
-srv = socket.tcp_server('0.0.0.0', 8888, fn)
-s = socket.connect('localhost', 8888)
-chan:put(5)
-chan:put(5)
-s:receive(5)
-chan:put(5)
-s:settimeout(1)
-s:receive('*a')
-s:close()
-srv:close()
+srv = nil
+test_run:cmd("setopt delimiter ';'")
+test_run:wait_cond(function()
+    port = 32768 + math.random(0, 32767)
+    srv = socket.tcp_server('0.0.0.0', port, fn)
+    return srv ~= nil
+end, 100);
+receive1 = nil; receive2 = nil;
+if srv ~= nil then
+    s = socket.connect('localhost', port)
+    chan:put(5)
+    chan:put(5)
+    receive1 = s:receive(5)
+    chan:put(5)
+    s:settimeout(1)
+    receive2 = s:receive('*a')
+    s:close()
+    srv:close()
+end;
+test_run:cmd("setopt delimiter ''");
+receive1
+receive2
 
diff --git a/test/app/suite.ini b/test/app/suite.ini
index 79432e29a7..dd802d98cf 100644
--- a/test/app/suite.ini
+++ b/test/app/suite.ini
@@ -7,4 +7,3 @@ use_unix_sockets = True
 use_unix_sockets_iproto = True
 is_parallel = True
 pretest_clean = True
-fragile = socket.test.lua ; gh-4426 gh-4451
-- 
2.17.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Tarantool-patches] [PATCH] test: fix flaky socket test
@ 2019-11-26  1:19 Ilya Kosarev
  0 siblings, 0 replies; 9+ messages in thread
From: Ilya Kosarev @ 2019-11-26  1:19 UTC (permalink / raw)
  To: tarantool-patches

socket.test had a number of flaky problems:
- socket readableness expectation
- race conditions on socket shutdown in emulation test cases
- tcp_server stability in socket receive inconsistent behavior case
Now they are solved. Socket test is not fragile anymore.

Closes #4451, #4426, #4469
---
Branch: https://github.com/tarantool/tarantool/tree/i.kosarev/gh-4426-4451-fix-socket-test
Issues: https://github.com/tarantool/tarantool/issues/4426
        https://github.com/tarantool/tarantool/issues/4451
        https://github.com/tarantool/tarantool/issues/4469

 test/app/socket.result   | 52 +++++++++++++++++++++++++++++-----------
 test/app/socket.test.lua | 34 +++++++++++++++++---------
 test/app/suite.ini       |  1 -
 3 files changed, 61 insertions(+), 26 deletions(-)

diff --git a/test/app/socket.result b/test/app/socket.result
index fd299424c..d6250a796 100644
--- a/test/app/socket.result
+++ b/test/app/socket.result
@@ -45,6 +45,9 @@ test_run:cmd("push filter '(error: .builtin/.*[.]lua):[0-9]+' to '\\1'")
 WAIT_COND_TIME = 10
 ---
 ...
+WAIT_TCP_CONNECT_TIME = 240
+---
+...
 socket('PF_INET', 'SOCK_STREAM', 'tcp121222');
 ---
 - null
@@ -107,7 +110,7 @@ s:nonblock(true)
 ---
 - true
 ...
-s:readable(.1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -227,7 +230,7 @@ s:syswrite(ffi.cast('const char *', ping), #ping)
 ---
 - 6
 ...
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -830,7 +833,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world')
 ---
 - 12
 ...
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -842,7 +845,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, world, 2')
 ---
 - 15
 ...
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -898,7 +901,7 @@ sc:sendto('127.0.0.1', s:name().port, 'Hello, World!')
 ---
 - 13
 ...
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 ---
 - true
 ...
@@ -1092,7 +1095,7 @@ test_run:wait_cond(function()
         return false, master:error()
     end
     return true
-end, WAIT_COND_TIME);
+end, 100);
 ---
 - true
 ...
@@ -1822,8 +1825,14 @@ test_run:cmd("setopt delimiter ';'")
 ---
 - true
 ...
+socket_opened = true
 cfiber = fiber.create(function(sc, rch, wch)
-    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
+    while socket_opened do
+        sc:send(wch:get())
+        local data = sc:receive("*l")
+        if not socket_opened then sc:close() end
+        rch:put(data)
+    end
 end, sc, rch, wch);
 ---
 ...
@@ -1936,6 +1945,9 @@ c:receive("*l")
 ---
 - 
 ...
+socket_opened = false
+---
+...
 wch:put("Fu")
 ---
 - true
@@ -1944,10 +1956,6 @@ c:send("354 Please type your message\n")
 ---
 - 29
 ...
-sc:close()
----
-- 1
-...
 c:receive("*l", "Line: ")
 ---
 - null
@@ -2816,20 +2824,36 @@ test_run:cmd("clear filter")
 ---
 - true
 ...
--- case: sicket receive inconsistent behavior
+-- case: socket receive inconsistent behavior
 chan = fiber.channel()
 ---
 ...
+seed = ''
+---
+...
+for d in string.gmatch(box.info.cluster.uuid, '%d') do  seed = seed .. d end
+---
+...
+math.randomseed(tonumber(seed))
+---
+...
+port = 32768 + math.random(0, 32767)
+---
+...
 counter = 0
 ---
 ...
 fn = function(s) counter = 0; while true do s:write((tostring(counter)):rep(chan:get())); counter = counter + 1 end end
 ---
 ...
-srv = socket.tcp_server('0.0.0.0', 8888, fn)
+srv = nil
 ---
 ...
-s = socket.connect('localhost', 8888)
+test_run:wait_cond(function() srv = socket.tcp_server('0.0.0.0', port, fn); return srv ~= nil end, 100)
+---
+- true
+...
+s = socket.connect('localhost', port)
 ---
 ...
 chan:put(5)
diff --git a/test/app/socket.test.lua b/test/app/socket.test.lua
index c72d41763..413d0b065 100644
--- a/test/app/socket.test.lua
+++ b/test/app/socket.test.lua
@@ -14,6 +14,7 @@ test_run = env.new()
 test_run:cmd("push filter '(error: .builtin/.*[.]lua):[0-9]+' to '\\1'")
 
 WAIT_COND_TIME = 10
+WAIT_TCP_CONNECT_TIME = 240
 
 socket('PF_INET', 'SOCK_STREAM', 'tcp121222');
 
@@ -39,7 +40,7 @@ s:nonblock(false)
 s:nonblock()
 s:nonblock(true)
 
-s:readable(.1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:wait(.1)
 socket.iowait(s:fd(), 'RW')
 socket.iowait(s:fd(), 3)
@@ -75,7 +76,7 @@ ping = msgpack.encode(string.len(ping)) .. ping
 
 -- test syswrite with char *
 s:syswrite(ffi.cast('const char *', ping), #ping)
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:wait(.01)
 
 pong = s:sysread()
@@ -266,11 +267,11 @@ s = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 s:bind('127.0.0.1', 0)
 sc = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 sc:sendto('127.0.0.1', s:name().port, 'Hello, world')
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 s:recv()
 
 sc:sendto('127.0.0.1', s:name().port, 'Hello, world, 2')
-s:readable(10)
+s:readable(WAIT_TCP_CONNECT_TIME)
 d, from = s:recvfrom()
 from.port > 0
 from.port = 'Random port'
@@ -286,7 +287,7 @@ sc = socket('AF_INET', 'SOCK_DGRAM', 'udp')
 sc:nonblock(true)
 sc:sendto('127.0.0.1', s:name().port)
 sc:sendto('127.0.0.1', s:name().port, 'Hello, World!')
-s:readable(1)
+s:readable(WAIT_TCP_CONNECT_TIME)
 data, from = s:recvfrom(10)
 data
 s:sendto(from.host, from.port, 'Hello, hello!')
@@ -356,7 +357,7 @@ test_run:wait_cond(function()
         return false, master:error()
     end
     return true
-end, WAIT_COND_TIME);
+end, 100);
 function gh361()
     local s = socket('PF_INET', 'SOCK_STREAM', 'tcp')
     s:sysconnect('127.0.0.1', port)
@@ -619,8 +620,14 @@ s:settimeout(100500)
 rch, wch = fiber.channel(1), fiber.channel(1)
 sc = socket.connect(host, port)
 test_run:cmd("setopt delimiter ';'")
+socket_opened = true
 cfiber = fiber.create(function(sc, rch, wch)
-    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
+    while socket_opened do
+        sc:send(wch:get())
+        local data = sc:receive("*l")
+        if not socket_opened then sc:close() end
+        rch:put(data)
+    end
 end, sc, rch, wch);
 test_run:cmd("setopt delimiter ''");
 
@@ -651,9 +658,9 @@ rch:get()
 wch:put("DATA\n")
 c:receive(4)
 c:receive("*l")
+socket_opened = false
 wch:put("Fu")
 c:send("354 Please type your message\n")
-sc:close()
 c:receive("*l", "Line: ")
 c:receive()
 c:receive(10)
@@ -960,12 +967,17 @@ server:close()
 
 test_run:cmd("clear filter")
 
--- case: sicket receive inconsistent behavior
+-- case: socket receive inconsistent behavior
 chan = fiber.channel()
+seed = ''
+for d in string.gmatch(box.info.cluster.uuid, '%d') do  seed = seed .. d end
+math.randomseed(tonumber(seed))
+port = 32768 + math.random(0, 32767)
 counter = 0
 fn = function(s) counter = 0; while true do s:write((tostring(counter)):rep(chan:get())); counter = counter + 1 end end
-srv = socket.tcp_server('0.0.0.0', 8888, fn)
-s = socket.connect('localhost', 8888)
+srv = nil
+test_run:wait_cond(function() srv = socket.tcp_server('0.0.0.0', port, fn); return srv ~= nil end, 100)
+s = socket.connect('localhost', port)
 chan:put(5)
 chan:put(5)
 s:receive(5)
diff --git a/test/app/suite.ini b/test/app/suite.ini
index 79432e29a..dd802d98c 100644
--- a/test/app/suite.ini
+++ b/test/app/suite.ini
@@ -7,4 +7,3 @@ use_unix_sockets = True
 use_unix_sockets_iproto = True
 is_parallel = True
 pretest_clean = True
-fragile = socket.test.lua ; gh-4426 gh-4451
-- 
2.17.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [tarantool-patches] [PATCH] test: fix flaky socket test
@ 2019-08-30 15:28 Ilya Kosarev
  0 siblings, 0 replies; 9+ messages in thread
From: Ilya Kosarev @ 2019-08-30 15:28 UTC (permalink / raw)
  To: tarantool-patches; +Cc: georgy, alexander.turenko, avtikhon, Ilya Kosarev

socket.test had a number of problems:
- racing condition on closing socket around line 640
- racing condition on waiting around line 42
- racing conditions on "receiving_socket:recv" around line 777
Now they are solved.

Closes #4451, #4426
---
Branch: https://github.com/tarantool/tarantool/tree/i.kosarev/gh-4426-4451-fix-socket-test
Issues: https://github.com/tarantool/tarantool/issues/4426 https://github.com/tarantool/tarantool/issues/4451

 test/app/socket.result   | 43 +++++++++++++++++++++++++++++++++-------
 test/app/socket.test.lua | 22 ++++++++++++++++----
 2 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/test/app/socket.result b/test/app/socket.result
index fd299424c..811f6b4e9 100644
--- a/test/app/socket.result
+++ b/test/app/socket.result
@@ -107,11 +107,11 @@ s:nonblock(true)
 ---
 - true
 ...
-s:readable(.1)
+s:readable(.5)
 ---
 - true
 ...
-s:wait(.1)
+s:wait(.5)
 ---
 - RW
 ...
@@ -1822,8 +1822,14 @@ test_run:cmd("setopt delimiter ';'")
 ---
 - true
 ...
+socket_opened = true
 cfiber = fiber.create(function(sc, rch, wch)
-    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
+    while socket_opened do
+        sc:send(wch:get())
+        local data = sc:receive("*l")
+        if not socket_opened then sc:close() end
+        rch:put(data)
+    end
 end, sc, rch, wch);
 ---
 ...
@@ -1936,6 +1942,9 @@ c:receive("*l")
 ---
 - 
 ...
+socket_opened = false
+---
+...
 wch:put("Fu")
 ---
 - true
@@ -1944,10 +1953,6 @@ c:send("354 Please type your message\n")
 ---
 - 29
 ...
-sc:close()
----
-- 1
-...
 c:receive("*l", "Line: ")
 ---
 - null
@@ -2292,6 +2297,9 @@ sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
 ---
 - 0
 ...
+fiber.yield()
+---
+...
 received_message = receiving_socket:recv()
 ---
 ...
@@ -2315,6 +2323,9 @@ sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
 ---
 - 0
 ...
+fiber.yield()
+---
+...
 received_message, from = receiving_socket:recvfrom()
 ---
 ...
@@ -2392,6 +2403,9 @@ sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
 ---
 - 0
 ...
+fiber.yield()
+---
+...
 received_message = receiving_socket:recv(512)
 ---
 ...
@@ -2415,6 +2429,9 @@ sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
 ---
 - 0
 ...
+fiber.yield()
+---
+...
 received_message, from = receiving_socket:recvfrom(512)
 ---
 ...
@@ -2452,6 +2469,9 @@ sending_socket:sendto('127.0.0.1', receiving_socket_port, message)
 ---
 - 1025
 ...
+fiber.yield()
+---
+...
 received_message = receiving_socket:recv()
 ---
 ...
@@ -2484,6 +2504,9 @@ sending_socket:sendto('127.0.0.1', receiving_socket_port, message)
 ---
 - 1025
 ...
+fiber.yield()
+---
+...
 received_message, from = receiving_socket:recvfrom()
 ---
 ...
@@ -2523,6 +2546,9 @@ sending_socket:sendto('127.0.0.1', receiving_socket_port, message)
 ---
 - 1025
 ...
+fiber.yield()
+---
+...
 received_message = receiving_socket:recv(512)
 ---
 ...
@@ -2566,6 +2592,9 @@ sending_socket:sendto('127.0.0.1', receiving_socket_port, message)
 ---
 - 1025
 ...
+fiber.yield()
+---
+...
 received_message, from = receiving_socket:recvfrom(512)
 ---
 ...
diff --git a/test/app/socket.test.lua b/test/app/socket.test.lua
index c72d41763..8cb693735 100644
--- a/test/app/socket.test.lua
+++ b/test/app/socket.test.lua
@@ -39,8 +39,8 @@ s:nonblock(false)
 s:nonblock()
 s:nonblock(true)
 
-s:readable(.1)
-s:wait(.1)
+s:readable(.5)
+s:wait(.5)
 socket.iowait(s:fd(), 'RW')
 socket.iowait(s:fd(), 3)
 socket.iowait(s:fd(), 'R')
@@ -619,8 +619,14 @@ s:settimeout(100500)
 rch, wch = fiber.channel(1), fiber.channel(1)
 sc = socket.connect(host, port)
 test_run:cmd("setopt delimiter ';'")
+socket_opened = true
 cfiber = fiber.create(function(sc, rch, wch)
-    while sc:send(wch:get()) and rch:put(sc:receive("*l")) do end
+    while socket_opened do
+        sc:send(wch:get())
+        local data = sc:receive("*l")
+        if not socket_opened then sc:close() end
+        rch:put(data)
+    end
 end, sc, rch, wch);
 test_run:cmd("setopt delimiter ''");
 
@@ -651,9 +657,9 @@ rch:get()
 wch:put("DATA\n")
 c:receive(4)
 c:receive("*l")
+socket_opened = false
 wch:put("Fu")
 c:send("354 Please type your message\n")
-sc:close()
 c:receive("*l", "Line: ")
 c:receive()
 c:receive(10)
@@ -778,6 +784,7 @@ e == errno.EAGAIN -- expected true
 
 -- case: recv, zero datagram
 sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
+fiber.yield()
 received_message = receiving_socket:recv()
 e = receiving_socket:errno()
 received_message == '' -- expected true
@@ -786,6 +793,7 @@ e == 0 -- expected true
 
 -- case: recvfrom, zero datagram
 sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
+fiber.yield()
 received_message, from = receiving_socket:recvfrom()
 e = receiving_socket:errno()
 received_message == '' -- expected true
@@ -812,6 +820,7 @@ e == errno.EAGAIN -- expected true
 
 -- case: recv, zero datagram, explicit size
 sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
+fiber.yield()
 received_message = receiving_socket:recv(512)
 e = receiving_socket:errno()
 received_message == '' -- expected true
@@ -820,6 +829,7 @@ e == 0 -- expected true
 
 -- case: recvfrom, zero datagram, explicit size
 sendto_zero(sending_socket, '127.0.0.1', receiving_socket_port)
+fiber.yield()
 received_message, from = receiving_socket:recvfrom(512)
 e = receiving_socket:errno()
 received_message == '' -- expected true
@@ -833,6 +843,7 @@ message = string.rep('x', message_len)
 
 -- case: recv, non-zero length datagram, the buffer size should be evaluated
 sending_socket:sendto('127.0.0.1', receiving_socket_port, message)
+fiber.yield()
 received_message = receiving_socket:recv()
 e = receiving_socket:errno()
 received_message == message -- expected true
@@ -844,6 +855,7 @@ e
 -- case: recvfrom, non-zero length datagram, the buffer size should be
 -- evaluated
 sending_socket:sendto('127.0.0.1', receiving_socket_port, message)
+fiber.yield()
 received_message, from = receiving_socket:recvfrom()
 e = receiving_socket:errno()
 received_message == message -- expected true
@@ -856,6 +868,7 @@ e
 
 -- case: recv truncates a datagram larger then the buffer of an explicit size
 sending_socket:sendto('127.0.0.1', receiving_socket_port, message)
+fiber.yield()
 received_message = receiving_socket:recv(512)
 e = receiving_socket:errno()
 received_message == message:sub(1, 512) -- expected true
@@ -872,6 +885,7 @@ message = string.rep('y', message_len)
 
 -- case: recvfrom truncates a datagram larger then the buffer of an explicit size
 sending_socket:sendto('127.0.0.1', receiving_socket_port, message)
+fiber.yield()
 received_message, from = receiving_socket:recvfrom(512)
 e = receiving_socket:errno()
 received_message == message:sub(1, 512) -- expected true
-- 
2.17.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-12-17  0:03 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-04 13:43 [Tarantool-patches] [PATCH] test: fix flaky socket test Ilya Kosarev
2019-12-05 21:31 ` Vladislav Shpilevoy
2019-12-06 16:01   ` Ilya Kosarev
2019-12-08 15:52     ` Vladislav Shpilevoy
2019-12-10 14:36       ` Ilya Kosarev
2019-12-17  0:03         ` Vladislav Shpilevoy
  -- strict thread matches above, loose matches on Subject: below --
2019-11-26 21:45 Ilya Kosarev
2019-11-26  1:19 Ilya Kosarev
2019-08-30 15:28 [tarantool-patches] " Ilya Kosarev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox