[PATCH] socket: fix race between unix tcp server stop and start

Vladimir Davydov vdavydov.dev at gmail.com
Mon Jun 11 17:03:46 MSK 2018


If called on a unix socket, bind(2) creates a new file, see unix(7).
When we stop a unix tcp server, we should remove that file. Currently,
we do it from the tcp server fiber, after the server loop is broken,
which happens when the socket is closed, see tcp_server_loop(). This
opens a time window for another tcp server to reuse the same path:

    main fiber                  tcp server loop
    ----------                  ---------------

    -- Start a tcp server.
    s = socket.tcp_server('unix/', sock_path, ...)
    -- Stop the server.
    s:close()

                                socket_readable? => no, break loop

    -- Start a new tcp server. Use the same path as before.
    -- This function succeeds, because the socket is closed
    -- so tcp_server_bind_addr() will clean up by itself.
    s = socket.tcp_server('unix/', sock_path, ...)

     tcp_server_bind
      tcp_server_bind_addr
       socket_bind => EADDRINUSE
       tcp_connect => ECONNREFUSED
       -- Remove dead unix socket.
       fio.unlink(addr.port)
       socket_bind => success

                                -- Deletes unix socket used
                                -- by the new server.
                                fio.unlink(addr.port)

In particular, the race results in sporadic failures of app-tap/console
test, which restarts a tcp server using the same file path.

To fix this issue, let's close the socket after removing the socket
file. This is absolutely legit on any UNIX system, and this eliminates
the race shown above, because a new server that tries to bind on the
same path as the one already used by a dying server will not receive
ECONNREFUSED until the socket fd is closed and hence the file is
removed.

A note about the app-tap/console test. After this patch is applied,
socket.close() takes a little longer for unix tcp server, because it
yields twice, once for removing the socket file and once for closing the
socket file descriptor. As a result, on_disconnect() trigger left from
the previous test case has time to run after session.type() check.
Actually, those triggers have already been tested and we should have
cleared them before proceeding to the next test case. So instead of
adding two new on_disconnect checks to the test plan, let's clear the
triggers before session.type() test case and remove 3 on_connect and 5
on_auth checks from the test plan.

Closes #3168
---
https://github.com/tarantool/tarantool/issues/3168
https://github.com/tarantool/tarantool/commits/gh-3168-fix-unix-tcp-server-stop-vs-start-race

 src/lua/socket.lua            | 24 +++++++++++++++++++-----
 test/app-tap/console.test.lua |  9 +++++----
 test/app/socket.result        | 36 ++++++++++++++++++++++++++++++++++++
 test/app/socket.test.lua      | 23 +++++++++++++++++++++++
 4 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/src/lua/socket.lua b/src/lua/socket.lua
index 10c6f8fb..06306eae 100644
--- a/src/lua/socket.lua
+++ b/src/lua/socket.lua
@@ -1021,9 +1021,6 @@ local function tcp_server_loop(server, s, addr)
         end
     end
     -- Socket was closed
-    if addr.family == 'AF_UNIX' and addr.port then
-        fio.unlink(addr.port) -- remove unix socket
-    end
     log.info("stopped")
 end
 
@@ -1031,8 +1028,25 @@ local function tcp_server_usage()
     error('Usage: socket.tcp_server(host, port, handler | opts)')
 end
 
-local function tcp_server_bind_addr(s, addr)
+local function tcp_server_do_bind(s, addr)
     if socket_bind(s, addr.host, addr.port) then
+        if addr.family == 'AF_UNIX' then
+            -- Make close() remove the unix socket file created
+            -- by bind(). Note, this must be done before closing
+            -- the socket fd so that no other tcp server can
+            -- reuse the same path before we remove the file.
+            s.close = function(self)
+                fio.unlink(addr.port)
+                return socket_close(self)
+            end
+        end
+        return true
+    end
+    return false
+end
+
+local function tcp_server_bind_addr(s, addr)
+    if tcp_server_do_bind(s, addr) then
         return true
     end
 
@@ -1064,7 +1078,7 @@ local function tcp_server_bind_addr(s, addr)
         boxerrno(save_errno)
         return false
     end
-    return socket_bind(s, addr.host, addr.port)
+    return tcp_server_do_bind(s, addr)
 end
 
 
diff --git a/test/app-tap/console.test.lua b/test/app-tap/console.test.lua
index fecfa52c..2beb88ae 100755
--- a/test/app-tap/console.test.lua
+++ b/test/app-tap/console.test.lua
@@ -21,7 +21,7 @@ local EOL = "\n...\n"
 
 test = tap.test("console")
 
-test:plan(59)
+test:plan(51)
 
 -- Start console and connect to it
 local server = console.listen(CONSOLE_SOCKET)
@@ -227,6 +227,10 @@ while triggers_ran < 4 do fiber.yield() end
 test:is(triggers_ran, 4, "on_connect -> on_auth_error order")
 box.session.on_auth(nil, console_on_auth_error)
 
+box.session.on_connect(nil, console_on_connect)
+box.session.on_disconnect(nil, console_on_disconnect)
+box.session.on_auth(nil, console_on_auth)
+
 --
 -- gh-2642 "box.session.type()"
 --
@@ -238,9 +242,6 @@ client:close()
 
 server:close()
 
-box.session.on_connect(nil, console_on_connect)
-box.session.on_disconnect(nil, console_on_disconnect)
-box.session.on_auth(nil, console_on_auth)
 box.schema.user.drop('test')
 box.schema.user.drop('test2')
 session_id = nil
diff --git a/test/app/socket.result b/test/app/socket.result
index 36077633..0c41b184 100644
--- a/test/app/socket.result
+++ b/test/app/socket.result
@@ -1682,6 +1682,42 @@ test_run:cmd("setopt delimiter ''");
 f:cancel()
 ---
 ...
+-- gh-3168: a stopped tcp server removes the unix socket created
+-- and used by a newly started one.
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+err = nil;
+---
+...
+path = '/tmp/tarantool-test-socket';
+---
+...
+for i = 1, 10 do
+    local server = socket.tcp_server('unix/', path, function() end)
+    if not server then
+        err = 'tcp_server: ' .. errno.strerror()
+        break
+    end
+    local client = socket.tcp_connect("unix/", path)
+    if not client then
+        err = 'tcp_connect: ' .. errno.strerror()
+        break
+    end
+    client:close()
+    server:close()
+end;
+---
+...
+err == nil or err;
+---
+- true
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
 --------------------------------------------------------------------------------
 -- Lua Socket Emulation
 --------------------------------------------------------------------------------
diff --git a/test/app/socket.test.lua b/test/app/socket.test.lua
index 1ab2bef0..f6539438 100644
--- a/test/app/socket.test.lua
+++ b/test/app/socket.test.lua
@@ -565,6 +565,29 @@ end);
 test_run:cmd("setopt delimiter ''");
 f:cancel()
 
+-- gh-3168: a stopped tcp server removes the unix socket created
+-- and used by a newly started one.
+test_run:cmd("setopt delimiter ';'")
+err = nil;
+path = '/tmp/tarantool-test-socket';
+for i = 1, 10 do
+    local server = socket.tcp_server('unix/', path, function() end)
+    if not server then
+        err = 'tcp_server: ' .. errno.strerror()
+        break
+    end
+    local client = socket.tcp_connect("unix/", path)
+    if not client then
+        err = 'tcp_connect: ' .. errno.strerror()
+        break
+    end
+    client:close()
+    server:close()
+end;
+err == nil or err;
+test_run:cmd("setopt delimiter ''");
+
+
 --------------------------------------------------------------------------------
 -- Lua Socket Emulation
 --------------------------------------------------------------------------------
-- 
2.11.0




More information about the Tarantool-patches mailing list