From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH] socket: fix race between unix tcp server stop and start Date: Mon, 11 Jun 2018 17:03:46 +0300 Message-Id: To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: If called on a unix socket, bind(2) creates a new file, see unix(7). When we stop a unix tcp server, we should remove that file. Currently, we do it from the tcp server fiber, after the server loop is broken, which happens when the socket is closed, see tcp_server_loop(). This opens a time window for another tcp server to reuse the same path: main fiber tcp server loop ---------- --------------- -- Start a tcp server. s = socket.tcp_server('unix/', sock_path, ...) -- Stop the server. s:close() socket_readable? => no, break loop -- Start a new tcp server. Use the same path as before. -- This function succeeds, because the socket is closed -- so tcp_server_bind_addr() will clean up by itself. s = socket.tcp_server('unix/', sock_path, ...) tcp_server_bind tcp_server_bind_addr socket_bind => EADDRINUSE tcp_connect => ECONNREFUSED -- Remove dead unix socket. fio.unlink(addr.port) socket_bind => success -- Deletes unix socket used -- by the new server. fio.unlink(addr.port) In particular, the race results in sporadic failures of app-tap/console test, which restarts a tcp server using the same file path. To fix this issue, let's close the socket after removing the socket file. This is absolutely legit on any UNIX system, and this eliminates the race shown above, because a new server that tries to bind on the same path as the one already used by a dying server will not receive ECONNREFUSED until the socket fd is closed and hence the file is removed. A note about the app-tap/console test. After this patch is applied, socket.close() takes a little longer for unix tcp server, because it yields twice, once for removing the socket file and once for closing the socket file descriptor. As a result, on_disconnect() trigger left from the previous test case has time to run after session.type() check. Actually, those triggers have already been tested and we should have cleared them before proceeding to the next test case. So instead of adding two new on_disconnect checks to the test plan, let's clear the triggers before session.type() test case and remove 3 on_connect and 5 on_auth checks from the test plan. Closes #3168 --- https://github.com/tarantool/tarantool/issues/3168 https://github.com/tarantool/tarantool/commits/gh-3168-fix-unix-tcp-server-stop-vs-start-race src/lua/socket.lua | 24 +++++++++++++++++++----- test/app-tap/console.test.lua | 9 +++++---- test/app/socket.result | 36 ++++++++++++++++++++++++++++++++++++ test/app/socket.test.lua | 23 +++++++++++++++++++++++ 4 files changed, 83 insertions(+), 9 deletions(-) diff --git a/src/lua/socket.lua b/src/lua/socket.lua index 10c6f8fb..06306eae 100644 --- a/src/lua/socket.lua +++ b/src/lua/socket.lua @@ -1021,9 +1021,6 @@ local function tcp_server_loop(server, s, addr) end end -- Socket was closed - if addr.family == 'AF_UNIX' and addr.port then - fio.unlink(addr.port) -- remove unix socket - end log.info("stopped") end @@ -1031,8 +1028,25 @@ local function tcp_server_usage() error('Usage: socket.tcp_server(host, port, handler | opts)') end -local function tcp_server_bind_addr(s, addr) +local function tcp_server_do_bind(s, addr) if socket_bind(s, addr.host, addr.port) then + if addr.family == 'AF_UNIX' then + -- Make close() remove the unix socket file created + -- by bind(). Note, this must be done before closing + -- the socket fd so that no other tcp server can + -- reuse the same path before we remove the file. + s.close = function(self) + fio.unlink(addr.port) + return socket_close(self) + end + end + return true + end + return false +end + +local function tcp_server_bind_addr(s, addr) + if tcp_server_do_bind(s, addr) then return true end @@ -1064,7 +1078,7 @@ local function tcp_server_bind_addr(s, addr) boxerrno(save_errno) return false end - return socket_bind(s, addr.host, addr.port) + return tcp_server_do_bind(s, addr) end diff --git a/test/app-tap/console.test.lua b/test/app-tap/console.test.lua index fecfa52c..2beb88ae 100755 --- a/test/app-tap/console.test.lua +++ b/test/app-tap/console.test.lua @@ -21,7 +21,7 @@ local EOL = "\n...\n" test = tap.test("console") -test:plan(59) +test:plan(51) -- Start console and connect to it local server = console.listen(CONSOLE_SOCKET) @@ -227,6 +227,10 @@ while triggers_ran < 4 do fiber.yield() end test:is(triggers_ran, 4, "on_connect -> on_auth_error order") box.session.on_auth(nil, console_on_auth_error) +box.session.on_connect(nil, console_on_connect) +box.session.on_disconnect(nil, console_on_disconnect) +box.session.on_auth(nil, console_on_auth) + -- -- gh-2642 "box.session.type()" -- @@ -238,9 +242,6 @@ client:close() server:close() -box.session.on_connect(nil, console_on_connect) -box.session.on_disconnect(nil, console_on_disconnect) -box.session.on_auth(nil, console_on_auth) box.schema.user.drop('test') box.schema.user.drop('test2') session_id = nil diff --git a/test/app/socket.result b/test/app/socket.result index 36077633..0c41b184 100644 --- a/test/app/socket.result +++ b/test/app/socket.result @@ -1682,6 +1682,42 @@ test_run:cmd("setopt delimiter ''"); f:cancel() --- ... +-- gh-3168: a stopped tcp server removes the unix socket created +-- and used by a newly started one. +test_run:cmd("setopt delimiter ';'") +--- +- true +... +err = nil; +--- +... +path = '/tmp/tarantool-test-socket'; +--- +... +for i = 1, 10 do + local server = socket.tcp_server('unix/', path, function() end) + if not server then + err = 'tcp_server: ' .. errno.strerror() + break + end + local client = socket.tcp_connect("unix/", path) + if not client then + err = 'tcp_connect: ' .. errno.strerror() + break + end + client:close() + server:close() +end; +--- +... +err == nil or err; +--- +- true +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... -------------------------------------------------------------------------------- -- Lua Socket Emulation -------------------------------------------------------------------------------- diff --git a/test/app/socket.test.lua b/test/app/socket.test.lua index 1ab2bef0..f6539438 100644 --- a/test/app/socket.test.lua +++ b/test/app/socket.test.lua @@ -565,6 +565,29 @@ end); test_run:cmd("setopt delimiter ''"); f:cancel() +-- gh-3168: a stopped tcp server removes the unix socket created +-- and used by a newly started one. +test_run:cmd("setopt delimiter ';'") +err = nil; +path = '/tmp/tarantool-test-socket'; +for i = 1, 10 do + local server = socket.tcp_server('unix/', path, function() end) + if not server then + err = 'tcp_server: ' .. errno.strerror() + break + end + local client = socket.tcp_connect("unix/", path) + if not client then + err = 'tcp_connect: ' .. errno.strerror() + break + end + client:close() + server:close() +end; +err == nil or err; +test_run:cmd("setopt delimiter ''"); + + -------------------------------------------------------------------------------- -- Lua Socket Emulation -------------------------------------------------------------------------------- -- 2.11.0