From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id EB86121F1C for ; Tue, 26 Jun 2018 10:03:28 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id i2lwnJzmu8JS for ; Tue, 26 Jun 2018 10:03:28 -0400 (EDT) Received: from smtp55.i.mail.ru (smtp55.i.mail.ru [217.69.128.35]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id A5A6F21C5D for ; Tue, 26 Jun 2018 10:03:28 -0400 (EDT) Subject: [tarantool-patches] Re: [PATCH 2/2] Fix discovery/reconfigure race References: From: Alex Khatskevich Message-ID: <7790cb4a-ff5e-99bf-03bc-996fea379185@tarantool.org> Date: Tue, 26 Jun 2018 17:03:26 +0300 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset="utf-8"; format="flowed" Content-Transfer-Encoding: 8bit Content-Language: en-US Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: Vladislav Shpilevoy , tarantool-patches@freelists.org > 1. I removed this cycle together with any wakeups and the test passed. > So you are > wrong. Please, make the test be testing that discovery really works. > > 2. Please, just remove this single sleeps and do 'while cond do > fiber.sleep(0.01) end'. > The fact that the test is passed on your computer means nothing. It > can always pass on your > machine, but sometimes fail on mine, easily. Or on the travis, or on a > customer machine. > > You do not need any 'synchronization and block/unblock phases'. You > already know what do > you expect to see after the sleep, so just put this condition in the > while. > > If you want to test some deadlines like it is done for auto network > timeouts for the > router, then just remember the current time before while, afterwards > and test that > the duration was < your limit. > > I have already debugged several of such '100% passing tests with hard > timeouts' > and do not want to do it again. > Fixed as discussed verbally > 3. Please, put a new patch version here. below > >> diff --git a/vshard/router/init.lua b/vshard/router/init.lua >> index 7e765fa..5fc0f59 100644 >> --- a/vshard/router/init.lua >> +++ b/vshard/router/init.lua >> @@ -127,10 +128,23 @@ local function discovery_f(module_version) >>      local iterations_until_lua_gc = >>          consts.COLLECT_LUA_GARBAGE_INTERVAL / consts.DISCOVERY_INTERVAL >>      while module_version == M.module_version do >> -        for _, replicaset in pairs(M.replicasets) do >> +        local old_replicasets = M.replicasets >> +        for rs_uuid, replicaset in pairs(M.replicasets) do >>              local active_buckets, err = >> replicaset:callro('vshard.storage.buckets_discovery', {}, >>                                    {timeout = 2}) >> +            while M.errinj.ERRINJ_LONG_DISCOVERY do >> +                -- Stuck on the first replicaset. > > 4. When you set this errinj, it sticks on the first rs with no special > checks and blocks other replicasets too. So remove this check. Just do > sleep until the errinj is set. > > To prevent your dissension: I removed the check and the test passed too. > Together with the diff above about wakeups removal. Fixed as discussed verbally > > 5. select(1... here does nothing. It is equivalent to > >     rs_uuid ~= next(M.replicasets) deleted Full diff: commit bc90c3db5f5b1b663c747b8c2829fc8528af70cf Author: AKhatskevich Date:   Thu Jun 14 16:03:09 2018 +0300     Fix discovery/reconfigure race     This commit prevents discovery fiber from discovering old replicasets     and spoiling `route_map`. diff --git a/test/router/router.result b/test/router/router.result index 5643f3e..36d54bf 100644 --- a/test/router/router.result +++ b/test/router/router.result @@ -1095,6 +1095,55 @@ for bucket, old_rs in pairs(bucket_to_old_rs) do  end;  ---  ... +-- +-- Check route_map is not filled with old replica objects after +-- reconfigure. +-- +-- Simulate long `callro`. +vshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = true; +--- +... +while not vshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY == 'waiting' do +    vshard.router.discovery_wakeup() +    fiber.sleep(0.02) +end; +--- +... +vshard.router.cfg(cfg); +--- +... +route_map = vshard.router.internal.route_map +for bucket_id, _ in pairs(route_map) do +    route_map[bucket_id] = nil +end; +--- +... +vshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = false; +--- +... +-- Do discovery iteration. Upload buckets from the +-- first replicaset. +while not next(vshard.router.internal.route_map) do +    vshard.router.discovery_wakeup() +    fiber.sleep(0.01) +end; +--- +... +new_replicasets = {}; +--- +... +for _, rs in pairs(vshard.router.internal.replicasets) do +    new_replicasets[rs] = true +end; +--- +... +_, rs = next(vshard.router.internal.route_map); +--- +... +new_replicasets[rs] == true; +--- +- true +...  test_run:cmd("setopt delimiter ''");  ---  - true diff --git a/test/router/router.test.lua b/test/router/router.test.lua index 106f3d8..843db46 100644 --- a/test/router/router.test.lua +++ b/test/router/router.test.lua @@ -411,6 +411,35 @@ for bucket, old_rs in pairs(bucket_to_old_rs) do          error("route_map was not updataed.")      end  end; + +-- +-- Check route_map is not filled with old replica objects after +-- reconfigure. +-- +-- Simulate long `callro`. +vshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = true; +while not vshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY == 'waiting' do +    vshard.router.discovery_wakeup() +    fiber.sleep(0.02) +end; +vshard.router.cfg(cfg); +route_map = vshard.router.internal.route_map +for bucket_id, _ in pairs(route_map) do +    route_map[bucket_id] = nil +end; +vshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = false; +-- Do discovery iteration. Upload buckets from the +-- first replicaset. +while not next(vshard.router.internal.route_map) do +    vshard.router.discovery_wakeup() +    fiber.sleep(0.01) +end; +new_replicasets = {}; +for _, rs in pairs(vshard.router.internal.replicasets) do +    new_replicasets[rs] = true +end; +_, rs = next(vshard.router.internal.route_map); +new_replicasets[rs] == true;  test_run:cmd("setopt delimiter ''");  _ = test_run:cmd("switch default") diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 7e765fa..c125125 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -13,6 +13,7 @@ if not M then          errinj = {              ERRINJ_FAILOVER_CHANGE_CFG = false,              ERRINJ_RELOAD = false, +            ERRINJ_LONG_DISCOVERY = false,          },          -- Bucket map cache.          route_map = {}, @@ -127,10 +128,19 @@ local function discovery_f(module_version)      local iterations_until_lua_gc =          consts.COLLECT_LUA_GARBAGE_INTERVAL / consts.DISCOVERY_INTERVAL      while module_version == M.module_version do -        for _, replicaset in pairs(M.replicasets) do +        local old_replicasets = M.replicasets +        for rs_uuid, replicaset in pairs(M.replicasets) do              local active_buckets, err = replicaset:callro('vshard.storage.buckets_discovery', {},                                    {timeout = 2}) +            while M.errinj.ERRINJ_LONG_DISCOVERY do +                lfiber.sleep(0.01) +            end +            -- Renew replicasets object captured by the for loop +            -- in case of reconfigure and reload events. +            if M.replicasets ~= old_replicasets then +                break +            end              if not active_buckets then                  log.error('Error during discovery %s: %s', replicaset, err)              else