From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp40.i.mail.ru (smtp40.i.mail.ru [94.100.177.100]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 44310445320 for ; Fri, 7 Aug 2020 11:07:18 +0300 (MSK) From: Oleg Babin References: Message-ID: Date: Fri, 7 Aug 2020 11:07:16 +0300 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 7bit Content-Language: en-GB Subject: Re: [Tarantool-patches] [PATCH 1/1] router: update known bucket count when rs removed List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Vladislav Shpilevoy , tarantool-patches@dev.tarantool.org, yaroslav.dynnikov@tarantool.org Thanks for your patch! LGTM. On 07/08/2020 01:11, Vladislav Shpilevoy wrote: > When replicaset was removed, router didn't update > router.known_bucket_count value. It is used by discovery to decide > when to turn on aggressive discovery. And by router.info() to > provide alerts, change status. > > As a result, in case of replicaset removal, known_bucket_count > was left bigger than it actually is, and it could be fixed only > by restart. > > Discovery could become non-agressive too early, and router.info() > could show weird alerts about bucke_count being configured > differently on storages and router. Besides, router.info().bucket > was showing not realistic numbers. > --- > Branch:http://github.com/tarantool/tarantool/tree/gerold103/router-known-buckets > > test/router/router2.result | 61 ++++++++++++++++++++++++++++++++++++ > test/router/router2.test.lua | 24 ++++++++++++++ > vshard/router/init.lua | 3 ++ > 3 files changed, 88 insertions(+) > > diff --git a/test/router/router2.result b/test/router/router2.result > index 40d6bff..492421b 100644 > --- a/test/router/router2.result > +++ b/test/router/router2.result > @@ -250,6 +250,67 @@ vshard.router.static.discovery_fiber > | - null > | ... > > +-- > +-- Known bucket count should be updated properly when replicaset > +-- is removed from the config. > +-- > +vshard.router.info().bucket > + | --- > + | - unreachable: 0 > + | available_ro: 0 > + | unknown: 0 > + | available_rw: 3000 > + | ... > +rs1_uuid = util.replicasets[1] > + | --- > + | ... > +rs1 = cfg.sharding[rs1_uuid] > + | --- > + | ... > +cfg.sharding[rs1_uuid] = nil > + | --- > + | ... > +vshard.router.cfg(cfg) > + | --- > + | ... > +vshard.router.info().bucket > + | --- > + | - unreachable: 0 > + | available_ro: 0 > + | unknown: 1500 > + | available_rw: 1500 > + | ... > +cfg.sharding[rs1_uuid] = rs1 > + | --- > + | ... > +vshard.router.cfg(cfg) > + | --- > + | ... > +vshard.router.discovery_set('on') > + | --- > + | ... > +function wait_all_rw() \ > + local total = vshard.router.bucket_count() \ > + local res = vshard.router.info().bucket.available_rw == total \ > + if not res then \ > + vshard.router.discovery_wakeup() \ > + end \ > + return res \ > +end > + | --- > + | ... > +test_run:wait_cond(wait_all_rw) > + | --- > + | - true > + | ... > +vshard.router.info().bucket > + | --- > + | - unreachable: 0 > + | available_ro: 0 > + | unknown: 0 > + | available_rw: 3000 > + | ... > + > _ = test_run:switch("default") > | --- > | ... > diff --git a/test/router/router2.test.lua b/test/router/router2.test.lua > index afdf4e1..72c99e6 100644 > --- a/test/router/router2.test.lua > +++ b/test/router/router2.test.lua > @@ -95,6 +95,30 @@ vshard.router.static.discovery_fiber > vshard.router.discovery_set('once') > vshard.router.static.discovery_fiber > > +-- > +-- Known bucket count should be updated properly when replicaset > +-- is removed from the config. > +-- > +vshard.router.info().bucket > +rs1_uuid = util.replicasets[1] > +rs1 = cfg.sharding[rs1_uuid] > +cfg.sharding[rs1_uuid] = nil > +vshard.router.cfg(cfg) > +vshard.router.info().bucket > +cfg.sharding[rs1_uuid] = rs1 > +vshard.router.cfg(cfg) > +vshard.router.discovery_set('on') > +function wait_all_rw() \ > + local total = vshard.router.bucket_count() \ > + local res = vshard.router.info().bucket.available_rw == total \ > + if not res then \ > + vshard.router.discovery_wakeup() \ > + end \ > + return res \ > +end > +test_run:wait_cond(wait_all_rw) > +vshard.router.info().bucket > + > _ = test_run:switch("default") > _ = test_run:cmd("stop server router_1") > _ = test_run:cmd("cleanup server router_1") > diff --git a/vshard/router/init.lua b/vshard/router/init.lua > index d8c9246..0d94210 100644 > --- a/vshard/router/init.lua > +++ b/vshard/router/init.lua > @@ -908,14 +908,17 @@ local function router_cfg(router, cfg, is_reload) > router.failover_ping_timeout = vshard_cfg.failover_ping_timeout > router.sync_timeout = vshard_cfg.sync_timeout > local old_route_map = router.route_map > + local known_bucket_count = 0 > router.route_map = table_new(router.total_bucket_count, 0) > for bucket, rs in pairs(old_route_map) do > local new_rs = router.replicasets[rs.uuid] > if new_rs then > router.route_map[bucket] = new_rs > new_rs.bucket_count = new_rs.bucket_count + 1 > + known_bucket_count = known_bucket_count + 1 > end > end > + router.known_bucket_count = known_bucket_count > if router.failover_fiber == nil then > router.failover_fiber = util.reloadable_fiber_create( > 'vshard.failover.' .. router.name, M, 'failover_f', router)