[tarantool-patches] [PATCH vshard 5/7] rebalancer: introduce pinned bucket concept into rebalancer algo

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Wed Mar 28 00:24:12 MSK 2018


Pinned bucket is the bucket, that can not be sent out of its
replicaset. Taking pinned buckets into account changes rebalancer
algorithm, since now on some replicasets the perfect balance can
not be reached.

Iterative algorithm is used to learn the best balance in a
cluster. On each step it calculates perfect bucket count for each
replicaset. If this count can not be satisfied due to pinned
buckets, the algorithm does best effort to get the perfect
balance. This is done via ignoring of replicasets disbalanced via
pinning, and their pinned buckets. After that a new balance is
calculated. And it can happen, that it can not be satisfied too.
It is possible, because ignoring of pinned buckets in
overpopulated replicasets leads to decrease of perfect bucket
count in other replicasets, and a new values can become less that
their pinned bucket count.

Part of #71
---
 test/unit/rebalancer.result   | 333 +++++++++++++++++++++++++++++++++++++++++-
 test/unit/rebalancer.test.lua |  79 ++++++++++
 vshard/consts.lua             |   1 +
 vshard/replicaset.lua         | 102 +++++++++----
 vshard/storage/init.lua       |  22 +--
 5 files changed, 500 insertions(+), 37 deletions(-)

diff --git a/test/unit/rebalancer.result b/test/unit/rebalancer.result
index af909eb..ff7fcfc 100644
--- a/test/unit/rebalancer.result
+++ b/test/unit/rebalancer.result
@@ -369,7 +369,8 @@ _bucket:replace{3, consts.BUCKET.SENT}
 ...
 get_state()
 ---
-- 2
+- bucket_active_count: 2
+  bucket_pinned_count: 0
 ...
 _bucket:replace{1, consts.BUCKET.RECEIVING}
 ---
@@ -643,6 +644,336 @@ replicasets
     ethalon_bucket_count: 34
     bucket_count: 25
 ...
+--
+-- gh-71: allow to pin buckets. A pinned bucket can not be sent
+-- out of its replicaset even to satisfy perfect balance.
+--
+-- For this case the rebalancer does best effort balance. The
+-- perfect balance here is unrechable, since on each replicaset
+-- too many buckets are pinned.
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+replicasets = {
+	uuid1 = {bucket_count = 33, pinned_count = 26, weight = 1},
+	uuid2 = {bucket_count = 33, pinned_count = 24, weight = 1},
+	uuid3 = {bucket_count = 34, pinned_count = 30, weight = 1},
+	uuid4 = {bucket_count = 0, weight = 1},
+};
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+calc_ethalon(replicasets, 100)
+---
+...
+replicasets
+---
+- uuid4:
+    weight: 1
+    ethalon_bucket_count: 20
+    bucket_count: 0
+  uuid1:
+    bucket_count: 33
+    ethalon_bucket_count: 26
+    ignore_disbalance: true
+    weight: 1
+    pinned_count: 26
+  uuid3:
+    bucket_count: 34
+    ethalon_bucket_count: 30
+    ignore_disbalance: true
+    weight: 1
+    pinned_count: 30
+  uuid2:
+    bucket_count: 33
+    ethalon_bucket_count: 24
+    ignore_disbalance: true
+    weight: 1
+    pinned_count: 24
+...
+calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING)
+---
+- 100
+...
+replicasets
+---
+- uuid4:
+    needed: 20
+    weight: 1
+    ethalon_bucket_count: 20
+    bucket_count: 0
+  uuid1:
+    bucket_count: 33
+    ethalon_bucket_count: 26
+    ignore_disbalance: true
+    needed: -7
+    weight: 1
+    pinned_count: 26
+  uuid3:
+    bucket_count: 34
+    ethalon_bucket_count: 30
+    ignore_disbalance: true
+    needed: -4
+    weight: 1
+    pinned_count: 30
+  uuid2:
+    bucket_count: 33
+    ethalon_bucket_count: 24
+    ignore_disbalance: true
+    needed: -9
+    weight: 1
+    pinned_count: 24
+...
+--
+-- Here the disbalance is ok for the replicaset with uuid1 only -
+-- other replicasets have pinned buckets too, but not enough to
+-- break the balance: buckets are moved ok to uuid4.
+--
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+replicasets = {
+	uuid1 = {bucket_count = 33, pinned_count = 30, weight = 1},
+	uuid2 = {bucket_count = 33, pinned_count = 10, weight = 1},
+	uuid3 = {bucket_count = 34, pinned_count = 15, weight = 1},
+	uuid4 = {bucket_count = 0, weight = 1},
+};
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+calc_ethalon(replicasets, 100)
+---
+...
+replicasets
+---
+- uuid4:
+    weight: 1
+    ethalon_bucket_count: 23
+    bucket_count: 0
+  uuid1:
+    bucket_count: 33
+    ethalon_bucket_count: 30
+    ignore_disbalance: true
+    weight: 1
+    pinned_count: 30
+  uuid3:
+    ethalon_bucket_count: 23
+    bucket_count: 34
+    pinned_count: 15
+    weight: 1
+  uuid2:
+    ethalon_bucket_count: 24
+    bucket_count: 33
+    pinned_count: 10
+    weight: 1
+...
+calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING)
+---
+- 100
+...
+replicasets
+---
+- uuid4:
+    needed: 23
+    weight: 1
+    ethalon_bucket_count: 23
+    bucket_count: 0
+  uuid1:
+    bucket_count: 33
+    ethalon_bucket_count: 30
+    ignore_disbalance: true
+    needed: -3
+    weight: 1
+    pinned_count: 30
+  uuid3:
+    bucket_count: 34
+    ethalon_bucket_count: 23
+    needed: -11
+    weight: 1
+    pinned_count: 15
+  uuid2:
+    bucket_count: 33
+    ethalon_bucket_count: 24
+    needed: -9
+    weight: 1
+    pinned_count: 10
+...
+--
+-- Non-locked replicaset with any pinned bucket count can receive
+-- more buckets, if the rebalancer decides it is the best balance.
+--
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+replicasets = {
+	uuid1 = {bucket_count = 30, pinned_count = 25, weight = 0},
+	uuid2 = {bucket_count = 25, pinned_count = 25, weight = 1},
+	uuid3 = {bucket_count = 25, pinned_count = 25, weight = 1},
+	uuid4 = {bucket_count = 20, weight = 0},
+};
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+calc_ethalon(replicasets, 100)
+---
+...
+replicasets
+---
+- uuid4:
+    weight: 0
+    ethalon_bucket_count: 0
+    bucket_count: 20
+  uuid1:
+    bucket_count: 30
+    ethalon_bucket_count: 25
+    ignore_disbalance: true
+    weight: 0
+    pinned_count: 25
+  uuid3:
+    ethalon_bucket_count: 37
+    bucket_count: 25
+    pinned_count: 25
+    weight: 1
+  uuid2:
+    ethalon_bucket_count: 38
+    bucket_count: 25
+    pinned_count: 25
+    weight: 1
+...
+calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING)
+---
+- inf
+...
+replicasets
+---
+- uuid4:
+    needed: -20
+    weight: 0
+    ethalon_bucket_count: 0
+    bucket_count: 20
+  uuid1:
+    bucket_count: 30
+    ethalon_bucket_count: 25
+    ignore_disbalance: true
+    needed: -5
+    weight: 0
+    pinned_count: 25
+  uuid3:
+    bucket_count: 25
+    ethalon_bucket_count: 37
+    needed: 12
+    weight: 1
+    pinned_count: 25
+  uuid2:
+    bucket_count: 25
+    ethalon_bucket_count: 38
+    needed: 13
+    weight: 1
+    pinned_count: 25
+...
+--
+-- Check that the rebalancer can calculate a complex case, when a
+-- perfect balance is learned in several steps of the algorithm.
+-- Here on the first step it is calculated, that each replicaset
+-- must contain 25 buckets. But UUID1 can not satisfy it, so it
+-- is ignored. On the next step there are 100 - 30 pinned buckets
+-- from UUID1 = 70 buckets, and 3 replicasets. A new perfect
+-- balance is 23-23-24. But it can not be satisfied too - UUID2
+-- has 25 pinned buckets, so it is ignored. On the third step
+-- there are 70 - 25 = 45 buckets and 2 replicasets. A new perfect
+-- balance is 22-23. But it is unreachable too, because UUID3 has
+-- 24 pinned buckets. So only UUID4 is not ignored, and it
+-- receives all non-pinned buckets: 45 - 24 = 21.
+--
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+replicasets = {
+	uuid1 = {bucket_count = 33, pinned_count = 30, weight = 1},
+	uuid2 = {bucket_count = 33, pinned_count = 25, weight = 1},
+	uuid3 = {bucket_count = 34, pinned_count = 24, weight = 1},
+	uuid4 = {bucket_count = 0, weight = 1},
+};
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+calc_ethalon(replicasets, 100)
+---
+...
+replicasets
+---
+- uuid4:
+    weight: 1
+    ethalon_bucket_count: 21
+    bucket_count: 0
+  uuid1:
+    bucket_count: 33
+    ethalon_bucket_count: 30
+    ignore_disbalance: true
+    weight: 1
+    pinned_count: 30
+  uuid3:
+    bucket_count: 34
+    ethalon_bucket_count: 24
+    ignore_disbalance: true
+    weight: 1
+    pinned_count: 24
+  uuid2:
+    bucket_count: 33
+    ethalon_bucket_count: 25
+    ignore_disbalance: true
+    weight: 1
+    pinned_count: 25
+...
+calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING)
+---
+- 100
+...
+replicasets
+---
+- uuid4:
+    needed: 21
+    weight: 1
+    ethalon_bucket_count: 21
+    bucket_count: 0
+  uuid1:
+    bucket_count: 33
+    ethalon_bucket_count: 30
+    ignore_disbalance: true
+    needed: -3
+    weight: 1
+    pinned_count: 30
+  uuid3:
+    bucket_count: 34
+    ethalon_bucket_count: 24
+    ignore_disbalance: true
+    needed: -10
+    weight: 1
+    pinned_count: 24
+  uuid2:
+    bucket_count: 33
+    ethalon_bucket_count: 25
+    ignore_disbalance: true
+    needed: -8
+    weight: 1
+    pinned_count: 25
+...
 _bucket:drop()
 ---
 ...
diff --git a/test/unit/rebalancer.test.lua b/test/unit/rebalancer.test.lua
index d22116b..71411bb 100644
--- a/test/unit/rebalancer.test.lua
+++ b/test/unit/rebalancer.test.lua
@@ -162,4 +162,83 @@ test_run:cmd("setopt delimiter ''");
 calc_ethalon(replicasets, 100)
 replicasets
 
+--
+-- gh-71: allow to pin buckets. A pinned bucket can not be sent
+-- out of its replicaset even to satisfy perfect balance.
+--
+-- For this case the rebalancer does best effort balance. The
+-- perfect balance here is unrechable, since on each replicaset
+-- too many buckets are pinned.
+test_run:cmd("setopt delimiter ';'")
+replicasets = {
+	uuid1 = {bucket_count = 33, pinned_count = 26, weight = 1},
+	uuid2 = {bucket_count = 33, pinned_count = 24, weight = 1},
+	uuid3 = {bucket_count = 34, pinned_count = 30, weight = 1},
+	uuid4 = {bucket_count = 0, weight = 1},
+};
+test_run:cmd("setopt delimiter ''");
+calc_ethalon(replicasets, 100)
+replicasets
+calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING)
+replicasets
+--
+-- Here the disbalance is ok for the replicaset with uuid1 only -
+-- other replicasets have pinned buckets too, but not enough to
+-- break the balance: buckets are moved ok to uuid4.
+--
+test_run:cmd("setopt delimiter ';'")
+replicasets = {
+	uuid1 = {bucket_count = 33, pinned_count = 30, weight = 1},
+	uuid2 = {bucket_count = 33, pinned_count = 10, weight = 1},
+	uuid3 = {bucket_count = 34, pinned_count = 15, weight = 1},
+	uuid4 = {bucket_count = 0, weight = 1},
+};
+test_run:cmd("setopt delimiter ''");
+calc_ethalon(replicasets, 100)
+replicasets
+calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING)
+replicasets
+--
+-- Non-locked replicaset with any pinned bucket count can receive
+-- more buckets, if the rebalancer decides it is the best balance.
+--
+test_run:cmd("setopt delimiter ';'")
+replicasets = {
+	uuid1 = {bucket_count = 30, pinned_count = 25, weight = 0},
+	uuid2 = {bucket_count = 25, pinned_count = 25, weight = 1},
+	uuid3 = {bucket_count = 25, pinned_count = 25, weight = 1},
+	uuid4 = {bucket_count = 20, weight = 0},
+};
+test_run:cmd("setopt delimiter ''");
+calc_ethalon(replicasets, 100)
+replicasets
+calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING)
+replicasets
+--
+-- Check that the rebalancer can calculate a complex case, when a
+-- perfect balance is learned in several steps of the algorithm.
+-- Here on the first step it is calculated, that each replicaset
+-- must contain 25 buckets. But UUID1 can not satisfy it, so it
+-- is ignored. On the next step there are 100 - 30 pinned buckets
+-- from UUID1 = 70 buckets, and 3 replicasets. A new perfect
+-- balance is 23-23-24. But it can not be satisfied too - UUID2
+-- has 25 pinned buckets, so it is ignored. On the third step
+-- there are 70 - 25 = 45 buckets and 2 replicasets. A new perfect
+-- balance is 22-23. But it is unreachable too, because UUID3 has
+-- 24 pinned buckets. So only UUID4 is not ignored, and it
+-- receives all non-pinned buckets: 45 - 24 = 21.
+--
+test_run:cmd("setopt delimiter ';'")
+replicasets = {
+	uuid1 = {bucket_count = 33, pinned_count = 30, weight = 1},
+	uuid2 = {bucket_count = 33, pinned_count = 25, weight = 1},
+	uuid3 = {bucket_count = 34, pinned_count = 24, weight = 1},
+	uuid4 = {bucket_count = 0, weight = 1},
+};
+test_run:cmd("setopt delimiter ''");
+calc_ethalon(replicasets, 100)
+replicasets
+calc_metrics(replicasets, consts.DEFAULT_REBALANCER_MAX_RECEIVING)
+replicasets
+
 _bucket:drop()
diff --git a/vshard/consts.lua b/vshard/consts.lua
index 1a0e4ab..db0daf4 100644
--- a/vshard/consts.lua
+++ b/vshard/consts.lua
@@ -2,6 +2,7 @@ return {
     -- Bucket FSM
     BUCKET = {
         ACTIVE = 'active',
+        PINNED = 'pinned',
         SENDING = 'sending',
         SENT = 'sent',
         RECEIVING = 'receiving',
diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
index 724b1b3..6a888d7 100644
--- a/vshard/replicaset.lua
+++ b/vshard/replicaset.lua
@@ -396,42 +396,90 @@ local replica_mt = {
 
 --
 -- Calculate for each replicaset its ethalon bucket count.
+-- Iterative algorithm is used to learn the best balance in a
+-- cluster. On each step it calculates perfect bucket count for
+-- each replicaset. If this count can not be satisfied due to
+-- pinned buckets, the algorithm does best effort to get the
+-- perfect balance. This is done via ignoring of replicasets
+-- disbalanced via pinning, and their pinned buckets. After that a
+-- new balance is calculated. And it can happen, that it can not
+-- be satisfied too. It is possible, because ignoring of pinned
+-- buckets in overpopulated replicasets leads to decrease of
+-- perfect bucket count in other replicasets, and a new values can
+-- become less that their pinned bucket count.
+--
+-- On each step the algorithm either is finished, or ignores at
+-- least one new overpopulated replicaset, so it has complexity
+-- O(N^2), where N - replicaset count.
 --
 local function cluster_calculate_ethalon_balance(replicasets, bucket_count)
+    local is_balance_found = false
     local weight_sum = 0
+    local step_count = 0
+    local replicaset_count = 0
     for _, replicaset in pairs(replicasets) do
         weight_sum = weight_sum + replicaset.weight
+        replicaset_count = replicaset_count + 1
     end
-    assert(weight_sum > 0)
-    local bucket_per_weight = bucket_count / weight_sum
-    local buckets_calculated = 0
-    for _, replicaset in pairs(replicasets) do
-        replicaset.ethalon_bucket_count =
-            math.ceil(replicaset.weight * bucket_per_weight)
-        buckets_calculated =
-            buckets_calculated + replicaset.ethalon_bucket_count
-    end
-    if buckets_calculated == bucket_count then
-        return
-    end
-    -- A situation is possible, when bucket_per_weight is not
-    -- integer. Lets spread this disbalance over cluster to
-    -- make for any replicaset pair
-    -- |replicaset_1 - replicaset_2| <= 1 - this difference is
-    -- admissible.
-    local buckets_rest = buckets_calculated - bucket_count
-    for _, replicaset in pairs(replicasets) do
-        local ceil = math.ceil(replicaset.weight * bucket_per_weight)
-        local floor = math.floor(replicaset.weight * bucket_per_weight)
-        if replicaset.ethalon_bucket_count > 0 and ceil ~= floor then
-            replicaset.ethalon_bucket_count = replicaset.ethalon_bucket_count - 1
-            buckets_rest = buckets_rest - 1
-            if buckets_rest == 0 then
-                return
+    while not is_balance_found do
+        step_count = step_count + 1
+        assert(weight_sum > 0)
+        local bucket_per_weight = bucket_count / weight_sum
+        local buckets_calculated = 0
+        for _, replicaset in pairs(replicasets) do
+            if not replicaset.ignore_disbalance then
+                replicaset.ethalon_bucket_count =
+                    math.ceil(replicaset.weight * bucket_per_weight)
+                buckets_calculated =
+                    buckets_calculated + replicaset.ethalon_bucket_count
             end
         end
+        local buckets_rest = buckets_calculated - bucket_count
+        is_balance_found = true
+        for _, replicaset in pairs(replicasets) do
+            if not replicaset.ignore_disbalance then
+                -- A situation is possible, when bucket_per_weight
+                -- is not integer. Lets spread this disbalance
+                -- over the cluster.
+                if buckets_rest > 0 then
+                    local n = replicaset.weight * bucket_per_weight
+                    local ceil = math.ceil(n)
+                    local floor = math.floor(n)
+                    if replicaset.ethalon_bucket_count > 0 and ceil ~= floor then
+                        replicaset.ethalon_bucket_count =
+                            replicaset.ethalon_bucket_count - 1
+                        buckets_rest = buckets_rest - 1
+                    end
+                end
+                --
+                -- Search for incorrigible disbalance due to
+                -- pinned buckets.
+                --
+                local pinned = replicaset.pinned_count
+                if pinned and replicaset.ethalon_bucket_count < pinned then
+                    -- This replicaset can not send out enough
+                    -- buckets to reach a balance. So do the best
+                    -- effort balance by sending from the
+                    -- replicaset though non-pinned buckets. This
+                    -- replicaset and its pinned buckets does not
+                    -- participate in the next steps of balance
+                    -- calculation.
+                    is_balance_found = false
+                    bucket_count = bucket_count - replicaset.pinned_count
+                    replicaset.ethalon_bucket_count = replicaset.pinned_count
+                    replicaset.ignore_disbalance = true
+                    weight_sum = weight_sum - replicaset.weight
+                end
+            end
+        end
+        assert(buckets_rest == 0)
+        if step_count > replicaset_count then
+            -- This can happed only because of a bug in this
+            -- algorithm. But it occupies 100% of transaction
+            -- thread, so check step count explicitly.
+            return error('PANIC: the rebalancer is broken')
+        end
     end
-    assert(buckets_rest == 0)
 end
 
 --
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index fe8a40f..99fbf7b 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -1213,19 +1213,20 @@ local function rebalancer_download_states()
     local total_bucket_locked_count = 0
     local total_bucket_active_count = 0
     for uuid, replicaset in pairs(M.replicasets) do
-        local bucket_active_count =
+        local state =
             replicaset:callrw('vshard.storage.rebalancer_request_state', {})
-        if bucket_active_count == nil then
+        if state == nil then
             return
         end
+        local bucket_count = state.bucket_active_count +
+                             state.bucket_pinned_count
         if replicaset.lock then
-            total_bucket_locked_count =
-                total_bucket_locked_count + bucket_active_count
+            total_bucket_locked_count = total_bucket_locked_count + bucket_count
         else
-            total_bucket_active_count =
-                total_bucket_active_count + bucket_active_count
-            replicasets[uuid] = {bucket_count = bucket_active_count,
-                                 weight = replicaset.weight}
+            total_bucket_active_count = total_bucket_active_count + bucket_count
+            replicasets[uuid] = {bucket_count = bucket_count,
+                                 weight = replicaset.weight,
+                                 pinned_count = state.bucket_pinned_count}
         end
     end
     local sum = total_bucket_active_count + total_bucket_locked_count
@@ -1325,7 +1326,10 @@ local function rebalancer_request_state()
         return
     end
     local bucket_count = _bucket:count()
-    return status_index:count({consts.BUCKET.ACTIVE})
+    return {
+        bucket_active_count = status_index:count({consts.BUCKET.ACTIVE}),
+        bucket_pinned_count = status_index:count({consts.BUCKET.PINNED}),
+    }
 end
 
 --
-- 
2.14.3 (Apple Git-98)





More information about the Tarantool-patches mailing list