From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp43.i.mail.ru (smtp43.i.mail.ru [94.100.177.103]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 0C825469719 for ; Wed, 26 Feb 2020 02:52:44 +0300 (MSK) From: Vladislav Shpilevoy Date: Wed, 26 Feb 2020 00:52:41 +0100 Message-Id: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH vshard 1/1] router: bucket_id_strcrc32 and bucket_id_mpcrc32 List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: tarantool-patches@dev.tarantool.org, olegrok@tarantool.org Closes #207 @TarantoolBot document Title: vshard.router.bucket_id_strcrc32() and .bucket_id_mpcrc32() vshard.router.bucket_id() is deprecated, each its usage logs a warning. It still works, but will be deleted in future. Behaviour of the old bucket_id() function is now available as vshard.router.bucket_id_strcrc32(). It works exactly like the old function, but does not log a warning. The reason why there is a new function bucket_id_mpcrc32() is that the old bucket_id() and the new bucket_id_strcrc32() are not consistent for cdata numbers. In particular, they return 3 different values for normal Lua numbers like 123, for unsigned long long cdata (like 123ULL, or ffi.cast('unsigned long long', 123)), and for signed long long cdata (like 123LL, or ffi.cast('long long', 123)). Note, this is important! vshard.router.bucket_id(123) vshard.router.bucket_id(123LL) vshard.router.bucket_id(123ULL) Return 3 different values!!! For float and double cdata (ffi.cast('float', number), ffi.cast('double', number)) these functions return different values even for the same numbers of the same floating point type. This is because tostring() on a floating point cdata number returns not the number, but a pointer at it. Different on each call. vshard.router.bucket_id_strcrc32() behaves exactly the same. vshard.router.bucket_id_mpcrc32() is safer. It takes a CRC32 from MessagePack encoded value. That is, bucket_id of integers does not depend on their Lua type. However it still may return different values for not equal floating point types. That is, ffi.cast('float', number) may be reflected onto a bucket id not equal to ffi.cast('double', number). This can't be fixed, because a float value, even being casted to double, may have a garbage tail in its fraction. Floating point keys should not be used to calculate a bucket id, usually. A final note - bucket_id_mpcrc32() in case of a string key does not encode it into MessagePack, but takes hash right from the string. This does not affect consistency of the function, but makes it as fast as bucket_id_strcrc32(). --- Branch: http://github.com/tarantool/vshard/tree/gerold103/gh-207-bucket_id-cdata-numbers Issue: https://github.com/tarantool/vshard/issues/207 test/router/router.result | 27 ------- test/router/router.test.lua | 11 --- test/unit/router.result | 149 ++++++++++++++++++++++++++++++++++++ test/unit/router.test.lua | 49 ++++++++++++ vshard/hash.lua | 41 +++++++++- vshard/router/init.lua | 23 +++++- 6 files changed, 257 insertions(+), 43 deletions(-) create mode 100644 test/unit/router.result create mode 100644 test/unit/router.test.lua diff --git a/test/router/router.result b/test/router/router.result index 47d3278..31a351f 100644 --- a/test/router/router.result +++ b/test/router/router.result @@ -139,33 +139,6 @@ _ = test_run:cmd("setopt delimiter ''"); --- ... -- --- bucket_id and bucket_count --- -util.check_error(vshard.router.bucket_id) -- invalid arguments ---- -- 'Usage: vshard.router.bucket_id(key)' -... -vshard.router.bucket_id(1) ---- -- 477 -... -vshard.router.bucket_id(2) ---- -- 401 -... -vshard.router.bucket_id({2}) ---- -- 401 -... -vshard.router.bucket_id('2') ---- -- 401 -... -vshard.router.bucket_count() ---- -- 3000 -... --- -- Initial distribution -- util.check_error(vshard.router.call, 1, 'read', 'echo', {123}) diff --git a/test/router/router.test.lua b/test/router/router.test.lua index d659515..571759a 100644 --- a/test/router/router.test.lua +++ b/test/router/router.test.lua @@ -59,17 +59,6 @@ for _, new_rs in pairs(new_replicasets) do end; _ = test_run:cmd("setopt delimiter ''"); --- --- bucket_id and bucket_count --- - -util.check_error(vshard.router.bucket_id) -- invalid arguments -vshard.router.bucket_id(1) -vshard.router.bucket_id(2) -vshard.router.bucket_id({2}) -vshard.router.bucket_id('2') -vshard.router.bucket_count() - -- -- Initial distribution -- diff --git a/test/unit/router.result b/test/unit/router.result new file mode 100644 index 0000000..36366ac --- /dev/null +++ b/test/unit/router.result @@ -0,0 +1,149 @@ +-- test-run result file version 2 +ffi = require('ffi') + | --- + | ... +util = require('util') + | --- + | ... +vshard = require('vshard') + | --- + | ... +vshard.router.cfg({sharding = {}}) + | --- + | ... + +-- +-- gh-207: vshard.router.bucket_id() was not consistent when +-- values were cdata numbers. +-- + +-- Decimal is not available in 1.10, but vshard and its tests +-- should work on 1.10. Decimal test is optional. +has_decimal, decimal = pcall(require, 'decimal') + | --- + | ... + +function check_values(bid, values) \ + local result = {} \ + for _, v in pairs(values) do \ + local v1 = bid(v) \ + local v2 = bid(v) \ + local t = type(v) \ + if t == 'cdata' then \ + t = ffi.typeof(v) \ + end \ + if v1 ~= v2 then \ + table.insert(result, {'not stable', {t, v, v1, v2}}) \ + else \ + table.insert(result, {t, v, v1}) \ + end \ + end \ + return result \ +end + | --- + | ... + +util.check_error(vshard.router.bucket_id) + | --- + | - 'Usage: vshard.router.bucket_id(key)' + | ... + +values = { \ + 1, 1LL, 1ULL, ffi.cast('uint64_t', 1), ffi.cast('int64_t', 1), '1', \ + has_decimal and decimal.new(1) or 1 \ +} + | --- + | ... + +check_values(vshard.router.bucket_id, values) + | --- + | - - - number + | - 1 + | - 477 + | - - ctype + | - 1 + | - 730 + | - - ctype + | - 1 + | - 982 + | - - ctype + | - 1 + | - 982 + | - - ctype + | - 1 + | - 730 + | - - string + | - '1' + | - 477 + | - - ctype + | - 1 + | - 477 + | ... +check_values(vshard.router.bucket_id_strcrc32, values) + | --- + | - - - number + | - 1 + | - 477 + | - - ctype + | - 1 + | - 730 + | - - ctype + | - 1 + | - 982 + | - - ctype + | - 1 + | - 982 + | - - ctype + | - 1 + | - 730 + | - - string + | - '1' + | - 477 + | - - ctype + | - 1 + | - 477 + | ... + +-- Floating point cdata is not tested in strcrc32, because +-- tostring() is not stable on them, and returns a pointer string. +table.insert(values, ffi.cast('double', 1)) + | --- + | ... +table.insert(values, ffi.cast('float', 1)) + | --- + | ... +check_values(vshard.router.bucket_id_mpcrc32, values) + | --- + | - - - number + | - 1 + | - 1614 + | - - ctype + | - 1 + | - 1614 + | - - ctype + | - 1 + | - 1614 + | - - ctype + | - 1 + | - 1614 + | - - ctype + | - 1 + | - 1614 + | - - string + | - '1' + | - 477 + | - - ctype + | - 1 + | - 1696 + | - - ctype + | - 1 + | - 193 + | - - ctype + | - 1 + | - 1411 + | ... + +vshard.router.bucket_count() + | --- + | - 3000 + | ... diff --git a/test/unit/router.test.lua b/test/unit/router.test.lua new file mode 100644 index 0000000..06c92f1 --- /dev/null +++ b/test/unit/router.test.lua @@ -0,0 +1,49 @@ +ffi = require('ffi') +util = require('util') +vshard = require('vshard') +vshard.router.cfg({sharding = {}}) + +-- +-- gh-207: vshard.router.bucket_id() was not consistent when +-- values were cdata numbers. +-- + +-- Decimal is not available in 1.10, but vshard and its tests +-- should work on 1.10. Decimal test is optional. +has_decimal, decimal = pcall(require, 'decimal') + +function check_values(bid, values) \ + local result = {} \ + for _, v in pairs(values) do \ + local v1 = bid(v) \ + local v2 = bid(v) \ + local t = type(v) \ + if t == 'cdata' then \ + t = ffi.typeof(v) \ + end \ + if v1 ~= v2 then \ + table.insert(result, {'not stable', {t, v, v1, v2}}) \ + else \ + table.insert(result, {t, v, v1}) \ + end \ + end \ + return result \ +end + +util.check_error(vshard.router.bucket_id) + +values = { \ + 1, 1LL, 1ULL, ffi.cast('uint64_t', 1), ffi.cast('int64_t', 1), '1', \ + has_decimal and decimal.new(1) or 1 \ +} + +check_values(vshard.router.bucket_id, values) +check_values(vshard.router.bucket_id_strcrc32, values) + +-- Floating point cdata is not tested in strcrc32, because +-- tostring() is not stable on them, and returns a pointer string. +table.insert(values, ffi.cast('double', 1)) +table.insert(values, ffi.cast('float', 1)) +check_values(vshard.router.bucket_id_mpcrc32, values) + +vshard.router.bucket_count() diff --git a/vshard/hash.lua b/vshard/hash.lua index b972eae..477b312 100644 --- a/vshard/hash.lua +++ b/vshard/hash.lua @@ -1,10 +1,13 @@ -- hash.lua local ldigest = require('digest') +local mpencode = require('msgpackffi').encode -- --- Simple hash function based on crc32 +-- Fast and simple hash. However it works incorrectly with +-- floating point cdata values. Also hash of an integer value +-- depends on its type: Lua number, cdata int64, cdata uint64. -- -local function key_hash(shard_key) +local function strcrc32(shard_key) if type(shard_key) ~= 'table' then return ldigest.crc32(tostring(shard_key)) else @@ -16,6 +19,38 @@ local function key_hash(shard_key) end end +local function mpcrc32_one(value) + if type(value) ~= 'string' then + return mpencode(value) + else + -- Despite the function called 'mp', strings are not + -- encoded. This is because it does not make much sense to + -- copy the whole string onto a temporary buffer just to + -- add a small MessagePack header. Such 'hack' makes + -- hashing of strings several magnitudes of order faster. + return value + end +end + +-- +-- Stable hash providing the correct values for integers not +-- depending on their size. However may return different hashes +-- for the same floating point value if it is cdata float or cdata +-- double. +-- +local function mpcrc32(shard_key) + if type(shard_key) ~= 'table' then + return ldigest.crc32(mpcrc32_one(shard_key)) + else + local crc32 = ldigest.crc32.new() + for _, v in ipairs(shard_key) do + crc32:update(mpcrc32_one(v)) + end + return crc32:result() + end +end + return { - key_hash = key_hash; + strcrc32 = strcrc32, + mpcrc32 = mpcrc32, } diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 91ad7e6..370398b 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -1067,7 +1067,24 @@ local function router_bucket_id(router, key) if key == nil then error("Usage: vshard.router.bucket_id(key)") end - return lhash.key_hash(key) % router.total_bucket_count + 1 + log.warn('vshard.router.bucket_id() is deprecated, use '.. + 'vshard.router.bucket_id_strcrc32() or '.. + 'vshard.router.bucket_id_mpcrc32()') + return lhash.strcrc32(key) % router.total_bucket_count + 1 +end + +local function router_bucket_id_strcrc32(router, key) + if key == nil then + error("Usage: vshard.router.bucket_id_strcrc32(key)") + end + return lhash.strcrc32(key) % router.total_bucket_count + 1 +end + +local function router_bucket_id_mpcrc32(router, key) + if key == nil then + error("Usage: vshard.router.bucket_id_mpcrc32(key)") + end + return lhash.mpcrc32(key) % router.total_bucket_count + 1 end local function router_bucket_count(router) @@ -1124,7 +1141,9 @@ local router_mt = { callbre = router_callbre; route = router_route; routeall = router_routeall; - bucket_id = router_bucket_id; + bucket_id = router_bucket_id, + bucket_id_strcrc32 = router_bucket_id_strcrc32, + bucket_id_mpcrc32 = router_bucket_id_mpcrc32, bucket_count = router_bucket_count; sync = router_sync; bootstrap = cluster_bootstrap; -- 2.21.1 (Apple Git-122.3)