From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Mon, 25 Feb 2019 20:44:37 +0300 From: Vladimir Davydov Subject: Re: [PATCH v3 7/7] memtx: introduce tuple compare hint Message-ID: <20190225174437.yagywystuzbl7skp@esperanza> References: MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: To: Kirill Shcherbatov Cc: tarantool-patches@freelists.org, Alexandr Lyapunov List-ID: On Fri, Feb 22, 2019 at 06:42:32PM +0300, Kirill Shcherbatov wrote: > From: Alexandr Lyapunov > > Implement functions for retrieving tuple hints for a particular > key_def. Hint is an integer that can be used for tuple comparison > optimization: if a hint of one tuple is less than a hint of another > then the first tuple is definitely less than the second; only if > hints are equal tuple_compare must be called for getting comparison > result. > > Hints are calculated using only the first part of key_def. > > Hints are only useful when: > * they are precalculated and stored along with the tuple; > calculation is not cheap (cheaper than tuple_compare btw) but > precalculated hints allow to compare tuples without even fetching > tuple data. > * first part of key_def is 'string'(for now) > * since hint is calculated using only the first part of key_def > (and only first several characters if it is a string) this part > must be different with high probability for every tuple pair. > > Closes #3961 > --- > src/box/CMakeLists.txt | 1 + > src/box/key_def.c | 2 + > src/box/key_def.h | 44 ++++++++ > src/box/memtx_tree_decl.c | 89 ++++++++++++++++ > src/box/tuple_hint.cc | 210 ++++++++++++++++++++++++++++++++++++++ > src/box/tuple_hint.h | 51 +++++++++ > src/coll.c | 33 ++++++ > src/coll.h | 4 + > 8 files changed, 434 insertions(+) > create mode 100644 src/box/tuple_hint.cc > create mode 100644 src/box/tuple_hint.h See a few minor comments inline. Apart from them the patch is fine by me, but as Kostja justifiably noted, may be it's worth enabling hints for all data types, including floating point numbers. Then we wouldn't need the memtx tree parametrization introduced by the previous patches. OTOH we will probably need it anyway for multikey indexes. > diff --git a/src/box/tuple_hint.cc b/src/box/tuple_hint.cc > new file mode 100644 > index 000000000..7b73403be > --- /dev/null > +++ b/src/box/tuple_hint.cc > @@ -0,0 +1,210 @@ > +/* > + * Copyright 2010-2019, Tarantool AUTHORS, please see AUTHORS file. > + * > + * Redistribution and use in source and binary forms, with or > + * without modification, are permitted provided that the following > + * conditions are met: > + * > + * 1. Redistributions of source code must retain the above > + * copyright notice, this list of conditions and the > + * following disclaimer. > + * > + * 2. Redistributions in binary form must reproduce the above > + * copyright notice, this list of conditions and the following > + * disclaimer in the documentation and/or other materials > + * provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED > + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL > + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, > + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF > + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR > + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF > + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF > + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > +#include "coll.h" > +#include "tuple.h" > +#include "tuple_hint.h" > + > +static uint64_t > +key_hint_default(const char *key, struct key_def *key_def) > +{ > + (void)key; > + (void)key_def; > + return 0; > +} > + > +static uint64_t > +tuple_hint_default(const struct tuple *tuple, struct key_def *key_def) > +{ > + (void)tuple; > + (void)key_def; > + return 0; > +} > + > +template > +static uint64_t > +key_hint_uint(const char *key, struct key_def *key_def) > +{ > + (void)key_def; > + assert(key_part_is_nullable(key_def->parts) == is_nullable); > + assert(key_def->parts->type == FIELD_TYPE_UNSIGNED || > + key_def->parts->type == FIELD_TYPE_INTEGER); FIELD_TYPE_INTEGER? How can that be? > + if (is_nullable && mp_typeof(*key) == MP_NIL) > + return 0; > + assert(mp_typeof(*key) == MP_UINT); > + uint64_t val = mp_decode_uint(&key); > + if (val > INT64_MAX) > + return INT64_MAX; > + return val - (uint64_t)INT64_MIN; > +} > + > +template > +static uint64_t > +tuple_hint_uint(const struct tuple *tuple, struct key_def *key_def) > +{ > + assert(key_part_is_nullable(key_def->parts) == is_nullable); > + assert(key_def->parts->type == FIELD_TYPE_UNSIGNED); > + const char *field = tuple_field_by_part(tuple, key_def->parts); > + if (is_nullable && field == NULL) > + return 0; > + return key_hint_uint(field, key_def); > +} > + > +template > +static uint64_t > +key_hint_int(const char *key, struct key_def *key_def) > +{ > + assert(key_part_is_nullable(key_def->parts) == is_nullable); > + assert(key_def->parts->type == FIELD_TYPE_INTEGER); > + if (is_nullable && mp_typeof(*key) == MP_NIL) > + return 0; > + if (mp_typeof(*key) == MP_UINT) { > + return key_hint_uint(key, key_def); > + } else { > + assert(mp_typeof(*key) == MP_INT); > + int64_t val = mp_decode_int(&key); > + return (uint64_t)val - (uint64_t)INT64_MIN; > + } > +} > + > +template > +static uint64_t > +tuple_hint_int(const struct tuple *tuple, struct key_def *key_def) > +{ > + assert(key_part_is_nullable(key_def->parts) == is_nullable); > + assert(key_def->parts->type == FIELD_TYPE_INTEGER); > + const char *field = tuple_field_by_part(tuple, key_def->parts); > + if (is_nullable && field == NULL) > + return 0; > + return key_hint_int(field, key_def); > +} > + > +template > +static uint64_t > +key_hint_string(const char *key, struct key_def *key_def) > +{ > + (void)key_def; > + assert(key_part_is_nullable(key_def->parts) == is_nullable); > + assert(key_def->parts->type == FIELD_TYPE_STRING); Assert that coll is NULL. > + if (is_nullable && mp_typeof(*key) == MP_NIL) > + return 0; > + assert(key_def->parts->type == FIELD_TYPE_STRING); Duplicate assertion. > + assert(mp_typeof(*key) == MP_STR); > + uint32_t len; > + const unsigned char *str = > + (const unsigned char *)mp_decode_str(&key, &len); > + uint64_t result = 0; > + uint32_t process_len = MIN(len, 8); > + for (uint32_t i = 0; i < process_len; i++) { > + result <<= 8; > + result |= str[i]; > + } > + result <<= 8 * (8 - process_len); > + return result; > +} > + > +template > +static uint64_t > +tuple_hint_string(const struct tuple *tuple, struct key_def *key_def) > +{ > + assert(key_part_is_nullable(key_def->parts) == is_nullable); > + assert(key_def->parts->type == FIELD_TYPE_STRING); Assert that coll is NULL. > + const char *field = tuple_field_by_part(tuple, key_def->parts); > + if (is_nullable && field == NULL) > + return 0; > + return key_hint_string(field, key_def); > +} > + > +template > +static uint64_t > +key_hint_string_coll(const char *key, struct key_def *key_def) > +{ > + assert(key_part_is_nullable(key_def->parts) == is_nullable); > + assert(key_def->parts->type == FIELD_TYPE_STRING && > + key_def->parts->coll != NULL); > + if (is_nullable && mp_typeof(*key) == MP_NIL) > + return 0; > + assert(key_def->parts->type == FIELD_TYPE_STRING); Duplicate assertion. > + assert(mp_typeof(*key) == MP_STR); > + assert(key_def->parts->coll != NULL); Duplicate assertion. > + uint32_t len; > + const char *str = mp_decode_str(&key, &len); > + return key_def->parts->coll->hint(str, len, key_def->parts->coll); > +} > diff --git a/src/coll.c b/src/coll.c > index 6d9c44dbf..bf892087d 100644 > --- a/src/coll.c > +++ b/src/coll.c > @@ -133,6 +133,37 @@ coll_bin_hash(const char *s, size_t s_len, uint32_t *ph, uint32_t *pcarry, > return s_len; > } > > +/** Get a compare hint of a binary collation. */ > +static uint64_t > +coll_bin_hint(const char *s, size_t s_len, struct coll *coll) > +{ > + (void)coll; > + uint64_t result = 0; > + uint32_t process_len = MIN(s_len, 8); > + for (uint32_t i = 0; i < process_len; i++) { > + result <<= 8; > + result |= ((unsigned char *)s)[i]; > + } > + result <<= 8 * (8 - process_len); > + return result; > +} > + > +/** Get a compare hint of a string using ICU collation. */ > +static uint64_t > +coll_icu_hint(const char *s, size_t s_len, struct coll *coll) > +{ > + assert(coll->type == COLL_TYPE_ICU); > + UCharIterator itr; > + uiter_setUTF8(&itr, s, s_len); > + uint8_t buf[7]; Should be buf[8]? > + uint32_t state[2] = {0, 0}; > + UErrorCode status = U_ZERO_ERROR; > + int32_t got = ucol_nextSortKeyPart(coll->collator, &itr, state, buf, > + sizeof(buf), &status); > + assert(got >= 0 && got <= 7); > + return coll_bin_hint((const char *)buf, got, NULL); > +}